/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
	((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;

static void rt_worker_func(struct work_struct *work);
static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
static struct timer_list rt_secret_timer;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);


static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		__constant_htons(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.entry_size =		sizeof(struct rtable),
	.entries =		ATOMIC_INIT(0),
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
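
/*
 * Worked example (editorial sketch, not part of the original file): the
 * table above is indexed by the four TOS bits shifted down by one, so a
 * lookup takes the form ip_tos2prio[IPTOS_TOS(tos) >> 1] (this is what
 * rt_tos2priority() in <net/route.h> does).  For instance TOS 0x10
 * ("minimize delay") gives index 8, i.e. TC_PRIO_INTERACTIVE, while TOS
 * 0x02 (an ECN bit set, no TOS bits) gives index 1, i.e.
 * ECN_OR_COST(FILLER).
 */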


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
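
/*
 * Illustrative sketch (editorial, not in the original file) of the
 * reader/writer split described above, using only helpers defined in
 * this file:
 *
 *	rcu_read_lock();			readers walk a bucket
 *	r = rcu_dereference(rt_hash_table[h].chain);
 *	...
 *	rcu_read_unlock();
 *
 *	spin_lock_bh(rt_hash_lock_addr(h));	writers unlink entries
 *	*rthp = rth->u.dst.rt_next;
 *	rt_free(rth);				freed via call_rcu_bh()
 *	spin_unlock_bh(rt_hash_lock_addr(h));
 */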

struct rt_hash_bucket {
	struct rtable	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table
 * of spinlocks.  The size of this table is a power of two and depends on
 * the number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log __read_mostly;
static atomic_t			rt_genid __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
{
	return jhash_3words((__force u32)(__be32)(daddr),
			    (__force u32)(__be32)(saddr),
			    idx, atomic_read(&rt_genid))
		& rt_hash_mask;
}
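
/*
 * Example (editorial note, not in the original file): both fast paths
 * derive a bucket the same way, e.g. the input path below does
 *
 *	hash = rt_hash(daddr, saddr, dev->ifindex);
 *
 * Because rt_genid is mixed into the hash, bumping the generation
 * counter in rt_cache_invalidate() makes every stale entry effectively
 * unreachable without walking the table; stale entries are reclaimed
 * lazily when writers encounter them.
 */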

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		rcu_read_lock_bh();
		r = rcu_dereference(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference(r->u.dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	r = r->u.dst.rt_next;
	while (!r) {
		rcu_read_unlock_bh();
		if (--st->bucket < 0)
			break;
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return rcu_dereference(r);
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = atomic_read(&rt_genid);
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_NET_CLS_ROUTE
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	unsigned int i;

	if ((offset & 3) || (length & 3))
		return -EIO;

	if (offset >= sizeof(struct ip_rt_acct) * 256) {
		*eof = 1;
		return 0;
	}

	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
		length = sizeof(struct ip_rt_acct) * 256 - offset;
		*eof = 1;
	}

	offset /= sizeof(u32);

	if (length > 0) {
		u32 *dst = (u32 *) buffer;

		*start = buffer;
		memset(dst, 0, length);

		for_each_possible_cpu(i) {
			unsigned int j;
			u32 *src;

			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
			for (j = 0; j < length/4; j++)
				dst[j] += src[j];
		}
	}
	return length;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_NET_CLS_ROUTE
	pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
			ip_rt_acct_read, NULL);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_NET_CLS_ROUTE
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
	remove_proc_entry("rt_acct", net->proc_net);
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
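
/*
 * Worked example (editorial, not in the original file): an unreferenced
 * entry used this jiffy has age 0, so its usage-counter bits are all
 * ones (most valuable); one idle for 1000 jiffies scores 1000 less.
 * An output or non-broadcast/multicast/local route also gets bit 30,
 * and a "valuable" one (redirected/notify/expires) bit 31.  When a
 * chain grows too long, rt_intern_hash() evicts the lowest-scoring
 * candidate first.
 */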

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}
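
/*
 * Editorial note (not in the original file): compare_keys() folds every
 * flow-key field together with XOR and OR, so a full match costs a
 * single branch.  The u16 load starting at the tos member deliberately
 * covers both the tos byte and the adjacent scope byte of the flowi,
 * comparing them in one go.
 */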

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
}

/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to be rescheduled if necessary.
 */
static void rt_do_flush(int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		if (process_context && need_resched())
			cond_resched();
		rth = rt_hash_table[i].chain;
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));
		rth = rt_hash_table[i].chain;
		rt_hash_table[i].chain = NULL;
		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; rth; rth = next) {
			next = rth->u.dst.rt_next;
			rt_free(rth);
		}
	}
}

static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, **rthp;
	u64 mult;

	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		if (*rthp == NULL)
			continue;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			if (rth->rt_genid != atomic_read(&rt_genid)) {
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->u.dst.expires)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.dst.rt_next;
				continue;
			}

			/* Cleanup aged off entries. */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
	}
	rover = i;
}
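
/*
 * Editorial note (not in the original file): each pass above scans
 * goal = buckets * ip_rt_gc_interval / ip_rt_gc_timeout buckets, and a
 * pass is scheduled every ip_rt_gc_interval, so a full sweep of the
 * table takes roughly ip_rt_gc_timeout.  Within a chain, tmo halves
 * for every surviving entry, so entries deep in a long chain must be
 * much younger to survive.
 */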

/*
 * rt_worker_func() is run in process context.
 * we call rt_check_expire() to scan part of the hash table
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without reusing a recent rt_genid.
 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(void)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &rt_genid);
}
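
/*
 * Worked arithmetic (editorial, not in the original file): each call
 * advances the 32-bit rt_genid by 1..256.  Even if every step were the
 * maximum of 256, a previously used value cannot recur until
 * 2^32 / 2^8 = 2^24 calls later, which is where the 2^24 figure in the
 * comment above comes from.
 */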

/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate();
	if (delay >= 0)
		rt_do_flush(!in_softirq());
}

/*
 * We change rt_genid and let gc do the cleanup
 */
static void rt_secret_rebuild(unsigned long dummy)
{
	rt_cache_invalidate();
	mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   stays approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle,
   expire is large enough to keep enough warm entries, and when load
   increases, it shrinks to limit the cache size.
 */
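
/*
 * Worked example (editorial, not in the original file; the table size
 * is hypothetical): with rt_hash_log = 16 (65536 buckets) and the
 * default ip_rt_gc_elasticity of 8, the collector below only starts
 * expiring once the cache exceeds 8 << 16 = 524288 entries, i.e. an
 * average chain length of eight.
 */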

static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate number of entries, which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (rth->rt_genid == atomic_read(&rt_genid) &&
				    !rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* The goal was not achieved. We stop the process if:

		   - expire has been reduced to zero; otherwise expire is halved.
		   - the table is not full.
		   - we are called from an interrupt.
		   The jiffies check is just a fallback/debug loop breaker;
		   we will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
		if (rth->rt_genid != atomic_read(&rt_genid)) {
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->u.dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->u.dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * once it is exceeded, GC becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.dst.rt_next;
			rt_free(cand);
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink the route cache;
			   most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
	if (rt->u.dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash,
		       NIPQUAD(rt->rt_dst));
		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
			printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst));
		printk("\n");
	}
#endif
	rt_hash_table[hash].chain = rt;
	spin_unlock_bh(rt_hash_lock_addr(hash));
	*rp = rt;
	return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
	static DEFINE_SPINLOCK(rt_peer_lock);
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that stays unique for a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If a peer is attached to the destination, it is never
		   detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp, *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = *rthp) != NULL) {
		if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
			*rthp = aux->u.dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->u.dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	__be32 skeys[2] = { saddr, 0 };
	int ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
	    || ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);

			rthp = &rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    rth->rt_genid != atomic_read(&rt_genid) ||
				    !net_eq(dev_net(rth->u.dst.dev), net)) {
					rthp = &rth->u.dst.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= 0;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
				rt->u.dst.xfrm		= NULL;
				rt->rt_genid		= atomic_read(&rt_genid);
				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->u.dst;
				netevent.new = &rt->u.dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about "
			NIPQUAD_FMT " ignored.\n"
			"  Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n",
		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
		       NIPQUAD(saddr), NIPQUAD(daddr));
#endif
	in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif);
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
					  NIPQUAD_FMT "/%02x dropped\n",
				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
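
/*
 * Worked example (editorial, not in the original file): with HZ = 1000
 * the defaults above give ip_rt_redirect_load = 20 jiffies, so
 * ip_rt_send_redirect() spaces successive redirects exponentially,
 * ip_rt_redirect_load << rate_tokens jiffies apart.  After
 * ip_rt_redirect_number (9) of them we go silent, and
 * ip_rt_redirect_silence = (HZ/50) << 10 = 20480 jiffies (~20 s)
 * without triggering packets resets the counter.
 */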

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb->rtable;
	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

	if (!in_dev)
		return;

	if (!IN_DEV_TX_REDIRECTS(in_dev))
		goto out;

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything.
	 * Set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		goto out;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (rt->u.dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores "
				"redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n",
				NIPQUAD(rt->rt_src), rt->rt_iif,
				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
	}
out:
	in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb->rtable;
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	now = jiffies;
	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}

/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
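
/*
 * Worked example (editorial, not in the original file): guess_mtu()
 * returns the first plateau strictly below the old MTU, so a 1500-byte
 * path guesses 1492 (Ethernet behind PPPoE), 1492 guesses 576, and
 * anything at or below 128 falls back to the IPv4 minimum of 68.
 */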

unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	int i, k;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	int ikeys[2] = { dev->ifindex, 0 };
	__be32 skeys[2] = { iph->saddr, 0, };
	__be32 daddr = iph->daddr;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (k = 0; k < 2; k++) {
		for (i = 0; i < 2; i++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);

			rcu_read_lock();
			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
			     rth = rcu_dereference(rth->u.dst.rt_next)) {
				unsigned short mtu = new_mtu;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->rt_dst != daddr ||
				    rth->rt_src != iph->saddr ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
				    !net_eq(dev_net(rth->u.dst.dev), net) ||
				    rth->rt_genid != atomic_read(&rt_genid))
					continue;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= dst_metric(&rth->u.dst, RTAX_MTU) &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= dst_metric(&rth->u.dst, RTAX_MTU)) {
					if (mtu < dst_metric(&rth->u.dst, RTAX_MTU)) {
						dst_confirm(&rth->u.dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
			rcu_read_unlock();
		}
	}
	return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
		}
		dst->metrics[RTAX_MTU-1] = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
	}
}
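
/*
 * Worked example (editorial, not in the original file): with the
 * default ip_rt_min_pmtu = 512 + 20 + 20 = 552, an ICMP "fragmentation
 * needed" advertising MTU 300 is clamped to 552 and the MTU metric is
 * locked so the bogus value cannot shrink it further; the learned
 * value then expires after ip_rt_mtu_expires (10 minutes).
 */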

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;
	struct in_device *idev = rt->idev;

	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}

	if (idev) {
		rt->idev = NULL;
		in_dev_put(idev);
	}
}

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	struct rtable *rt = (struct rtable *) dst;
	struct in_device *idev = rt->idev;
	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
		struct in_device *loopback_idev =
			in_dev_get(dev_net(dev)->loopback_dev);
		if (loopback_idev) {
			rt->idev = loopback_idev;
			in_dev_put(idev);
		}
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb->rtable;
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n",
		NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	__be32 src;
	struct fib_result res;

	if (rt->fl.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
		src = FIB_RES_PREFSRC(res);
		fib_res_put(&res);
	} else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;

	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (dst_metric(&rt->u.dst, RTAX_MTU) > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
				       ip_rt_min_advmss);
	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}

static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = in_dev_get(dev);
	u32 itag = 0;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
					dev, &spec_dst, &itag) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output = ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= init_net.loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst = spec_dst;
	rth->rt_genid	= atomic_read(&rt_genid);
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	if (our) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	in_dev_put(in_dev);
	hash = rt_hash(daddr, saddr, dev->ifindex);
	return rt_intern_hash(hash, rth, &skb->rtable);

e_nobufs:
	in_dev_put(in_dev);
	return -ENOBUFS;

e_inval:
	in_dev_put(in_dev);
	return -EINVAL;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint we can give is the MAC header.
		 */
		printk(KERN_WARNING "martian source " NIPQUAD_FMT " from "
			NIPQUAD_FMT", on dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}

static int __mkroute_input(struct sk_buff *skb,
			   struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{

	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}


	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		err = -EINVAL;
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 */
		if (out_dev == in_dev) {
			err = -EINVAL;
			goto cleanup;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
	rth->rt_iif	=
		rth->fl.iif	= in_dev->dev->ifindex;
	rth->u.dst.dev	= (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_spec_dst = spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;
	rth->rt_genid = atomic_read(&rt_genid);

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi *fl,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl->iif);
	return rt_intern_hash(hash, rth, &skb->rtable);
}

/*
 *	NOTE. We drop all the packets that have a local source
 *	address, because every properly looped-back packet
 *	must have the correct destination already attached by the
 *	output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */
1886
1887static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1888 u8 tos, struct net_device *dev)
1889{
1890 struct fib_result res;
1891 struct in_device *in_dev = in_dev_get(dev);
1892 struct flowi fl = { .nl_u = { .ip4_u =
1893 { .daddr = daddr,
1894 .saddr = saddr,
1895 .tos = tos,
1896 .scope = RT_SCOPE_UNIVERSE,
1897 } },
1898 .mark = skb->mark,
1899 .iif = dev->ifindex };
1900 unsigned flags = 0;
1901 u32 itag = 0;
1902 struct rtable * rth;
1903 unsigned hash;
1904 __be32 spec_dst;
1905 int err = -EINVAL;
1906 int free_res = 0;
1907 struct net * net = dev_net(dev);
1908
1909 /* IP on this device is disabled. */
1910
1911 if (!in_dev)
1912 goto out;
1913
1914 /* Check for the most weird martians, which can be not detected
1915 by fib_lookup.
1916 */
1917
1918 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1919 ipv4_is_loopback(saddr))
1920 goto martian_source;
1921
1922 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1923 goto brd_input;
1924
1925 /* Accept zero addresses only to limited broadcast;
1926 * I even do not know to fix it or not. Waiting for complains :-)
1927 */
1928 if (ipv4_is_zeronet(saddr))
1929 goto martian_source;
1930
1931 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1932 ipv4_is_loopback(daddr))
1933 goto martian_destination;
1934
1935 /*
1936 * Now we are ready to route packet.
1937 */
1938 if ((err = fib_lookup(net, &fl, &res)) != 0) {
1939 if (!IN_DEV_FORWARD(in_dev))
1940 goto e_hostunreach;
1941 goto no_route;
1942 }
1943 free_res = 1;
1944
1945 RT_CACHE_STAT_INC(in_slow_tot);
1946
1947 if (res.type == RTN_BROADCAST)
1948 goto brd_input;
1949
1950 if (res.type == RTN_LOCAL) {
1951 int result;
1952 result = fib_validate_source(saddr, daddr, tos,
1953 net->loopback_dev->ifindex,
1954 dev, &spec_dst, &itag);
1955 if (result < 0)
1956 goto martian_source;
1957 if (result)
1958 flags |= RTCF_DIRECTSRC;
1959 spec_dst = daddr;
1960 goto local_input;
1961 }
1962
1963 if (!IN_DEV_FORWARD(in_dev))
1964 goto e_hostunreach;
1965 if (res.type != RTN_UNICAST)
1966 goto martian_destination;
1967
1968 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1969done:
1970 in_dev_put(in_dev);
1971 if (free_res)
1972 fib_res_put(&res);
1973out: return err;
1974
1975brd_input:
1976 if (skb->protocol != htons(ETH_P_IP))
1977 goto e_inval;
1978
1979 if (ipv4_is_zeronet(saddr))
1980 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1981 else {
1982 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1983 &itag);
1984 if (err < 0)
1985 goto martian_source;
1986 if (err)
1987 flags |= RTCF_DIRECTSRC;
1988 }
1989 flags |= RTCF_BROADCAST;
1990 res.type = RTN_BROADCAST;
1991 RT_CACHE_STAT_INC(in_brd);
1992
1993local_input:
1994 rth = dst_alloc(&ipv4_dst_ops);
1995 if (!rth)
1996 goto e_nobufs;
1997
1998 rth->u.dst.output= ip_rt_bug;
1999 rth->rt_genid = atomic_read(&rt_genid);
2000
2001 atomic_set(&rth->u.dst.__refcnt, 1);
2002 rth->u.dst.flags= DST_HOST;
2003 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2004 rth->u.dst.flags |= DST_NOPOLICY;
2005 rth->fl.fl4_dst = daddr;
2006 rth->rt_dst = daddr;
2007 rth->fl.fl4_tos = tos;
2008 rth->fl.mark = skb->mark;
2009 rth->fl.fl4_src = saddr;
2010 rth->rt_src = saddr;
2011#ifdef CONFIG_NET_CLS_ROUTE
2012 rth->u.dst.tclassid = itag;
2013#endif
2014 rth->rt_iif =
2015 rth->fl.iif = dev->ifindex;
2016 rth->u.dst.dev = net->loopback_dev;
2017 dev_hold(rth->u.dst.dev);
2018 rth->idev = in_dev_get(rth->u.dst.dev);
2019 rth->rt_gateway = daddr;
2020 rth->rt_spec_dst= spec_dst;
2021 rth->u.dst.input= ip_local_deliver;
2022 rth->rt_flags = flags|RTCF_LOCAL;
2023 if (res.type == RTN_UNREACHABLE) {
2024 rth->u.dst.input= ip_error;
2025 rth->u.dst.error= -err;
2026 rth->rt_flags &= ~RTCF_LOCAL;
2027 }
2028 rth->rt_type = res.type;
2029 hash = rt_hash(daddr, saddr, fl.iif);
2030 err = rt_intern_hash(hash, rth, &skb->rtable);
2031 goto done;
2032
2033no_route:
2034 RT_CACHE_STAT_INC(in_no_route);
2035 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2036 res.type = RTN_UNREACHABLE;
2037 if (err == -ESRCH)
2038 err = -ENETUNREACH;
2039 goto local_input;
2040
2041 /*
2042 * Do not cache martian addresses: they should be logged (RFC1812)
2043 */
2044martian_destination:
2045 RT_CACHE_STAT_INC(in_martian_dst);
2046#ifdef CONFIG_IP_ROUTE_VERBOSE
2047 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2048 printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from "
2049 NIPQUAD_FMT ", dev %s\n",
2050 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2051#endif
2052
2053e_hostunreach:
2054 err = -EHOSTUNREACH;
2055 goto done;
2056
2057e_inval:
2058 err = -EINVAL;
2059 goto done;
2060
2061e_nobufs:
2062 err = -ENOBUFS;
2063 goto done;
2064
2065martian_source:
2066 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2067 goto e_inval;
2068}
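
/* Note: every failure label above funnels through "done", so the
 * in_dev reference and (when free_res is set) the fib_result taken
 * earlier are always dropped exactly once.
 */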
2069
2070int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2071 u8 tos, struct net_device *dev)
2072{
2073 struct rtable * rth;
2074 unsigned hash;
2075 int iif = dev->ifindex;
2076 struct net *net;
2077
2078 net = dev_net(dev);
2079 tos &= IPTOS_RT_MASK;
2080 hash = rt_hash(daddr, saddr, iif);
2081
2082 rcu_read_lock();
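	/* The comparison below folds all flow-key checks into a single
	 * branch: each XOR is zero only when the two fields are equal and
	 * OR accumulates any difference, so the whole expression is zero
	 * iff dst, src, iif and tos all match and oif is zero (input
	 * routes carry no oif).
	 */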
2083 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2084 rth = rcu_dereference(rth->u.dst.rt_next)) {
2085 if (((rth->fl.fl4_dst ^ daddr) |
2086 (rth->fl.fl4_src ^ saddr) |
2087 (rth->fl.iif ^ iif) |
2088 rth->fl.oif |
2089 (rth->fl.fl4_tos ^ tos)) == 0 &&
2090 rth->fl.mark == skb->mark &&
2091 net_eq(dev_net(rth->u.dst.dev), net) &&
2092 rth->rt_genid == atomic_read(&rt_genid)) {
2093 dst_use(&rth->u.dst, jiffies);
2094 RT_CACHE_STAT_INC(in_hit);
2095 rcu_read_unlock();
2096 skb->rtable = rth;
2097 return 0;
2098 }
2099 RT_CACHE_STAT_INC(in_hlist_search);
2100 }
2101 rcu_read_unlock();
2102
2103 /* Multicast recognition logic was moved from the route cache to
2104 here. The problem was that too many Ethernet cards have
2105 broken/missing hardware multicast filters :-( As a result, a host
2106 on a multicast network acquires a lot of useless route cache
2107 entries, e.g. for SDR messages from all over the world. Now we
2108 try to get rid of them. Provided the software IP multicast filter
2109 is organized reasonably (at least hashed), this does not cause a
2110 slowdown compared with route cache reject entries.
2111 Note that multicast routers are not affected, because a route
2112 cache entry is created for them eventually.
2113 */
2114 if (ipv4_is_multicast(daddr)) {
2115 struct in_device *in_dev;
2116
2117 rcu_read_lock();
2118 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2119 int our = ip_check_mc(in_dev, daddr, saddr,
2120 ip_hdr(skb)->protocol);
2121 if (our
2122#ifdef CONFIG_IP_MROUTE
2123 || (!ipv4_is_local_multicast(daddr) &&
2124 IN_DEV_MFORWARD(in_dev))
2125#endif
2126 ) {
2127 rcu_read_unlock();
2128 return ip_route_input_mc(skb, daddr, saddr,
2129 tos, dev, our);
2130 }
2131 }
2132 rcu_read_unlock();
2133 return -EINVAL;
2134 }
2135 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2136}
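
/* Illustrative caller sketch (not part of this file): the ingress path,
 * e.g. ip_rcv_finish(), typically routes an input skb roughly like this;
 * treat the snippet as an example rather than the exact upstream code.
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *
 *	if (skb->rtable == NULL &&
 *	    ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev))
 *		goto drop;	// martian address, no route, or no memory
 *	// skb->rtable is now set; dst_input(skb) will dispatch to
 *	// rth->u.dst.input() (ip_local_deliver, ip_forward, ...).
 */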
2137
2138static int __mkroute_output(struct rtable **result,
2139 struct fib_result *res,
2140 const struct flowi *fl,
2141 const struct flowi *oldflp,
2142 struct net_device *dev_out,
2143 unsigned flags)
2144{
2145 struct rtable *rth;
2146 struct in_device *in_dev;
2147 u32 tos = RT_FL_TOS(oldflp);
2148 int err = 0;
2149
2150 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2151 return -EINVAL;
2152
2153 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2154 res->type = RTN_BROADCAST;
2155 else if (ipv4_is_multicast(fl->fl4_dst))
2156 res->type = RTN_MULTICAST;
2157 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2158 return -EINVAL;
2159
2160 if (dev_out->flags & IFF_LOOPBACK)
2161 flags |= RTCF_LOCAL;
2162
2163 /* get work reference to inet device */
2164 in_dev = in_dev_get(dev_out);
2165 if (!in_dev)
2166 return -EINVAL;
2167
2168 if (res->type == RTN_BROADCAST) {
2169 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2170 if (res->fi) {
2171 fib_info_put(res->fi);
2172 res->fi = NULL;
2173 }
2174 } else if (res->type == RTN_MULTICAST) {
2175 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2176 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2177 oldflp->proto))
2178 flags &= ~RTCF_LOCAL;
2179 /* If no multicast route exists, use the
2180 default one, but do not use a gateway in
2181 this case. Yes, it is a hack.
2182 */
2183 if (res->fi && res->prefixlen < 4) {
2184 fib_info_put(res->fi);
2185 res->fi = NULL;
2186 }
2187 }
2188
2189
2190 rth = dst_alloc(&ipv4_dst_ops);
2191 if (!rth) {
2192 err = -ENOBUFS;
2193 goto cleanup;
2194 }
2195
2196 atomic_set(&rth->u.dst.__refcnt, 1);
2197 rth->u.dst.flags= DST_HOST;
2198 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2199 rth->u.dst.flags |= DST_NOXFRM;
2200 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2201 rth->u.dst.flags |= DST_NOPOLICY;
2202
2203 rth->fl.fl4_dst = oldflp->fl4_dst;
2204 rth->fl.fl4_tos = tos;
2205 rth->fl.fl4_src = oldflp->fl4_src;
2206 rth->fl.oif = oldflp->oif;
2207 rth->fl.mark = oldflp->mark;
2208 rth->rt_dst = fl->fl4_dst;
2209 rth->rt_src = fl->fl4_src;
2210 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2211 /* get references to the devices that are to be held by the
2212 routing cache entry */
2213 rth->u.dst.dev = dev_out;
2214 dev_hold(dev_out);
2215 rth->idev = in_dev_get(dev_out);
2216 rth->rt_gateway = fl->fl4_dst;
2217 rth->rt_spec_dst= fl->fl4_src;
2218
2219 rth->u.dst.output=ip_output;
2220 rth->rt_genid = atomic_read(&rt_genid);
2221
2222 RT_CACHE_STAT_INC(out_slow_tot);
2223
2224 if (flags & RTCF_LOCAL) {
2225 rth->u.dst.input = ip_local_deliver;
2226 rth->rt_spec_dst = fl->fl4_dst;
2227 }
2228 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2229 rth->rt_spec_dst = fl->fl4_src;
2230 if (flags & RTCF_LOCAL &&
2231 !(dev_out->flags & IFF_LOOPBACK)) {
2232 rth->u.dst.output = ip_mc_output;
2233 RT_CACHE_STAT_INC(out_slow_mc);
2234 }
2235#ifdef CONFIG_IP_MROUTE
2236 if (res->type == RTN_MULTICAST) {
2237 if (IN_DEV_MFORWARD(in_dev) &&
2238 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2239 rth->u.dst.input = ip_mr_input;
2240 rth->u.dst.output = ip_mc_output;
2241 }
2242 }
2243#endif
2244 }
2245
2246 rt_set_nexthop(rth, res, 0);
2247
2248 rth->rt_flags = flags;
2249
2250 *result = rth;
2251 cleanup:
2252 /* release work reference to inet device */
2253 in_dev_put(in_dev);
2254
2255 return err;
2256}
2257
2258static int ip_mkroute_output(struct rtable **rp,
2259 struct fib_result *res,
2260 const struct flowi *fl,
2261 const struct flowi *oldflp,
2262 struct net_device *dev_out,
2263 unsigned flags)
2264{
2265 struct rtable *rth = NULL;
2266 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2267 unsigned hash;
2268 if (err == 0) {
2269 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2270 err = rt_intern_hash(hash, rth, rp);
2271 }
2272
2273 return err;
2274}
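
/* Both slow paths end the same way: hash the flow key with rt_hash()
 * and hand the fresh entry to rt_intern_hash(), which either links it
 * into the bucket chain or reuses a compatible entry that got there
 * first.
 */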
2275
2276/*
2277 * Major route resolver routine.
2278 */
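
/*
 * Resolution proceeds roughly as follows: validate any explicit source
 * address, honour a forced output interface (oldflp->oif), fall back to
 * a local loopback route when no destination is given, consult the FIB
 * (assuming an on-link destination if that fails with an oif set), then
 * select a nexthop (multipath or default route where applicable) and
 * finally build and hash the cache entry via ip_mkroute_output().
 */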
2279
2280static int ip_route_output_slow(struct net *net, struct rtable **rp,
2281 const struct flowi *oldflp)
2282{
2283 u32 tos = RT_FL_TOS(oldflp);
2284 struct flowi fl = { .nl_u = { .ip4_u =
2285 { .daddr = oldflp->fl4_dst,
2286 .saddr = oldflp->fl4_src,
2287 .tos = tos & IPTOS_RT_MASK,
2288 .scope = ((tos & RTO_ONLINK) ?
2289 RT_SCOPE_LINK :
2290 RT_SCOPE_UNIVERSE),
2291 } },
2292 .mark = oldflp->mark,
2293 .iif = net->loopback_dev->ifindex,
2294 .oif = oldflp->oif };
2295 struct fib_result res;
2296 unsigned flags = 0;
2297 struct net_device *dev_out = NULL;
2298 int free_res = 0;
2299 int err;
2300
2301
2302 res.fi = NULL;
2303#ifdef CONFIG_IP_MULTIPLE_TABLES
2304 res.r = NULL;
2305#endif
2306
2307 if (oldflp->fl4_src) {
2308 err = -EINVAL;
2309 if (ipv4_is_multicast(oldflp->fl4_src) ||
2310 ipv4_is_lbcast(oldflp->fl4_src) ||
2311 ipv4_is_zeronet(oldflp->fl4_src))
2312 goto out;
2313
2314 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2315 dev_out = ip_dev_find(net, oldflp->fl4_src);
2316 if (dev_out == NULL)
2317 goto out;
2318
2319 /* I removed the check for oif == dev_out->oif here.
2320 It was wrong for two reasons:
2321 1. ip_dev_find(net, saddr) can return the wrong iface if
2322 saddr is assigned to multiple interfaces.
2323 2. Moreover, we are allowed to send packets with the saddr
2324 of another iface. --ANK
2325 */
2326
2327 if (oldflp->oif == 0
2328 && (ipv4_is_multicast(oldflp->fl4_dst) ||
2329 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2330 /* Special hack: the user can direct multicasts
2331 and limited broadcast via the necessary interface
2332 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2333 This hack is not just for fun; it allows
2334 vic, vat and friends to work.
2335 They bind a socket to loopback, set ttl to zero
2336 and expect that it will work.
2337 From the viewpoint of the routing cache they are broken,
2338 because we are not allowed to build a multicast path
2339 with a loopback source addr (the routing cache
2340 cannot know that ttl is zero, so the packet
2341 will not leave this host and the route is valid).
2342 Luckily, this hack is a good enough workaround.
2343 */
2344
2345 fl.oif = dev_out->ifindex;
2346 goto make_route;
2347 }
2348 if (dev_out)
2349 dev_put(dev_out);
2350 dev_out = NULL;
2351 }
2352
2353
2354 if (oldflp->oif) {
2355 dev_out = dev_get_by_index(net, oldflp->oif);
2356 err = -ENODEV;
2357 if (dev_out == NULL)
2358 goto out;
2359
2360 /* RACE: Check return value of inet_select_addr instead. */
2361 if (__in_dev_get_rtnl(dev_out) == NULL) {
2362 dev_put(dev_out);
2363 goto out; /* Wrong error code */
2364 }
2365
2366 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2367 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2368 if (!fl.fl4_src)
2369 fl.fl4_src = inet_select_addr(dev_out, 0,
2370 RT_SCOPE_LINK);
2371 goto make_route;
2372 }
2373 if (!fl.fl4_src) {
2374 if (ipv4_is_multicast(oldflp->fl4_dst))
2375 fl.fl4_src = inet_select_addr(dev_out, 0,
2376 fl.fl4_scope);
2377 else if (!oldflp->fl4_dst)
2378 fl.fl4_src = inet_select_addr(dev_out, 0,
2379 RT_SCOPE_HOST);
2380 }
2381 }
2382
2383 if (!fl.fl4_dst) {
2384 fl.fl4_dst = fl.fl4_src;
2385 if (!fl.fl4_dst)
2386 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2387 if (dev_out)
2388 dev_put(dev_out);
2389 dev_out = net->loopback_dev;
2390 dev_hold(dev_out);
2391 fl.oif = net->loopback_dev->ifindex;
2392 res.type = RTN_LOCAL;
2393 flags |= RTCF_LOCAL;
2394 goto make_route;
2395 }
2396
2397 if (fib_lookup(net, &fl, &res)) {
2398 res.fi = NULL;
2399 if (oldflp->oif) {
2400 /* Apparently, the routing tables are wrong. Assume
2401 that the destination is on-link.
2402
2403 WHY? DW.
2404 Because we are allowed to send to an iface
2405 even if it has NO routes and NO assigned
2406 addresses. When oif is specified, the routing
2407 tables are consulted with only one purpose:
2408 to catch whether the destination is gatewayed rather
2409 than direct. Moreover, if MSG_DONTROUTE is set,
2410 we send the packet, ignoring both routing tables
2411 and ifaddr state. --ANK
2412
2413
2414 We could do this even when oif is unknown
2415 (as IPv6 likely does), but we do not.
2416 */
2417
2418 if (fl.fl4_src == 0)
2419 fl.fl4_src = inet_select_addr(dev_out, 0,
2420 RT_SCOPE_LINK);
2421 res.type = RTN_UNICAST;
2422 goto make_route;
2423 }
2424 if (dev_out)
2425 dev_put(dev_out);
2426 err = -ENETUNREACH;
2427 goto out;
2428 }
2429 free_res = 1;
2430
2431 if (res.type == RTN_LOCAL) {
2432 if (!fl.fl4_src)
2433 fl.fl4_src = fl.fl4_dst;
2434 if (dev_out)
2435 dev_put(dev_out);
2436 dev_out = net->loopback_dev;
2437 dev_hold(dev_out);
2438 fl.oif = dev_out->ifindex;
2439 if (res.fi)
2440 fib_info_put(res.fi);
2441 res.fi = NULL;
2442 flags |= RTCF_LOCAL;
2443 goto make_route;
2444 }
2445
2446#ifdef CONFIG_IP_ROUTE_MULTIPATH
2447 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2448 fib_select_multipath(&fl, &res);
2449 else
2450#endif
2451 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2452 fib_select_default(net, &fl, &res);
2453
2454 if (!fl.fl4_src)
2455 fl.fl4_src = FIB_RES_PREFSRC(res);
2456
2457 if (dev_out)
2458 dev_put(dev_out);
2459 dev_out = FIB_RES_DEV(res);
2460 dev_hold(dev_out);
2461 fl.oif = dev_out->ifindex;
2462
2463
2464make_route:
2465 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2466
2467
2468 if (free_res)
2469 fib_res_put(&res);
2470 if (dev_out)
2471 dev_put(dev_out);
2472out: return err;
2473}
2474
2475int __ip_route_output_key(struct net *net, struct rtable **rp,
2476 const struct flowi *flp)
2477{
2478 unsigned hash;
2479 struct rtable *rth;
2480
2481 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2482
2483 rcu_read_lock_bh();
2484 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2485 rth = rcu_dereference(rth->u.dst.rt_next)) {
2486 if (rth->fl.fl4_dst == flp->fl4_dst &&
2487 rth->fl.fl4_src == flp->fl4_src &&
2488 rth->fl.iif == 0 &&
2489 rth->fl.oif == flp->oif &&
2490 rth->fl.mark == flp->mark &&
2491 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2492 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2493 net_eq(dev_net(rth->u.dst.dev), net) &&
2494 rth->rt_genid == atomic_read(&rt_genid)) {
2495 dst_use(&rth->u.dst, jiffies);
2496 RT_CACHE_STAT_INC(out_hit);
2497 rcu_read_unlock_bh();
2498 *rp = rth;
2499 return 0;
2500 }
2501 RT_CACHE_STAT_INC(out_hlist_search);
2502 }
2503 rcu_read_unlock_bh();
2504
2505 return ip_route_output_slow(net, rp, flp);
2506}
2507
2508EXPORT_SYMBOL_GPL(__ip_route_output_key);
2509
2510static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2511{
2512}
2513
2514static struct dst_ops ipv4_dst_blackhole_ops = {
2515 .family = AF_INET,
2516 .protocol = __constant_htons(ETH_P_IP),
2517 .destroy = ipv4_dst_destroy,
2518 .check = ipv4_dst_check,
2519 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2520 .entry_size = sizeof(struct rtable),
2521 .entries = ATOMIC_INIT(0),
2522};
2523
2524
2525static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
2526{
2527 struct rtable *ort = *rp;
2528 struct rtable *rt = (struct rtable *)
2529 dst_alloc(&ipv4_dst_blackhole_ops);
2530
2531 if (rt) {
2532 struct dst_entry *new = &rt->u.dst;
2533
2534 atomic_set(&new->__refcnt, 1);
2535 new->__use = 1;
2536 new->input = dst_discard;
2537 new->output = dst_discard;
2538 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2539
2540 new->dev = ort->u.dst.dev;
2541 if (new->dev)
2542 dev_hold(new->dev);
2543
2544 rt->fl = ort->fl;
2545
2546 rt->idev = ort->idev;
2547 if (rt->idev)
2548 in_dev_hold(rt->idev);
2549 rt->rt_genid = atomic_read(&rt_genid);
2550 rt->rt_flags = ort->rt_flags;
2551 rt->rt_type = ort->rt_type;
2552 rt->rt_dst = ort->rt_dst;
2553 rt->rt_src = ort->rt_src;
2554 rt->rt_iif = ort->rt_iif;
2555 rt->rt_gateway = ort->rt_gateway;
2556 rt->rt_spec_dst = ort->rt_spec_dst;
2557 rt->peer = ort->peer;
2558 if (rt->peer)
2559 atomic_inc(&rt->peer->refcnt);
2560
2561 dst_free(new);
2562 }
2563
2564 dst_release(&(*rp)->u.dst);
2565 *rp = rt;
2566 return (rt ? 0 : -ENOMEM);
2567}
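
/* The blackhole copy above is used by ip_route_output_flow() below when
 * __xfrm_lookup() returns -EREMOTE: the resolved route is swapped for
 * one whose input/output handlers are dst_discard(), so traffic is
 * silently dropped while the flow key and metrics stay intact.
 */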
2568
2569int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2570 struct sock *sk, int flags)
2571{
2572 int err;
2573
2574 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2575 return err;
2576
2577 if (flp->proto) {
2578 if (!flp->fl4_src)
2579 flp->fl4_src = (*rp)->rt_src;
2580 if (!flp->fl4_dst)
2581 flp->fl4_dst = (*rp)->rt_dst;
2582 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2583 flags ? XFRM_LOOKUP_WAIT : 0);
2584 if (err == -EREMOTE)
2585 err = ipv4_dst_blackhole(rp, flp);
2586
2587 return err;
2588 }
2589
2590 return 0;
2591}
2592
2593EXPORT_SYMBOL_GPL(ip_route_output_flow);
2594
2595int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2596{
2597 return ip_route_output_flow(net, rp, flp, NULL, 0);
2598}
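
/* Illustrative usage sketch (hypothetical caller, example values): a
 * typical output lookup fills a flow key and lets the resolver pick the
 * source address and device.
 *
 *	struct flowi fl = {
 *		.nl_u = { .ip4_u = { .daddr = dst_ip,	// destination to reach
 *				     .saddr = 0,	// 0: let the kernel choose
 *				     .tos = RT_TOS(tos) } },
 *		.oif = 0,				// 0: any interface
 *	};
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(net, &rt, &fl))
 *		return -EHOSTUNREACH;	// no route to host
 *	// use rt->rt_src / rt->u.dst.dev, then drop the ref with ip_rt_put(rt)
 */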
2599
2600static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2601 int nowait, unsigned int flags)
2602{
2603 struct rtable *rt = skb->rtable;
2604 struct rtmsg *r;
2605 struct nlmsghdr *nlh;
2606 long expires;
2607 u32 id = 0, ts = 0, tsage = 0, error;
2608
2609 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2610 if (nlh == NULL)
2611 return -EMSGSIZE;
2612
2613 r = nlmsg_data(nlh);
2614 r->rtm_family = AF_INET;
2615 r->rtm_dst_len = 32;
2616 r->rtm_src_len = 0;
2617 r->rtm_tos = rt->fl.fl4_tos;
2618 r->rtm_table = RT_TABLE_MAIN;
2619 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2620 r->rtm_type = rt->rt_type;
2621 r->rtm_scope = RT_SCOPE_UNIVERSE;
2622 r->rtm_protocol = RTPROT_UNSPEC;
2623 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2624 if (rt->rt_flags & RTCF_NOTIFY)
2625 r->rtm_flags |= RTM_F_NOTIFY;
2626
2627 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2628
2629 if (rt->fl.fl4_src) {
2630 r->rtm_src_len = 32;
2631 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2632 }
2633 if (rt->u.dst.dev)
2634 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2635#ifdef CONFIG_NET_CLS_ROUTE
2636 if (rt->u.dst.tclassid)
2637 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2638#endif
2639 if (rt->fl.iif)
2640 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2641 else if (rt->rt_src != rt->fl.fl4_src)
2642 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2643
2644 if (rt->rt_dst != rt->rt_gateway)
2645 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2646
2647 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2648 goto nla_put_failure;
2649
2650 error = rt->u.dst.error;
2651 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2652 if (rt->peer) {
2653 id = rt->peer->ip_id_count;
2654 if (rt->peer->tcp_ts_stamp) {
2655 ts = rt->peer->tcp_ts;
2656 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2657 }
2658 }
2659
2660 if (rt->fl.iif) {
2661#ifdef CONFIG_IP_MROUTE
2662 __be32 dst = rt->rt_dst;
2663
2664 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2665 IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2666 int err = ipmr_get_route(skb, r, nowait);
2667 if (err <= 0) {
2668 if (!nowait) {
2669 if (err == 0)
2670 return 0;
2671 goto nla_put_failure;
2672 } else {
2673 if (err == -EMSGSIZE)
2674 goto nla_put_failure;
2675 error = err;
2676 }
2677 }
2678 } else
2679#endif
2680 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2681 }
2682
2683 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2684 expires, error) < 0)
2685 goto nla_put_failure;
2686
2687 return nlmsg_end(skb, nlh);
2688
2689nla_put_failure:
2690 nlmsg_cancel(skb, nlh);
2691 return -EMSGSIZE;
2692}
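
/* The resulting netlink message mirrors a cloned cache entry: an rtmsg
 * header, RTA_TABLE, RTA_DST (and RTA_SRC when the flow had one), the
 * output device, optional classid, preferred source, gateway and
 * metrics, followed by cache info (peer id, timestamps, expiry, error).
 */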
2693
2694static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2695{
2696 struct net *net = sock_net(in_skb->sk);
2697 struct rtmsg *rtm;
2698 struct nlattr *tb[RTA_MAX+1];
2699 struct rtable *rt = NULL;
2700 __be32 dst = 0;
2701 __be32 src = 0;
2702 u32 iif;
2703 int err;
2704 struct sk_buff *skb;
2705
2706 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2707 if (err < 0)
2708 goto errout;
2709
2710 rtm = nlmsg_data(nlh);
2711
2712 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2713 if (skb == NULL) {
2714 err = -ENOBUFS;
2715 goto errout;
2716 }
2717
2718 /* Reserve room for dummy headers; this skb can pass
2719 through a good chunk of the routing engine.
2720 */
2721 skb_reset_mac_header(skb);
2722 skb_reset_network_header(skb);
2723
2724 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2725 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2726 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2727
2728 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2729 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2730 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2731
2732 if (iif) {
2733 struct net_device *dev;
2734
2735 dev = __dev_get_by_index(net, iif);
2736 if (dev == NULL) {
2737 err = -ENODEV;
2738 goto errout_free;
2739 }
2740
2741 skb->protocol = htons(ETH_P_IP);
2742 skb->dev = dev;
2743 local_bh_disable();
2744 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2745 local_bh_enable();
2746
2747 rt = skb->rtable;
2748 if (err == 0 && rt->u.dst.error)
2749 err = -rt->u.dst.error;
2750 } else {
2751 struct flowi fl = {
2752 .nl_u = {
2753 .ip4_u = {
2754 .daddr = dst,
2755 .saddr = src,
2756 .tos = rtm->rtm_tos,
2757 },
2758 },
2759 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2760 };
2761 err = ip_route_output_key(net, &rt, &fl);
2762 }
2763
2764 if (err)
2765 goto errout_free;
2766
2767 skb->rtable = rt;
2768 if (rtm->rtm_flags & RTM_F_NOTIFY)
2769 rt->rt_flags |= RTCF_NOTIFY;
2770
2771 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2772 RTM_NEWROUTE, 0, 0);
2773 if (err <= 0)
2774 goto errout_free;
2775
2776 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2777errout:
2778 return err;
2779
2780errout_free:
2781 kfree_skb(skb);
2782 goto errout;
2783}
2784
2785int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2786{
2787 struct rtable *rt;
2788 int h, s_h;
2789 int idx, s_idx;
2790 struct net *net;
2791
2792 net = sock_net(skb->sk);
2793
2794 s_h = cb->args[0];
2795 if (s_h < 0)
2796 s_h = 0;
2797 s_idx = idx = cb->args[1];
2798 for (h = s_h; h <= rt_hash_mask; h++) {
2799 rcu_read_lock_bh();
2800 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2801 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2802 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
2803 continue;
2804 if (rt->rt_genid != atomic_read(&rt_genid))
2805 continue;
2806 skb->dst = dst_clone(&rt->u.dst);
2807 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2808 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2809 1, NLM_F_MULTI) <= 0) {
2810 dst_release(xchg(&skb->dst, NULL));
2811 rcu_read_unlock_bh();
2812 goto done;
2813 }
2814 dst_release(xchg(&skb->dst, NULL));
2815 }
2816 rcu_read_unlock_bh();
2817 s_idx = 0;
2818 }
2819
2820done:
2821 cb->args[0] = h;
2822 cb->args[1] = idx;
2823 return skb->len;
2824}
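
/* cb->args[0] (hash bucket) and cb->args[1] (chain index) persist
 * between calls, so an NLM_F_MULTI dump resumes where the previous
 * skb filled up.
 */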
2825
2826void ip_rt_multicast_event(struct in_device *in_dev)
2827{
2828 rt_cache_flush(dev_net(in_dev->dev), 0);
2829}
2830
2831#ifdef CONFIG_SYSCTL
2832static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2833 struct file *filp, void __user *buffer,
2834 size_t *lenp, loff_t *ppos)
2835{
2836 if (write) {
2837 int flush_delay;
2838 struct net *net;
2839 static DEFINE_MUTEX(flush_mutex);
2840
2841 mutex_lock(&flush_mutex);
2842 ctl->data = &flush_delay;
2843 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2844 ctl->data = NULL;
2845 mutex_unlock(&flush_mutex);
2846
2847 net = (struct net *)ctl->extra1;
2848 rt_cache_flush(net, flush_delay);
2849 return 0;
2850 }
2851
2852 return -EINVAL;
2853}
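
/* From userspace the handler above is reached by writing a flush delay
 * to the sysctl, for example (exact value semantics aside, any write
 * triggers rt_cache_flush() for the namespace stashed in ->extra1):
 *
 *	sysctl -w net.ipv4.route.flush=0
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 */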
2854
2855static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2856 int __user *name,
2857 int nlen,
2858 void __user *oldval,
2859 size_t __user *oldlenp,
2860 void __user *newval,
2861 size_t newlen)
2862{
2863 int delay;
2864 struct net *net;
2865 if (newlen != sizeof(int))
2866 return -EINVAL;
2867 if (get_user(delay, (int __user *)newval))
2868 return -EFAULT;
2869 net = (struct net *)table->extra1;
2870 rt_cache_flush(net, delay);
2871 return 0;
2872}
2873
2874ctl_table ipv4_route_table[] = {
2875 {
2876 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2877 .procname = "gc_thresh",
2878 .data = &ipv4_dst_ops.gc_thresh,
2879 .maxlen = sizeof(int),
2880 .mode = 0644,
2881 .proc_handler = &proc_dointvec,
2882 },
2883 {
2884 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2885 .procname = "max_size",
2886 .data = &ip_rt_max_size,
2887 .maxlen = sizeof(int),
2888 .mode = 0644,
2889 .proc_handler = &proc_dointvec,
2890 },
2891 {
2892 /* Deprecated. Use gc_min_interval_ms */
2893
2894 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2895 .procname = "gc_min_interval",
2896 .data = &ip_rt_gc_min_interval,
2897 .maxlen = sizeof(int),
2898 .mode = 0644,
2899 .proc_handler = &proc_dointvec_jiffies,
2900 .strategy = &sysctl_jiffies,
2901 },
2902 {
2903 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2904 .procname = "gc_min_interval_ms",
2905 .data = &ip_rt_gc_min_interval,
2906 .maxlen = sizeof(int),
2907 .mode = 0644,
2908 .proc_handler = &proc_dointvec_ms_jiffies,
2909 .strategy = &sysctl_ms_jiffies,
2910 },
2911 {
2912 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2913 .procname = "gc_timeout",
2914 .data = &ip_rt_gc_timeout,
2915 .maxlen = sizeof(int),
2916 .mode = 0644,
2917 .proc_handler = &proc_dointvec_jiffies,
2918 .strategy = &sysctl_jiffies,
2919 },
2920 {
2921 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2922 .procname = "gc_interval",
2923 .data = &ip_rt_gc_interval,
2924 .maxlen = sizeof(int),
2925 .mode = 0644,
2926 .proc_handler = &proc_dointvec_jiffies,
2927 .strategy = &sysctl_jiffies,
2928 },
2929 {
2930 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2931 .procname = "redirect_load",
2932 .data = &ip_rt_redirect_load,
2933 .maxlen = sizeof(int),
2934 .mode = 0644,
2935 .proc_handler = &proc_dointvec,
2936 },
2937 {
2938 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2939 .procname = "redirect_number",
2940 .data = &ip_rt_redirect_number,
2941 .maxlen = sizeof(int),
2942 .mode = 0644,
2943 .proc_handler = &proc_dointvec,
2944 },
2945 {
2946 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2947 .procname = "redirect_silence",
2948 .data = &ip_rt_redirect_silence,
2949 .maxlen = sizeof(int),
2950 .mode = 0644,
2951 .proc_handler = &proc_dointvec,
2952 },
2953 {
2954 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2955 .procname = "error_cost",
2956 .data = &ip_rt_error_cost,
2957 .maxlen = sizeof(int),
2958 .mode = 0644,
2959 .proc_handler = &proc_dointvec,
2960 },
2961 {
2962 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2963 .procname = "error_burst",
2964 .data = &ip_rt_error_burst,
2965 .maxlen = sizeof(int),
2966 .mode = 0644,
2967 .proc_handler = &proc_dointvec,
2968 },
2969 {
2970 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2971 .procname = "gc_elasticity",
2972 .data = &ip_rt_gc_elasticity,
2973 .maxlen = sizeof(int),
2974 .mode = 0644,
2975 .proc_handler = &proc_dointvec,
2976 },
2977 {
2978 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2979 .procname = "mtu_expires",
2980 .data = &ip_rt_mtu_expires,
2981 .maxlen = sizeof(int),
2982 .mode = 0644,
2983 .proc_handler = &proc_dointvec_jiffies,
2984 .strategy = &sysctl_jiffies,
2985 },
2986 {
2987 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2988 .procname = "min_pmtu",
2989 .data = &ip_rt_min_pmtu,
2990 .maxlen = sizeof(int),
2991 .mode = 0644,
2992 .proc_handler = &proc_dointvec,
2993 },
2994 {
2995 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2996 .procname = "min_adv_mss",
2997 .data = &ip_rt_min_advmss,
2998 .maxlen = sizeof(int),
2999 .mode = 0644,
3000 .proc_handler = &proc_dointvec,
3001 },
3002 {
3003 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3004 .procname = "secret_interval",
3005 .data = &ip_rt_secret_interval,
3006 .maxlen = sizeof(int),
3007 .mode = 0644,
3008 .proc_handler = &proc_dointvec_jiffies,
3009 .strategy = &sysctl_jiffies,
3010 },
3011 { .ctl_name = 0 }
3012};
3013
3014static __net_initdata struct ctl_path ipv4_route_path[] = {
3015 { .procname = "net", .ctl_name = CTL_NET, },
3016 { .procname = "ipv4", .ctl_name = NET_IPV4, },
3017 { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3018 { },
3019};
3020
3021
3022static struct ctl_table ipv4_route_flush_table[] = {
3023 {
3024 .ctl_name = NET_IPV4_ROUTE_FLUSH,
3025 .procname = "flush",
3026 .maxlen = sizeof(int),
3027 .mode = 0200,
3028 .proc_handler = &ipv4_sysctl_rtcache_flush,
3029 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
3030 },
3031 { .ctl_name = 0 },
3032};
3033
3034static __net_init int sysctl_route_net_init(struct net *net)
3035{
3036 struct ctl_table *tbl;
3037
3038 tbl = ipv4_route_flush_table;
3039 if (net != &init_net) {
3040 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3041 if (tbl == NULL)
3042 goto err_dup;
3043 }
3044 tbl[0].extra1 = net;
3045
3046 net->ipv4.route_hdr =
3047 register_net_sysctl_table(net, ipv4_route_path, tbl);
3048 if (net->ipv4.route_hdr == NULL)
3049 goto err_reg;
3050 return 0;
3051
3052err_reg:
3053 if (tbl != ipv4_route_flush_table)
3054 kfree(tbl);
3055err_dup:
3056 return -ENOMEM;
3057}
3058
3059static __net_exit void sysctl_route_net_exit(struct net *net)
3060{
3061 struct ctl_table *tbl;
3062
3063 tbl = net->ipv4.route_hdr->ctl_table_arg;
3064 unregister_net_sysctl_table(net->ipv4.route_hdr);
3065 BUG_ON(tbl == ipv4_route_flush_table);
3066 kfree(tbl);
3067}
3068
3069static __net_initdata struct pernet_operations sysctl_route_ops = {
3070 .init = sysctl_route_net_init,
3071 .exit = sysctl_route_net_exit,
3072};
3073#endif
3074
3075#ifdef CONFIG_NET_CLS_ROUTE
3076struct ip_rt_acct *ip_rt_acct __read_mostly;
3077#endif /* CONFIG_NET_CLS_ROUTE */
3078
3079static __initdata unsigned long rhash_entries;
3080static int __init set_rhash_entries(char *str)
3081{
3082 if (!str)
3083 return 0;
3084 rhash_entries = simple_strtoul(str, &str, 0);
3085 return 1;
3086}
3087__setup("rhash_entries=", set_rhash_entries);
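
/* "rhash_entries" is a boot parameter: booting with e.g.
 * rhash_entries=262144 (an example value) overrides the memory-scaled
 * default that alloc_large_system_hash() would otherwise pick in
 * ip_rt_init() below.
 */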
3088
3089int __init ip_rt_init(void)
3090{
3091 int rc = 0;
3092
3093 atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
3094 (jiffies ^ (jiffies >> 7))));
3095
3096#ifdef CONFIG_NET_CLS_ROUTE
3097 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3098 if (!ip_rt_acct)
3099 panic("IP: failed to allocate ip_rt_acct\n");
3100#endif
3101
3102 ipv4_dst_ops.kmem_cachep =
3103 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3104 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3105
3106 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3107
3108 rt_hash_table = (struct rt_hash_bucket *)
3109 alloc_large_system_hash("IP route cache",
3110 sizeof(struct rt_hash_bucket),
3111 rhash_entries,
3112 (num_physpages >= 128 * 1024) ?
3113 15 : 17,
3114 0,
3115 &rt_hash_log,
3116 &rt_hash_mask,
3117 0);
3118 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3119 rt_hash_lock_init();
3120
3121 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3122 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3123
3124 devinet_init();
3125 ip_fib_init();
3126
3127 rt_secret_timer.function = rt_secret_rebuild;
3128 rt_secret_timer.data = 0;
3129 init_timer_deferrable(&rt_secret_timer);
3130
3131 /* All the timers started at system startup tend
3132 to synchronize. Perturb them a bit.
3133 */
3134 schedule_delayed_work(&expires_work,
3135 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3136
3137 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3138 ip_rt_secret_interval;
3139 add_timer(&rt_secret_timer);
3140
3141 if (ip_rt_proc_init())
3142 printk(KERN_ERR "Unable to create route proc files\n");
3143#ifdef CONFIG_XFRM
3144 xfrm_init();
3145 xfrm4_init();
3146#endif
3147 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3148
3149#ifdef CONFIG_SYSCTL
3150 register_pernet_subsys(&sysctl_route_ops);
3151#endif
3152 return rc;
3153}
3154
3155EXPORT_SYMBOL(__ip_select_ident);
3156EXPORT_SYMBOL(ip_route_input);
3157EXPORT_SYMBOL(ip_route_output_key);