/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamping.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#define RT_FL_TOS(oldflp) \
	((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT	(300*HZ)
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
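/*
 * Worked example of the defaults above (editorial note, not from the
 * original file), assuming HZ = 1000:
 *	ip_rt_redirect_load    = HZ / 50          = 20 jiffies  (20 ms)
 *	ip_rt_redirect_silence = (HZ / 50) << 10  = 20480 jiffies (~20.5 s)
 * so redirect transmission backs off in 20 ms units, and a host that
 * keeps ignoring redirects is left alone for roughly twenty seconds
 * before the counters reset (see ip_rt_send_redirect() below).
 */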
static void rt_worker_func(struct work_struct *work);
static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
static struct timer_list rt_secret_timer;
/*
 *	Interface to generic destination cache.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		__constant_htons(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		ip_local_out,
	.entry_size =		sizeof(struct rtable),
	.entries =		ATOMIC_INIT(0),
};
#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
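/*
 * Illustrative sketch of the two sides of this scheme (an editorial
 * example, not part of the original file), using the tables defined
 * below:
 *
 *	reader:
 *		rcu_read_lock_bh();
 *		for (r = rcu_dereference(rt_hash_table[h].chain); r;
 *		     r = rcu_dereference(r->u.dst.rt_next))
 *			...compare keys, take a reference on a match...
 *		rcu_read_unlock_bh();
 *
 *	writer (removal):
 *		spin_lock_bh(rt_hash_lock_addr(h));
 *		*rthp = rth->u.dst.rt_next;	unlink from the chain
 *		rt_free(rth);			freeing is RCU-deferred
 *		spin_unlock_bh(rt_hash_lock_addr(h));
 */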
struct rt_hash_bucket {
	struct rtable	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
 * The size of this table is a power of two and depends on the number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif
static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
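/*
 * Example (editorial note, not from the original file): with
 * RT_HASH_LOCK_SZ == 256, hash buckets 5 and 261 map to the same lock,
 * since 261 & 255 == 5; one lock guards many chains, trading a little
 * contention for memory.
 */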
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
static struct rt_hash_bucket	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log __read_mostly;
static atomic_t			rt_genid __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)
static unsigned int rt_hash_code(u32 daddr, u32 saddr)
{
	return jhash_2words(daddr, saddr, atomic_read(&rt_genid))
		& rt_hash_mask;
}

#define rt_hash(daddr, saddr, idx) \
	rt_hash_code((__force u32)(__be32)(daddr),\
		(__force u32)(__be32)(saddr) ^ ((idx) << 5))
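/*
 * Typical use, as in the lookup paths later in this file:
 *	unsigned hash = rt_hash(daddr, saddr, dev->ifindex);
 * selects the chain rt_hash_table[hash].chain to search or insert into.
 */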
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		rcu_read_lock_bh();
		r = rcu_dereference(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference(r->u.dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	r = r->u.dst.rt_next;
	while (!r) {
		rcu_read_unlock_bh();
		if (--st->bucket < 0)
			break;
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return rcu_dereference(r);
}
static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}
static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;

	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = atomic_read(&rt_genid);
	return SEQ_START_TOKEN;
}
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		char temp[256];

		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst);
		seq_printf(seq, "%-127s\n", temp);
	}
	return 0;
}
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_cache_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,
		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,
		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
#ifdef CONFIG_NET_CLS_ROUTE
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	unsigned int i;

	if ((offset & 3) || (length & 3))
		return -EIO;

	if (offset >= sizeof(struct ip_rt_acct) * 256) {
		*eof = 1;
		return 0;
	}

	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
		length = sizeof(struct ip_rt_acct) * 256 - offset;
		*eof = 1;
	}

	offset /= sizeof(u32);

	if (length > 0) {
		u32 *dst = (u32 *) buffer;

		*start = buffer;
		memset(dst, 0, length);

		for_each_possible_cpu(i) {
			unsigned int j;
			u32 *src;

			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
			for (j = 0; j < length/4; j++)
				dst[j] += src[j];
		}
	}
	return length;
}
#endif
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_NET_CLS_ROUTE
	pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
			ip_rt_acct_read, NULL);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_NET_CLS_ROUTE
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
	remove_proc_entry("rt_acct", net->proc_net);
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static __inline__ void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}
static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in the hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.dst.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}
/* Bits of score are:
 * 31: very valuable entry
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
}
/*
 * Perform a full scan of the hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to be rescheduled if necessary.
 */
static void rt_do_flush(int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		if (process_context && need_resched())
			cond_resched();
		rth = rt_hash_table[i].chain;
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));
		rth = rt_hash_table[i].chain;
		rt_hash_table[i].chain = NULL;
		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; rth; rth = next) {
			next = rth->u.dst.rt_next;
			rt_free(rth);
		}
	}
}
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, **rthp;
	u64 mult;

	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		if (*rthp == NULL)
			continue;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			if (rth->rt_genid != atomic_read(&rt_genid)) {
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->u.dst.expires)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.dst.rt_next;
				continue;
			}

			/* Cleanup aged off entries. */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
	}
	rover = i;
}
/*
 * rt_worker_func() is run in process context.
 * We call rt_check_expire() to scan part of the hash table.
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without reusing a recent rt_genid.
 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
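/*
 * Worked numbers (editorial note, not from the original file): rt_genid
 * is a 32-bit atomic counter and each invalidation adds a value in
 * [1..256], so it takes at least 2^32 / 2^8 = 2^24 invalidations before
 * the counter can wrap around onto a recently used generation id.
 */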
static void rt_cache_invalidate(void)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &rt_genid);
}
/*
 * delay < 0  : invalidate cache (fast: entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(int delay)
{
	rt_cache_invalidate();
	if (delay >= 0)
		rt_do_flush(!in_softirq());
}
/*
 * We change rt_genid and let gc do the cleanup.
 */
static void rt_secret_rebuild(unsigned long dummy)
{
	rt_cache_invalidate();
	mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
}
/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when networking
   is idle expire is large enough to keep enough warm entries,
   and when load increases it shrinks to limit the cache size.
 */
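/*
 * Worked example of the first goal computation below (editorial note,
 * not from the original file): with 4096 hash buckets (rt_hash_log = 12)
 * and the default ip_rt_gc_elasticity of 8,
 *	goal = entries - (8 << 12) = entries - 32768,
 * i.e. collection only has a positive goal once the cache averages more
 * than eight entries per bucket.
 */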
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate the number of entries which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce the cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (rth->rt_genid == atomic_read(&rt_genid) &&
				    !rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire is reduced to zero; otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker;
		     we will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
		if (rth->rt_genid != atomic_read(&rt_genid)) {
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->u.dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->u.dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * when exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.dst.rt_next;
			rt_free(cand);
		}
	}

	/* Try to bind the route to arp only if it is an output
	   route or a unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink the route cache;
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
	if (rt->u.dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
		       NIPQUAD(rt->rt_dst));
		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
		printk("\n");
	}
#endif
	rt_hash_table[hash].chain = rt;
	spin_unlock_bh(rt_hash_lock_addr(hash));
	*rp = rt;
	return 0;
}
void rt_bind_peer(struct rtable *rt, int create)
{
	static DEFINE_SPINLOCK(rt_peer_lock);
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If a peer is attached to the destination, it is never
		   detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp, *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = *rthp) != NULL) {
		if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
			*rthp = aux->u.dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->u.dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	__be32 skeys[2] = { saddr, 0 };
	int ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
	    || ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);

			rthp=&rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    rth->rt_genid != atomic_read(&rt_genid) ||
				    !net_eq(dev_net(rth->u.dst.dev), net)) {
					rthp = &rth->u.dst.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= 0;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
				rt->u.dst.xfrm		= NULL;
				rt->rt_genid		= atomic_read(&rt_genid);
				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->u.dst;
				netevent.new = &rt->u.dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
			"%u.%u.%u.%u ignored.\n"
			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
		       NIPQUAD(saddr), NIPQUAD(daddr));
#endif
	in_dev_put(in_dev);
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif);
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
					  "%u.%u.%u.%u/%02x dropped\n",
				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb->rtable;
	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

	if (!in_dev)
		return;

	if (!IN_DEV_TX_REDIRECTS(in_dev))
		goto out;

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything,
	 * set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		goto out;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (rt->u.dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
				NIPQUAD(rt->rt_src), rt->rt_iif,
				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
	}
out:
	in_dev_put(in_dev);
}
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb->rtable;
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
		case EINVAL:
		default:
			goto out;
		case EHOSTUNREACH:
			code = ICMP_HOST_UNREACH;
			break;
		case ENETUNREACH:
			code = ICMP_NET_UNREACH;
			IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
			break;
		case EACCES:
			code = ICMP_PKT_FILTERED;
			break;
	}

	now = jiffies;
	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
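/*
 * Example (editorial note, not from the original file): an ICMP
 * Fragmentation Needed message quoting old_mtu = 1500 with no usable
 * next-hop MTU yields guess_mtu(1500) = 1492, the next plateau below
 * the quoted value; anything at or below 128 falls through to the
 * 68-byte IPv4 minimum.
 */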
unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
				 unsigned short new_mtu)
{
	int i;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	__be32 skeys[2] = { iph->saddr, 0, };
	__be32 daddr = iph->daddr;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (i = 0; i < 2; i++) {
		unsigned hash = rt_hash(daddr, skeys[i], 0);

		rcu_read_lock();
		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		     rth = rcu_dereference(rth->u.dst.rt_next)) {
			if (rth->fl.fl4_dst == daddr &&
			    rth->fl.fl4_src == skeys[i] &&
			    rth->rt_dst == daddr &&
			    rth->rt_src == iph->saddr &&
			    rth->fl.iif == 0 &&
			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
			    net_eq(dev_net(rth->u.dst.dev), net) &&
			    rth->rt_genid == atomic_read(&rt_genid)) {
				unsigned short mtu = new_mtu;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
						dst_confirm(&rth->u.dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
		}
		rcu_read_unlock();
	}
	return est_mtu ? : new_mtu;
}
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
		}
		dst->metrics[RTAX_MTU-1] = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
	}
}
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;
	struct in_device *idev = rt->idev;

	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}

	if (idev) {
		rt->idev = NULL;
		in_dev_put(idev);
	}
}
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	struct rtable *rt = (struct rtable *) dst;
	struct in_device *idev = rt->idev;
	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
		struct in_device *loopback_idev =
			in_dev_get(dev_net(dev)->loopback_dev);
		if (loopback_idev) {
			rt->idev = loopback_idev;
			in_dev_put(idev);
		}
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb->rtable;
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}
static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
		NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	__be32 src;
	struct fib_result res;

	if (rt->fl.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
		src = FIB_RES_PREFSRC(res);
		fib_res_put(&res);
	} else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;

	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
				       ip_rt_min_advmss);
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = in_dev_get(dev);
	u32 itag = 0;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
					dev, &spec_dst, &itag) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= init_net.loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_genid	= atomic_read(&rt_genid);
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	if (our) {
		rth->u.dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	in_dev_put(in_dev);
	hash = rt_hash(daddr, saddr, dev->ifindex);
	return rt_intern_hash(hash, rth, &skb->rtable);

e_nobufs:
	in_dev_put(in_dev);
	return -ENOBUFS;

e_inval:
	in_dev_put(in_dev);
	return -EINVAL;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
			"%u.%u.%u.%u, on dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
static inline int __mkroute_input(struct sk_buff *skb,
				  struct fib_result* res,
				  struct in_device *in_dev,
				  __be32 daddr, __be32 saddr, u32 tos,
				  struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		err = -EINVAL;
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 */
		if (out_dev == in_dev) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
	rth->rt_iif	=
		rth->fl.iif	= in_dev->dev->ifindex;
	rth->u.dst.dev	= (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_spec_dst= spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;
	rth->rt_genid = atomic_read(&rt_genid);

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}
static inline int ip_mkroute_input(struct sk_buff *skb,
				   struct fib_result* res,
				   const struct flowi *fl,
				   struct in_device *in_dev,
				   __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable* rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl->iif);
	return rt_intern_hash(hash, rth, &skb->rtable);
}
/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must have the correct destination already attached by the output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
				      } },
			    .mark = skb->mark,
			    .iif = dev->ifindex };
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	int		free_res = 0;
	struct net    * net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I do not even know whether to fix this or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
	    ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route the packet.
	 */
	if ((err = fib_lookup(net, &fl, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos,
					     net->loopback_dev->ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
done:
	in_dev_put(in_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;
	rth->rt_genid = atomic_read(&rt_genid);

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= net->loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input= ip_local_deliver;
	rth->rt_flags	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input= ip_error;
		rth->u.dst.error= -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash(daddr, saddr, fl.iif);
	err = rt_intern_hash(hash, rth, &skb->rtable);
	goto done;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
			"%u.%u.%u.%u, dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto done;

e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}
int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable * rth;
	unsigned	hash;
	int iif = dev->ifindex;
	struct net *net;

	net = dev_net(dev);
	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif);

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (rth->fl.fl4_dst == daddr &&
		    rth->fl.fl4_src == saddr &&
		    rth->fl.iif == iif &&
		    rth->fl.oif == 0 &&
		    rth->fl.mark == skb->mark &&
		    rth->fl.fl4_tos == tos &&
		    net_eq(dev_net(rth->u.dst.dev), net) &&
		    rth->rt_genid == atomic_read(&rt_genid)) {
			dst_use(&rth->u.dst, jiffies);
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb->rtable = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
	rcu_read_unlock();
	/* Multicast recognition logic is moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicast
	   network may acquire a lot of useless route cache entries, e.g. from
	   SDR messages from all over the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev;

		rcu_read_lock();
		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr, saddr,
				ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
			    || (!ipv4_is_local_multicast(daddr) &&
				IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				rcu_read_unlock();
				return ip_route_input_mc(skb, daddr, saddr,
							 tos, dev, our);
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
static inline int __mkroute_output(struct rtable **result,
				   struct fib_result* res,
				   const struct flowi *fl,
				   const struct flowi *oldflp,
				   struct net_device *dev_out,
				   unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);
	int err = 0;

	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
		return -EINVAL;

	if (fl->fl4_dst == htonl(0xFFFFFFFF))
		res->type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	/* get work reference to inet device */
	in_dev = in_dev_get(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		if (res->fi) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST|RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If the multicast route does not exist, use the
		   default one, but do not gateway in this case.
		   Yes, it is a hack.
		 */
		if (res->fi && res->prefixlen < 4) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;

	rth->fl.fl4_dst	= oldflp->fl4_dst;
	rth->fl.fl4_tos	= tos;
	rth->fl.fl4_src	= oldflp->fl4_src;
	rth->fl.oif	= oldflp->oif;
	rth->fl.mark	= oldflp->mark;
	rth->rt_dst	= fl->fl4_dst;
	rth->rt_src	= fl->fl4_src;
	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be held by the routing
	   cache entry */
	rth->u.dst.dev	= dev_out;
	dev_hold(dev_out);
	rth->idev	= in_dev_get(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst= fl->fl4_src;

	rth->u.dst.output=ip_output;
	rth->rt_genid = atomic_read(&rt_genid);

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;

	*result = rth;
cleanup:
	/* release work reference to inet device */
	in_dev_put(in_dev);

	return err;
}
static inline int ip_mkroute_output(struct rtable **rp,
				    struct fib_result* res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
	struct rtable *rth = NULL;
	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
	unsigned hash;
	if (err == 0) {
		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
		err = rt_intern_hash(hash, rth, rp);
	}

	return err;
}
/*
 * Major route resolver routine.
 */

static int ip_route_output_slow(struct net *net, struct rtable **rp,
				const struct flowi *oldflp)
{
	u32 tos	= RT_FL_TOS(oldflp);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = oldflp->fl4_dst,
					.saddr = oldflp->fl4_src,
					.tos = tos & IPTOS_RT_MASK,
					.scope = ((tos & RTO_ONLINK) ?
						  RT_SCOPE_LINK :
						  RT_SCOPE_UNIVERSE),
				      } },
			    .mark = oldflp->mark,
			    .iif = net->loopback_dev->ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned flags = 0;
	struct net_device *dev_out = NULL;
	int free_res = 0;
	int err;

	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (ipv4_is_multicast(oldflp->fl4_src) ||
		    ipv4_is_lbcast(oldflp->fl4_src) ||
		    ipv4_is_zeronet(oldflp->fl4_src))
			goto out;

		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
		dev_out = ip_dev_find(net, oldflp->fl4_src);
		if (dev_out == NULL)
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with the saddr
		      of another iface. --ANK
		 */

		if (oldflp->oif == 0
		    && (ipv4_is_multicast(oldflp->fl4_dst) ||
			oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
			/* Special hack: user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are broken,
			   because we are not allowed to build a multicast path
			   with a loopback source address (look, the routing cache
			   cannot know that ttl is zero, so that the packet
			   will not leave this host and the route is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		dev_out = NULL;
	}

	if (oldflp->oif) {
		dev_out = dev_get_by_index(net, oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (__in_dev_get_rtnl(dev_out) == NULL) {
			dev_put(dev_out);
			goto out;	/* Wrong error code */
		}

		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (ipv4_is_multicast(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		if (dev_out)
			dev_put(dev_out);
		dev_out = net->loopback_dev;
		dev_hold(dev_out);
		fl.oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, &fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, routing tables are wrong. Assume
			   that the destination is on-link.

			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if the destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send a packet, ignoring both routing tables
			   and ifaddr state. --ANK

			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		err = -ENETUNREACH;
		goto out;
	}
	free_res = 1;

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src)
			fl.fl4_src = fl.fl4_dst;
		if (dev_out)
			dev_put(dev_out);
		dev_out = net->loopback_dev;
		dev_hold(dev_out);
		fl.oif = dev_out->ifindex;
		if (res.fi)
			fib_info_put(res.fi);
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(net, &fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	if (dev_out)
		dev_put(dev_out);
	dev_out = FIB_RES_DEV(res);
	dev_hold(dev_out);
	fl.oif = dev_out->ifindex;

make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);

	if (free_res)
		fib_res_put(&res);
	if (dev_out)
		dev_put(dev_out);
out:	return err;
}
int __ip_route_output_key(struct net *net, struct rtable **rp,
			  const struct flowi *flp)
{
	unsigned hash;
	struct rtable *rth;

	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);

	rcu_read_lock_bh();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rth->fl.iif == 0 &&
		    rth->fl.oif == flp->oif &&
		    rth->fl.mark == flp->mark &&
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->u.dst.dev), net) &&
		    rth->rt_genid == atomic_read(&rt_genid)) {
			dst_use(&rth->u.dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

	return ip_route_output_slow(net, rp, flp);
}

EXPORT_SYMBOL_GPL(__ip_route_output_key);
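/*
 * Usage sketch (editorial example, not from the original file):
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dip } } };
 *	struct rtable *rt;
 *
 *	if (__ip_route_output_key(&init_net, &rt, &fl) == 0) {
 *		...use rt->u.dst, rt->rt_gateway...
 *		ip_rt_put(rt);
 *	}
 */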
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	__constant_htons(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_dst_check,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.entry_size		=	sizeof(struct rtable),
	.entries		=	ATOMIC_INIT(0),
};
static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->u.dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));

		new->dev = ort->u.dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		/* Copy the flow key and cached route state from the
		 * original route so the blackhole looks like it. */
		rt->fl = ort->fl;

		rt->idev = ort->idev;
		if (rt->idev)
			in_dev_hold(rt->idev);
		rt->rt_genid = atomic_read(&rt_genid);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);

		dst_free(new);
	}

	dst_release(&(*rp)->u.dst);
	*rp = rt;
	return (rt ? 0 : -ENOMEM);
}
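
/*
 * ip_route_output_flow() is the main entry point for socket-level output
 * routing: after the plain lookup it passes the result through
 * __xfrm_lookup() when a transport protocol is set, so IPsec policy can
 * replace the dst (or, for non-blocking callers, hand back a blackhole).
 */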
int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
			 struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
		return err;

	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
				    flags ? XFRM_LOOKUP_WAIT : 0);
		if (err == -EREMOTE)
			err = ipv4_dst_blackhole(rp, flp);
		return err;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(ip_route_output_flow);
int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}
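
/*
 * Illustrative sketch (not part of the original file): how a caller
 * typically resolves an output route with the API above. The function
 * name and surrounding context are hypothetical; only the route API
 * calls are real. Kept under #if 0 so it is never compiled.
 */
#if 0
static int example_output_lookup(struct net *net, __be32 daddr, __be32 saddr)
{
	struct flowi fl = {
		.nl_u = {
			.ip4_u = {
				.daddr = daddr,
				.saddr = saddr,
			},
		},
	};
	struct rtable *rt;
	int err;

	/* Consults the route cache first, then the FIB via the slow path. */
	err = ip_route_output_key(net, &rt, &fl);
	if (err)
		return err;	/* e.g. -ENETUNREACH */

	/* ... attach &rt->u.dst to an skb, or read rt->rt_gateway ... */

	ip_rt_put(rt);		/* drop the reference taken by the lookup */
	return 0;
}
#endif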
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb->rtable;
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;
	r->rtm_src_len = 0;
	r->rtm_tos = rt->fl.fl4_tos;
	r->rtm_table = RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
#endif
	if (rt->fl.iif)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	error = rt->u.dst.error;
	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
	if (rt->peer) {
		id = rt->peer->ip_id_count;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
			int err = ipmr_get_route(skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
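
/*
 * RTM_GETROUTE handler: resolve a single route on behalf of userspace.
 * An input route (RTA_IIF present) is resolved by faking just enough of
 * an incoming ICMP packet for ip_route_input(); an output route goes
 * through ip_route_output_key(). The result is marshalled back with
 * rt_fill_info() and unicast to the requesting socket.
 */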
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol = htons(ETH_P_IP);
		skb->dev = dev;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb->rtable;
		if (err == 0 && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		struct flowi fl = {
			.nl_u = {
				.ip4_u = {
					.daddr = dst,
					.saddr = src,
					.tos = rtm->rtm_tos,
				},
			},
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
		};
		err = ip_route_output_key(net, &rt, &fl);
	}

	if (err)
		goto errout_free;

	skb->rtable = rt;
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
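
/*
 * Userspace reaches the handler above via "ip route get"; a rough
 * (hypothetical) session, with addresses from the documentation range:
 *
 *	$ ip route get 192.0.2.1
 *	192.0.2.1 via 198.51.100.1 dev eth0  src 198.51.100.23  cache
 *
 * The reply printed by iproute2 is the RTM_NEWROUTE message built by
 * rt_fill_info() above.
 */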
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++) {
		rcu_read_lock_bh();
		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
				continue;
			if (rt->rt_genid != atomic_read(&rt_genid))
				continue;
			skb->dst = dst_clone(&rt->u.dst);
			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				dst_release(xchg(&skb->dst, NULL));
				rcu_read_unlock_bh();
				goto done;
			}
			dst_release(xchg(&skb->dst, NULL));
		}
		rcu_read_unlock_bh();
	}

done:
	/* Remember where to resume if the dump filled this skb. */
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
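
/*
 * sysctl plumbing for /proc/sys/net/ipv4/route/. The "flush" entry is
 * special: writing to it triggers rt_cache_flush() with the written
 * value as the flush delay, via the two handlers below.
 */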
#ifdef CONFIG_SYSCTL
static int flush_delay;

static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
					struct file *filp, void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
		rt_cache_flush(flush_delay);
		return 0;
	}

	return -EINVAL;
}

static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
						int __user *name,
						int nlen,
						void __user *oldval,
						size_t __user *oldlenp,
						void __user *newval,
						size_t newlen)
{
	int delay;
	if (newlen != sizeof(int))
		return -EINVAL;
	if (get_user(delay, (int __user *)newval))
		return -EFAULT;
	rt_cache_flush(delay);
	return 0;
}
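
/*
 * Illustrative usage from userspace (the written value is the flush
 * delay; 0 flushes at once, positive values schedule a delayed flush):
 *
 *	# echo 0 > /proc/sys/net/ipv4/route/flush
 */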
ctl_table ipv4_route_table[] = {
	{
		.ctl_name	= NET_IPV4_ROUTE_FLUSH,
		.procname	= "flush",
		.data		= &flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= &ipv4_sysctl_rtcache_flush,
		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_ms_jiffies,
		.strategy	= &sysctl_ms_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
		.procname	= "secret_interval",
		.data		= &ip_rt_secret_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{ .ctl_name = 0 }
};
#endif
#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
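
/*
 * "rhash_entries=N" on the kernel command line overrides the size of
 * the route cache hash table that ip_rt_init() below would otherwise
 * derive from available memory (for example, rhash_entries=262144 on
 * a box acting as a busy router).
 */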
int __init ip_rt_init(void)
{
	int rc = 0;

	/* Seed the generation ID so stale cache entries can be detected. */
	atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
			     (jiffies ^ (jiffies >> 7))));

#ifdef CONFIG_NET_CLS_ROUTE
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(num_physpages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					0);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);

	/* All the timers, started at system startup, tend
	   to synchronize. Perturb them a bit.
	 */
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
		ip_rt_secret_interval;
	add_timer(&rt_secret_timer);

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

	return rc;
}
EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);