1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111
112 #define RT_FL_TOS(oldflp) \
113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114
115 #define IP_MAX_MTU      0xFFF0
116
117 #define RT_GC_TIMEOUT (300*HZ)
118
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
123 static int ip_rt_redirect_number __read_mostly  = 9;
124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly       = HZ;
127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly    = 8;
129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly       = 256;
132 static int rt_chain_length_max __read_mostly    = 20;
133
134 static struct delayed_work expires_work;
135 static unsigned long expires_ljiffies;
136
137 /*
138  *      Interface to generic destination cache.
139  */
140
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static void              ipv4_dst_destroy(struct dst_entry *dst);
143 static void              ipv4_dst_ifdown(struct dst_entry *dst,
144                                          struct net_device *dev, int how);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void              ipv4_link_failure(struct sk_buff *skb);
147 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
149
150
151 static struct dst_ops ipv4_dst_ops = {
152         .family =               AF_INET,
153         .protocol =             cpu_to_be16(ETH_P_IP),
154         .gc =                   rt_garbage_collect,
155         .check =                ipv4_dst_check,
156         .destroy =              ipv4_dst_destroy,
157         .ifdown =               ipv4_dst_ifdown,
158         .negative_advice =      ipv4_negative_advice,
159         .link_failure =         ipv4_link_failure,
160         .update_pmtu =          ip_rt_update_pmtu,
161         .local_out =            __ip_local_out,
162 };
163
164 #define ECN_OR_COST(class)      TC_PRIO_##class
165
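/*
 * ip_tos2prio maps the IP TOS field to a pkt_sched priority band.  Callers
 * are expected to index it with the TOS bits shifted right by one (see
 * rt_tos2priority() in <net/route.h>), so each pair of slots holds the
 * plain and the ECN_OR_COST variant of a TOS class.
 */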
166 const __u8 ip_tos2prio[16] = {
167         TC_PRIO_BESTEFFORT,
168         ECN_OR_COST(FILLER),
169         TC_PRIO_BESTEFFORT,
170         ECN_OR_COST(BESTEFFORT),
171         TC_PRIO_BULK,
172         ECN_OR_COST(BULK),
173         TC_PRIO_BULK,
174         ECN_OR_COST(BULK),
175         TC_PRIO_INTERACTIVE,
176         ECN_OR_COST(INTERACTIVE),
177         TC_PRIO_INTERACTIVE,
178         ECN_OR_COST(INTERACTIVE),
179         TC_PRIO_INTERACTIVE_BULK,
180         ECN_OR_COST(INTERACTIVE_BULK),
181         TC_PRIO_INTERACTIVE_BULK,
182         ECN_OR_COST(INTERACTIVE_BULK)
183 };
184
185
186 /*
187  * Route cache.
188  */
189
190 /* The locking scheme is rather straightforward:
191  *
192  * 1) Read-Copy Update protects the buckets of the central route hash.
193  * 2) Only writers remove entries, and they hold the lock
194  *    as they look at rtable reference counts.
195  * 3) Only readers acquire references to rtable entries;
196  *    they do so with atomic increments and with the
197  *    lock held.
198  */
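/*
 * In this file that translates to: readers walk a chain under
 * rcu_read_lock_bh() using rcu_dereference_bh(); writers take the
 * per-bucket spinlock from rt_hash_locks (rt_hash_lock_addr(), on SMP)
 * while unlinking entries, and unlinked entries are handed to
 * call_rcu_bh() via rt_free() or rt_drop().
 */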
199
200 struct rt_hash_bucket {
201         struct rtable   *chain;
202 };
203
204 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
205         defined(CONFIG_PROVE_LOCKING)
206 /*
207  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
208  * The size of this table is a power of two and depends on the number of CPUs.
209  * (with lockdep, spinlock_t is quite big, so keep the size down there)
210  */
211 #ifdef CONFIG_LOCKDEP
212 # define RT_HASH_LOCK_SZ        256
213 #else
214 # if NR_CPUS >= 32
215 #  define RT_HASH_LOCK_SZ       4096
216 # elif NR_CPUS >= 16
217 #  define RT_HASH_LOCK_SZ       2048
218 # elif NR_CPUS >= 8
219 #  define RT_HASH_LOCK_SZ       1024
220 # elif NR_CPUS >= 4
221 #  define RT_HASH_LOCK_SZ       512
222 # else
223 #  define RT_HASH_LOCK_SZ       256
224 # endif
225 #endif
226
227 static spinlock_t       *rt_hash_locks;
228 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
229
230 static __init void rt_hash_lock_init(void)
231 {
232         int i;
233
234         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
235                         GFP_KERNEL);
236         if (!rt_hash_locks)
237                 panic("IP: failed to allocate rt_hash_locks\n");
238
239         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
240                 spin_lock_init(&rt_hash_locks[i]);
241 }
242 #else
243 # define rt_hash_lock_addr(slot) NULL
244
245 static inline void rt_hash_lock_init(void)
246 {
247 }
248 #endif
249
250 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
251 static unsigned                 rt_hash_mask __read_mostly;
252 static unsigned int             rt_hash_log  __read_mostly;
253
254 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
255 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
256
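/*
 * The per-namespace generation id (rt_genid) is folded into the hash so
 * that bumping it in rt_cache_invalidate() immediately steers new lookups
 * away from chains holding stale entries; the stale entries themselves are
 * reaped lazily once rt_is_expired() notices the genid mismatch.
 */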
257 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
258                                    int genid)
259 {
260         return jhash_3words((__force u32)daddr, (__force u32)saddr,
261                             idx, genid)
262                 & rt_hash_mask;
263 }
264
265 static inline int rt_genid(struct net *net)
266 {
267         return atomic_read(&net->ipv4.rt_genid);
268 }
269
270 #ifdef CONFIG_PROC_FS
271 struct rt_cache_iter_state {
272         struct seq_net_private p;
273         int bucket;
274         int genid;
275 };
276
277 static struct rtable *rt_cache_get_first(struct seq_file *seq)
278 {
279         struct rt_cache_iter_state *st = seq->private;
280         struct rtable *r = NULL;
281
282         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
283                 if (!rt_hash_table[st->bucket].chain)
284                         continue;
285                 rcu_read_lock_bh();
286                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
287                 while (r) {
288                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
289                             r->rt_genid == st->genid)
290                                 return r;
291                         r = rcu_dereference_bh(r->dst.rt_next);
292                 }
293                 rcu_read_unlock_bh();
294         }
295         return r;
296 }
297
298 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
299                                           struct rtable *r)
300 {
301         struct rt_cache_iter_state *st = seq->private;
302
303         r = r->dst.rt_next;
304         while (!r) {
305                 rcu_read_unlock_bh();
306                 do {
307                         if (--st->bucket < 0)
308                                 return NULL;
309                 } while (!rt_hash_table[st->bucket].chain);
310                 rcu_read_lock_bh();
311                 r = rt_hash_table[st->bucket].chain;
312         }
313         return rcu_dereference_bh(r);
314 }
315
316 static struct rtable *rt_cache_get_next(struct seq_file *seq,
317                                         struct rtable *r)
318 {
319         struct rt_cache_iter_state *st = seq->private;
320         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
321                 if (dev_net(r->dst.dev) != seq_file_net(seq))
322                         continue;
323                 if (r->rt_genid == st->genid)
324                         break;
325         }
326         return r;
327 }
328
329 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
330 {
331         struct rtable *r = rt_cache_get_first(seq);
332
333         if (r)
334                 while (pos && (r = rt_cache_get_next(seq, r)))
335                         --pos;
336         return pos ? NULL : r;
337 }
338
339 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
340 {
341         struct rt_cache_iter_state *st = seq->private;
342         if (*pos)
343                 return rt_cache_get_idx(seq, *pos - 1);
344         st->genid = rt_genid(seq_file_net(seq));
345         return SEQ_START_TOKEN;
346 }
347
348 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
349 {
350         struct rtable *r;
351
352         if (v == SEQ_START_TOKEN)
353                 r = rt_cache_get_first(seq);
354         else
355                 r = rt_cache_get_next(seq, v);
356         ++*pos;
357         return r;
358 }
359
360 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
361 {
362         if (v && v != SEQ_START_TOKEN)
363                 rcu_read_unlock_bh();
364 }
365
366 static int rt_cache_seq_show(struct seq_file *seq, void *v)
367 {
368         if (v == SEQ_START_TOKEN)
369                 seq_printf(seq, "%-127s\n",
370                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
371                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
372                            "HHUptod\tSpecDst");
373         else {
374                 struct rtable *r = v;
375                 int len;
376
377                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
378                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
379                         r->dst.dev ? r->dst.dev->name : "*",
380                         (__force u32)r->rt_dst,
381                         (__force u32)r->rt_gateway,
382                         r->rt_flags, atomic_read(&r->dst.__refcnt),
383                         r->dst.__use, 0, (__force u32)r->rt_src,
384                         (dst_metric(&r->dst, RTAX_ADVMSS) ?
385                              (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0),
386                         dst_metric(&r->dst, RTAX_WINDOW),
387                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
388                               dst_metric(&r->dst, RTAX_RTTVAR)),
389                         r->fl.fl4_tos,
390                         r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
391                         r->dst.hh ? (r->dst.hh->hh_output ==
392                                        dev_queue_xmit) : 0,
393                         r->rt_spec_dst, &len);
394
395                 seq_printf(seq, "%*s\n", 127 - len, "");
396         }
397         return 0;
398 }
399
400 static const struct seq_operations rt_cache_seq_ops = {
401         .start  = rt_cache_seq_start,
402         .next   = rt_cache_seq_next,
403         .stop   = rt_cache_seq_stop,
404         .show   = rt_cache_seq_show,
405 };
406
407 static int rt_cache_seq_open(struct inode *inode, struct file *file)
408 {
409         return seq_open_net(inode, file, &rt_cache_seq_ops,
410                         sizeof(struct rt_cache_iter_state));
411 }
412
413 static const struct file_operations rt_cache_seq_fops = {
414         .owner   = THIS_MODULE,
415         .open    = rt_cache_seq_open,
416         .read    = seq_read,
417         .llseek  = seq_lseek,
418         .release = seq_release_net,
419 };
420
421
422 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
423 {
424         int cpu;
425
426         if (*pos == 0)
427                 return SEQ_START_TOKEN;
428
429         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
430                 if (!cpu_possible(cpu))
431                         continue;
432                 *pos = cpu+1;
433                 return &per_cpu(rt_cache_stat, cpu);
434         }
435         return NULL;
436 }
437
438 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
439 {
440         int cpu;
441
442         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
443                 if (!cpu_possible(cpu))
444                         continue;
445                 *pos = cpu+1;
446                 return &per_cpu(rt_cache_stat, cpu);
447         }
448         return NULL;
449
450 }
451
452 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
453 {
454
455 }
456
457 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
458 {
459         struct rt_cache_stat *st = v;
460
461         if (v == SEQ_START_TOKEN) {
462                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
463                 return 0;
464         }
465
466         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
467                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
468                    dst_entries_get_slow(&ipv4_dst_ops),
469                    st->in_hit,
470                    st->in_slow_tot,
471                    st->in_slow_mc,
472                    st->in_no_route,
473                    st->in_brd,
474                    st->in_martian_dst,
475                    st->in_martian_src,
476
477                    st->out_hit,
478                    st->out_slow_tot,
479                    st->out_slow_mc,
480
481                    st->gc_total,
482                    st->gc_ignored,
483                    st->gc_goal_miss,
484                    st->gc_dst_overflow,
485                    st->in_hlist_search,
486                    st->out_hlist_search
487                 );
488         return 0;
489 }
490
491 static const struct seq_operations rt_cpu_seq_ops = {
492         .start  = rt_cpu_seq_start,
493         .next   = rt_cpu_seq_next,
494         .stop   = rt_cpu_seq_stop,
495         .show   = rt_cpu_seq_show,
496 };
497
498
499 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
500 {
501         return seq_open(file, &rt_cpu_seq_ops);
502 }
503
504 static const struct file_operations rt_cpu_seq_fops = {
505         .owner   = THIS_MODULE,
506         .open    = rt_cpu_seq_open,
507         .read    = seq_read,
508         .llseek  = seq_lseek,
509         .release = seq_release,
510 };
511
512 #ifdef CONFIG_NET_CLS_ROUTE
513 static int rt_acct_proc_show(struct seq_file *m, void *v)
514 {
515         struct ip_rt_acct *dst, *src;
516         unsigned int i, j;
517
518         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
519         if (!dst)
520                 return -ENOMEM;
521
522         for_each_possible_cpu(i) {
523                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
524                 for (j = 0; j < 256; j++) {
525                         dst[j].o_bytes   += src[j].o_bytes;
526                         dst[j].o_packets += src[j].o_packets;
527                         dst[j].i_bytes   += src[j].i_bytes;
528                         dst[j].i_packets += src[j].i_packets;
529                 }
530         }
531
532         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
533         kfree(dst);
534         return 0;
535 }
536
537 static int rt_acct_proc_open(struct inode *inode, struct file *file)
538 {
539         return single_open(file, rt_acct_proc_show, NULL);
540 }
541
542 static const struct file_operations rt_acct_proc_fops = {
543         .owner          = THIS_MODULE,
544         .open           = rt_acct_proc_open,
545         .read           = seq_read,
546         .llseek         = seq_lseek,
547         .release        = single_release,
548 };
549 #endif
550
551 static int __net_init ip_rt_do_proc_init(struct net *net)
552 {
553         struct proc_dir_entry *pde;
554
555         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
556                         &rt_cache_seq_fops);
557         if (!pde)
558                 goto err1;
559
560         pde = proc_create("rt_cache", S_IRUGO,
561                           net->proc_net_stat, &rt_cpu_seq_fops);
562         if (!pde)
563                 goto err2;
564
565 #ifdef CONFIG_NET_CLS_ROUTE
566         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
567         if (!pde)
568                 goto err3;
569 #endif
570         return 0;
571
572 #ifdef CONFIG_NET_CLS_ROUTE
573 err3:
574         remove_proc_entry("rt_cache", net->proc_net_stat);
575 #endif
576 err2:
577         remove_proc_entry("rt_cache", net->proc_net);
578 err1:
579         return -ENOMEM;
580 }
581
582 static void __net_exit ip_rt_do_proc_exit(struct net *net)
583 {
584         remove_proc_entry("rt_cache", net->proc_net_stat);
585         remove_proc_entry("rt_cache", net->proc_net);
586 #ifdef CONFIG_NET_CLS_ROUTE
587         remove_proc_entry("rt_acct", net->proc_net);
588 #endif
589 }
590
591 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
592         .init = ip_rt_do_proc_init,
593         .exit = ip_rt_do_proc_exit,
594 };
595
596 static int __init ip_rt_proc_init(void)
597 {
598         return register_pernet_subsys(&ip_rt_proc_ops);
599 }
600
601 #else
602 static inline int ip_rt_proc_init(void)
603 {
604         return 0;
605 }
606 #endif /* CONFIG_PROC_FS */
607
608 static inline void rt_free(struct rtable *rt)
609 {
610         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
611 }
612
613 static inline void rt_drop(struct rtable *rt)
614 {
615         ip_rt_put(rt);
616         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
617 }
618
619 static inline int rt_fast_clean(struct rtable *rth)
620 {
621         /* Kill broadcast/multicast entries very aggressively if they
622            collide in the hash table with more useful entries */
623         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
624                 rth->fl.iif && rth->dst.rt_next;
625 }
626
627 static inline int rt_valuable(struct rtable *rth)
628 {
629         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
630                 rth->dst.expires;
631 }
632
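/*
 * Roughly: a referenced entry never expires here.  An unreferenced entry
 * expires once its hard dst.expires deadline has passed, or once its age
 * exceeds tmo1 (ordinary entries) or tmo2 (entries rt_valuable() wants to
 * keep); colliding broadcast/multicast entries (rt_fast_clean()) get no
 * tmo1 grace at all.
 */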
633 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
634 {
635         unsigned long age;
636         int ret = 0;
637
638         if (atomic_read(&rth->dst.__refcnt))
639                 goto out;
640
641         ret = 1;
642         if (rth->dst.expires &&
643             time_after_eq(jiffies, rth->dst.expires))
644                 goto out;
645
646         age = jiffies - rth->dst.lastuse;
647         ret = 0;
648         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
649             (age <= tmo2 && rt_valuable(rth)))
650                 goto out;
651         ret = 1;
652 out:    return ret;
653 }
654
655 /* Bits of score are:
656  * 31: very valuable
657  * 30: not quite useless
658  * 29..0: usage counter
659  */
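/*
 * A higher score means an entry is more worth keeping: when a chain grows
 * too long, rt_intern_hash() picks the unreferenced entry with the lowest
 * score as its eviction candidate.
 */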
660 static inline u32 rt_score(struct rtable *rt)
661 {
662         u32 score = jiffies - rt->dst.lastuse;
663
664         score = ~score & ~(3<<30);
665
666         if (rt_valuable(rt))
667                 score |= (1<<31);
668
669         if (!rt->fl.iif ||
670             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
671                 score |= (1<<30);
672
673         return score;
674 }
675
676 static inline bool rt_caching(const struct net *net)
677 {
678         return net->ipv4.current_rt_cache_rebuild_count <=
679                 net->ipv4.sysctl_rt_cache_rebuild_count;
680 }
681
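/*
 * The comparison helpers below XOR the corresponding fields of two flows
 * and OR the results together: the outcome is zero if and only if every
 * field matches, giving a branchless equality test on the lookup fast path.
 */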
682 static inline bool compare_hash_inputs(const struct flowi *fl1,
683                                         const struct flowi *fl2)
684 {
685         return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
686                 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
687                 (fl1->iif ^ fl2->iif)) == 0);
688 }
689
690 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
691 {
692         return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
693                 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
694                 (fl1->mark ^ fl2->mark) |
695                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
696                 (fl1->oif ^ fl2->oif) |
697                 (fl1->iif ^ fl2->iif)) == 0;
698 }
699
700 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
701 {
702         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
703 }
704
705 static inline int rt_is_expired(struct rtable *rth)
706 {
707         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
708 }
709
710 /*
711  * Perform a full scan of the hash table and free all entries.
712  * Can be called by a softirq or a process.
713  * In the latter case, we want to be rescheduled if necessary.
714  */
715 static void rt_do_flush(int process_context)
716 {
717         unsigned int i;
718         struct rtable *rth, *next;
719         struct rtable * tail;
720
721         for (i = 0; i <= rt_hash_mask; i++) {
722                 if (process_context && need_resched())
723                         cond_resched();
724                 rth = rt_hash_table[i].chain;
725                 if (!rth)
726                         continue;
727
728                 spin_lock_bh(rt_hash_lock_addr(i));
729 #ifdef CONFIG_NET_NS
730                 {
731                 struct rtable ** prev, * p;
732
733                 rth = rt_hash_table[i].chain;
734
735                 /* defer releasing the head of the list until after spin_unlock */
736                 for (tail = rth; tail; tail = tail->dst.rt_next)
737                         if (!rt_is_expired(tail))
738                                 break;
739                 if (rth != tail)
740                         rt_hash_table[i].chain = tail;
741
742                 /* call rt_free on entries after the tail requiring flush */
743                 prev = &rt_hash_table[i].chain;
744                 for (p = *prev; p; p = next) {
745                         next = p->dst.rt_next;
746                         if (!rt_is_expired(p)) {
747                                 prev = &p->dst.rt_next;
748                         } else {
749                                 *prev = next;
750                                 rt_free(p);
751                         }
752                 }
753                 }
754 #else
755                 rth = rt_hash_table[i].chain;
756                 rt_hash_table[i].chain = NULL;
757                 tail = NULL;
758 #endif
759                 spin_unlock_bh(rt_hash_lock_addr(i));
760
761                 for (; rth != tail; rth = next) {
762                         next = rth->dst.rt_next;
763                         rt_free(rth);
764                 }
765         }
766 }
767
768 /*
769  * While freeing expired entries, we compute the average chain length
770  * and its standard deviation, using fixed-point arithmetic.
771  * This gives us an estimate for rt_chain_length_max:
772  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
773  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
774  */
775
776 #define FRACT_BITS 3
777 #define ONE (1UL << FRACT_BITS)
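/*
 * Illustration with made-up numbers: has_noalias() contributes ONE (8) per
 * distinct set of hash inputs, so a per-bucket average of avg = 40 and a
 * standard deviation of sd = 16 correspond to 5 and 2 entries respectively;
 * rt_check_expire() would then compute
 *   (avg + 4*sd) >> FRACT_BITS = (40 + 64) >> 3 = 13
 * and set rt_chain_length_max = max(ip_rt_gc_elasticity, 13) = 13 with the
 * default elasticity of 8.
 */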
778
779 /*
780  * Given a hash chain and an item in this hash chain,
781  * determine whether a previous entry has the same hash inputs
782  * (but differs on tos, mark or oif).
783  * Returns 0 if an alias is found.
784  * Returns ONE if rth has no alias before itself.
785  */
786 static int has_noalias(const struct rtable *head, const struct rtable *rth)
787 {
788         const struct rtable *aux = head;
789
790         while (aux != rth) {
791                 if (compare_hash_inputs(&aux->fl, &rth->fl))
792                         return 0;
793                 aux = aux->dst.rt_next;
794         }
795         return ONE;
796 }
797
798 static void rt_check_expire(void)
799 {
800         static unsigned int rover;
801         unsigned int i = rover, goal;
802         struct rtable *rth, **rthp;
803         unsigned long samples = 0;
804         unsigned long sum = 0, sum2 = 0;
805         unsigned long delta;
806         u64 mult;
807
808         delta = jiffies - expires_ljiffies;
809         expires_ljiffies = jiffies;
810         mult = ((u64)delta) << rt_hash_log;
811         if (ip_rt_gc_timeout > 1)
812                 do_div(mult, ip_rt_gc_timeout);
813         goal = (unsigned int)mult;
814         if (goal > rt_hash_mask)
815                 goal = rt_hash_mask + 1;
816         for (; goal > 0; goal--) {
817                 unsigned long tmo = ip_rt_gc_timeout;
818                 unsigned long length;
819
820                 i = (i + 1) & rt_hash_mask;
821                 rthp = &rt_hash_table[i].chain;
822
823                 if (need_resched())
824                         cond_resched();
825
826                 samples++;
827
828                 if (*rthp == NULL)
829                         continue;
830                 length = 0;
831                 spin_lock_bh(rt_hash_lock_addr(i));
832                 while ((rth = *rthp) != NULL) {
833                         prefetch(rth->dst.rt_next);
834                         if (rt_is_expired(rth)) {
835                                 *rthp = rth->dst.rt_next;
836                                 rt_free(rth);
837                                 continue;
838                         }
839                         if (rth->dst.expires) {
840                                 /* Entry is expired even if it is in use */
841                                 if (time_before_eq(jiffies, rth->dst.expires)) {
842 nofree:
843                                         tmo >>= 1;
844                                         rthp = &rth->dst.rt_next;
845                                         /*
846                                          * We only count entries on
847                                          * a chain with equal hash inputs once,
848                                          * so that entries for different QoS
849                                          * levels and other non-hash-input
850                                          * attributes don't unfairly skew
851                                          * the length computation.
852                                          */
853                                         length += has_noalias(rt_hash_table[i].chain, rth);
854                                         continue;
855                                 }
856                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
857                                 goto nofree;
858
859                         /* Cleanup aged off entries. */
860                         *rthp = rth->dst.rt_next;
861                         rt_free(rth);
862                 }
863                 spin_unlock_bh(rt_hash_lock_addr(i));
864                 sum += length;
865                 sum2 += length*length;
866         }
867         if (samples) {
868                 unsigned long avg = sum / samples;
869                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
870                 rt_chain_length_max = max_t(unsigned long,
871                                         ip_rt_gc_elasticity,
872                                         (avg + 4*sd) >> FRACT_BITS);
873         }
874         rover = i;
875 }
876
877 /*
878  * rt_worker_func() is run in process context.
879  * We call rt_check_expire() to scan part of the hash table.
880  */
881 static void rt_worker_func(struct work_struct *work)
882 {
883         rt_check_expire();
884         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
885 }
886
887 /*
888  * Perturbation of rt_genid by a small quantity [1..256].
889  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
890  * many times (2^24) without repeating a recent rt_genid.
891  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
892  */
893 static void rt_cache_invalidate(struct net *net)
894 {
895         unsigned char shuffle;
896
897         get_random_bytes(&shuffle, sizeof(shuffle));
898         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
899 }
900
901 /*
902  * delay < 0  : invalidate cache (fast : entries will be deleted later)
903  * delay >= 0 : invalidate & flush cache (can be long)
904  */
905 void rt_cache_flush(struct net *net, int delay)
906 {
907         rt_cache_invalidate(net);
908         if (delay >= 0)
909                 rt_do_flush(!in_softirq());
910 }
911
912 /* Flush previously invalidated entries from the cache */
913 void rt_cache_flush_batch(void)
914 {
915         rt_do_flush(!in_softirq());
916 }
917
918 static void rt_emergency_hash_rebuild(struct net *net)
919 {
920         if (net_ratelimit())
921                 printk(KERN_WARNING "Route hash chain too long!\n");
922         rt_cache_invalidate(net);
923 }
924
925 /*
926    Short description of GC goals.
927
928    We want to build an algorithm which will keep the routing cache
929    at some equilibrium point, where the number of aged-off entries
930    is kept approximately equal to the number of newly generated ones.
931
932    The current expiration strength is the variable "expire".
933    We try to adjust it dynamically, so that when the network
934    is idle "expire" is large enough to keep plenty of warm entries,
935    and when load increases it shrinks to limit the cache size.
936  */
937
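/*
 * Illustration with made-up numbers: with 2^16 hash buckets
 * (rt_hash_log = 16) and the default ip_rt_gc_elasticity of 8, the soft
 * ceiling used below is 8 << 16 = 524288 cached entries.  Under that limit
 * the collector only nudges the cache toward its gc_thresh-based
 * equilibrium; above it, goal goes positive and the aggressive branch
 * tries to shed at least half of the excess in a single run.
 */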
938 static int rt_garbage_collect(struct dst_ops *ops)
939 {
940         static unsigned long expire = RT_GC_TIMEOUT;
941         static unsigned long last_gc;
942         static int rover;
943         static int equilibrium;
944         struct rtable *rth, **rthp;
945         unsigned long now = jiffies;
946         int goal;
947         int entries = dst_entries_get_fast(&ipv4_dst_ops);
948
949         /*
950          * Garbage collection is pretty expensive,
951          * do not run it too frequently.
952          */
953
954         RT_CACHE_STAT_INC(gc_total);
955
956         if (now - last_gc < ip_rt_gc_min_interval &&
957             entries < ip_rt_max_size) {
958                 RT_CACHE_STAT_INC(gc_ignored);
959                 goto out;
960         }
961
962         entries = dst_entries_get_slow(&ipv4_dst_ops);
963         /* Calculate the number of entries which we want to expire now. */
964         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
965         if (goal <= 0) {
966                 if (equilibrium < ipv4_dst_ops.gc_thresh)
967                         equilibrium = ipv4_dst_ops.gc_thresh;
968                 goal = entries - equilibrium;
969                 if (goal > 0) {
970                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
971                         goal = entries - equilibrium;
972                 }
973         } else {
974                 /* We are in a dangerous area. Try to reduce the cache really
975                  * aggressively.
976                  */
977                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
978                 equilibrium = entries - goal;
979         }
980
981         if (now - last_gc >= ip_rt_gc_min_interval)
982                 last_gc = now;
983
984         if (goal <= 0) {
985                 equilibrium += goal;
986                 goto work_done;
987         }
988
989         do {
990                 int i, k;
991
992                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
993                         unsigned long tmo = expire;
994
995                         k = (k + 1) & rt_hash_mask;
996                         rthp = &rt_hash_table[k].chain;
997                         spin_lock_bh(rt_hash_lock_addr(k));
998                         while ((rth = *rthp) != NULL) {
999                                 if (!rt_is_expired(rth) &&
1000                                         !rt_may_expire(rth, tmo, expire)) {
1001                                         tmo >>= 1;
1002                                         rthp = &rth->dst.rt_next;
1003                                         continue;
1004                                 }
1005                                 *rthp = rth->dst.rt_next;
1006                                 rt_free(rth);
1007                                 goal--;
1008                         }
1009                         spin_unlock_bh(rt_hash_lock_addr(k));
1010                         if (goal <= 0)
1011                                 break;
1012                 }
1013                 rover = k;
1014
1015                 if (goal <= 0)
1016                         goto work_done;
1017
1018                 /* Goal is not achieved. We stop the process if:
1019
1020                    - expire has been reduced to zero; otherwise, expire is halved.
1021                    - the table is not full.
1022                    - we are called from interrupt context.
1023                    - the jiffies check is just a fallback/debug loop breaker.
1024                      We will not spin here for a long time in any case.
1025                  */
1026
1027                 RT_CACHE_STAT_INC(gc_goal_miss);
1028
1029                 if (expire == 0)
1030                         break;
1031
1032                 expire >>= 1;
1033 #if RT_CACHE_DEBUG >= 2
1034                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1035                                 dst_entries_get_fast(&ipv4_dst_ops), goal, i);
1036 #endif
1037
1038                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1039                         goto out;
1040         } while (!in_softirq() && time_before_eq(jiffies, now));
1041
1042         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1043                 goto out;
1044         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1045                 goto out;
1046         if (net_ratelimit())
1047                 printk(KERN_WARNING "dst cache overflow\n");
1048         RT_CACHE_STAT_INC(gc_dst_overflow);
1049         return 1;
1050
1051 work_done:
1052         expire += ip_rt_gc_min_interval;
1053         if (expire > ip_rt_gc_timeout ||
1054             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1055             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1056                 expire = ip_rt_gc_timeout;
1057 #if RT_CACHE_DEBUG >= 2
1058         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1059                         dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
1060 #endif
1061 out:    return 0;
1062 }
1063
1064 /*
1065  * Returns the number of entries in a hash chain that have distinct hash inputs.
1066  */
1067 static int slow_chain_length(const struct rtable *head)
1068 {
1069         int length = 0;
1070         const struct rtable *rth = head;
1071
1072         while (rth) {
1073                 length += has_noalias(head, rth);
1074                 rth = rth->dst.rt_next;
1075         }
1076         return length >> FRACT_BITS;
1077 }
1078
1079 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1080                           struct rtable **rp, struct sk_buff *skb, int ifindex)
1081 {
1082         struct rtable   *rth, **rthp;
1083         unsigned long   now;
1084         struct rtable *cand, **candp;
1085         u32             min_score;
1086         int             chain_length;
1087         int attempts = !in_softirq();
1088
1089 restart:
1090         chain_length = 0;
1091         min_score = ~(u32)0;
1092         cand = NULL;
1093         candp = NULL;
1094         now = jiffies;
1095
1096         if (!rt_caching(dev_net(rt->dst.dev))) {
1097                 /*
1098                  * If we're not caching, just tell the caller we
1099                  * were successful and don't touch the route.  The
1100                  * caller holds the sole reference to the cache entry, and
1101                  * it will be released when the caller is done with it.
1102                  * If we drop it here, the callers have no way to resolve routes
1103                  * when we're not caching.  Instead, just point *rp at rt, so
1104                  * the caller gets a single use out of the route.
1105                  * Note that we do rt_free on this new route entry, so that
1106                  * once its refcount hits zero, we are still able to reap it
1107                  * (thanks Alexey).
1108                  * Note also that rt_free uses call_rcu.  We don't actually
1109                  * need RCU protection here; this is just our path to get
1110                  * on the route gc list.
1111                  */
1112
1113                 rt->dst.flags |= DST_NOCACHE;
1114                 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1115                         int err = arp_bind_neighbour(&rt->dst);
1116                         if (err) {
1117                                 if (net_ratelimit())
1118                                         printk(KERN_WARNING
1119                                             "Neighbour table failure & not caching routes.\n");
1120                                 rt_drop(rt);
1121                                 return err;
1122                         }
1123                 }
1124
1125                 rt_free(rt);
1126                 goto skip_hashing;
1127         }
1128
1129         rthp = &rt_hash_table[hash].chain;
1130
1131         spin_lock_bh(rt_hash_lock_addr(hash));
1132         while ((rth = *rthp) != NULL) {
1133                 if (rt_is_expired(rth)) {
1134                         *rthp = rth->dst.rt_next;
1135                         rt_free(rth);
1136                         continue;
1137                 }
1138                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1139                         /* Put it first */
1140                         *rthp = rth->dst.rt_next;
1141                         /*
1142                          * Since lookup is lockfree, the deletion
1143                          * must be visible to another weakly ordered CPU before
1144                          * the insertion at the start of the hash chain.
1145                          */
1146                         rcu_assign_pointer(rth->dst.rt_next,
1147                                            rt_hash_table[hash].chain);
1148                         /*
1149                          * Since lookup is lockfree, the update writes
1150                          * must be ordered for consistency on SMP.
1151                          */
1152                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1153
1154                         dst_use(&rth->dst, now);
1155                         spin_unlock_bh(rt_hash_lock_addr(hash));
1156
1157                         rt_drop(rt);
1158                         if (rp)
1159                                 *rp = rth;
1160                         else
1161                                 skb_dst_set(skb, &rth->dst);
1162                         return 0;
1163                 }
1164
1165                 if (!atomic_read(&rth->dst.__refcnt)) {
1166                         u32 score = rt_score(rth);
1167
1168                         if (score <= min_score) {
1169                                 cand = rth;
1170                                 candp = rthp;
1171                                 min_score = score;
1172                         }
1173                 }
1174
1175                 chain_length++;
1176
1177                 rthp = &rth->dst.rt_next;
1178         }
1179
1180         if (cand) {
1181                 /* ip_rt_gc_elasticity used to be the average chain length;
1182                  * when it is exceeded, gc becomes really aggressive.
1183                  *
1184                  * The second limit is less certain. At the moment it allows
1185                  * only 2 entries per bucket. We will see.
1186                  */
1187                 if (chain_length > ip_rt_gc_elasticity) {
1188                         *candp = cand->dst.rt_next;
1189                         rt_free(cand);
1190                 }
1191         } else {
1192                 if (chain_length > rt_chain_length_max &&
1193                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1194                         struct net *net = dev_net(rt->dst.dev);
1195                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1196                         if (!rt_caching(net)) {
1197                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1198                                         rt->dst.dev->name, num);
1199                         }
1200                         rt_emergency_hash_rebuild(net);
1201                         spin_unlock_bh(rt_hash_lock_addr(hash));
1202
1203                         hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1204                                         ifindex, rt_genid(net));
1205                         goto restart;
1206                 }
1207         }
1208
1209         /* Try to bind the route to arp only if it is an output
1210            route or a unicast forwarding path.
1211          */
1212         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1213                 int err = arp_bind_neighbour(&rt->dst);
1214                 if (err) {
1215                         spin_unlock_bh(rt_hash_lock_addr(hash));
1216
1217                         if (err != -ENOBUFS) {
1218                                 rt_drop(rt);
1219                                 return err;
1220                         }
1221
1222                         /* Neighbour tables are full and nothing
1223                            can be released. Try to shrink the route cache;
1224                            most likely it holds some neighbour records.
1225                          */
1226                         if (attempts-- > 0) {
1227                                 int saved_elasticity = ip_rt_gc_elasticity;
1228                                 int saved_int = ip_rt_gc_min_interval;
1229                                 ip_rt_gc_elasticity     = 1;
1230                                 ip_rt_gc_min_interval   = 0;
1231                                 rt_garbage_collect(&ipv4_dst_ops);
1232                                 ip_rt_gc_min_interval   = saved_int;
1233                                 ip_rt_gc_elasticity     = saved_elasticity;
1234                                 goto restart;
1235                         }
1236
1237                         if (net_ratelimit())
1238                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1239                         rt_drop(rt);
1240                         return -ENOBUFS;
1241                 }
1242         }
1243
1244         rt->dst.rt_next = rt_hash_table[hash].chain;
1245
1246 #if RT_CACHE_DEBUG >= 2
1247         if (rt->dst.rt_next) {
1248                 struct rtable *trt;
1249                 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1250                        hash, &rt->rt_dst);
1251                 for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
1252                         printk(" . %pI4", &trt->rt_dst);
1253                 printk("\n");
1254         }
1255 #endif
1256         /*
1257          * Since lookup is lockfree, we must make sure
1258          * previous writes to rt are committed to memory
1259          * before making rt visible to other CPUs.
1260          */
1261         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1262
1263         spin_unlock_bh(rt_hash_lock_addr(hash));
1264
1265 skip_hashing:
1266         if (rp)
1267                 *rp = rt;
1268         else
1269                 skb_dst_set(skb, &rt->dst);
1270         return 0;
1271 }
1272
1273 void rt_bind_peer(struct rtable *rt, int create)
1274 {
1275         struct inet_peer *peer;
1276
1277         peer = inet_getpeer(rt->rt_dst, create);
1278
1279         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1280                 inet_putpeer(peer);
1281 }
1282
1283 /*
1284  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1285  * we can still generate some output.
1286  * Random ID selection looks a bit dangerous because we have no chance of
1287  * selecting an ID that is unique over a reasonable period of time.
1288  * But a broken packet identifier may be better than no packet at all.
1289  */
1290 static void ip_select_fb_ident(struct iphdr *iph)
1291 {
1292         static DEFINE_SPINLOCK(ip_fb_id_lock);
1293         static u32 ip_fallback_id;
1294         u32 salt;
1295
1296         spin_lock_bh(&ip_fb_id_lock);
1297         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1298         iph->id = htons(salt & 0xFFFF);
1299         ip_fallback_id = salt;
1300         spin_unlock_bh(&ip_fb_id_lock);
1301 }
1302
1303 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1304 {
1305         struct rtable *rt = (struct rtable *) dst;
1306
1307         if (rt) {
1308                 if (rt->peer == NULL)
1309                         rt_bind_peer(rt, 1);
1310
1311                 /* If a peer is attached to the destination, it is never detached,
1312                    so we need not grab a lock to dereference it.
1313                  */
1314                 if (rt->peer) {
1315                         iph->id = htons(inet_getid(rt->peer, more));
1316                         return;
1317                 }
1318         } else
1319                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1320                        __builtin_return_address(0));
1321
1322         ip_select_fb_ident(iph);
1323 }
1324 EXPORT_SYMBOL(__ip_select_ident);
1325
1326 static void rt_del(unsigned hash, struct rtable *rt)
1327 {
1328         struct rtable **rthp, *aux;
1329
1330         rthp = &rt_hash_table[hash].chain;
1331         spin_lock_bh(rt_hash_lock_addr(hash));
1332         ip_rt_put(rt);
1333         while ((aux = *rthp) != NULL) {
1334                 if (aux == rt || rt_is_expired(aux)) {
1335                         *rthp = aux->dst.rt_next;
1336                         rt_free(aux);
1337                         continue;
1338                 }
1339                 rthp = &aux->dst.rt_next;
1340         }
1341         spin_unlock_bh(rt_hash_lock_addr(hash));
1342 }
1343
1344 /* called in rcu_read_lock() section */
1345 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1346                     __be32 saddr, struct net_device *dev)
1347 {
1348         int i, k;
1349         struct in_device *in_dev = __in_dev_get_rcu(dev);
1350         struct rtable *rth, **rthp;
1351         __be32  skeys[2] = { saddr, 0 };
1352         int  ikeys[2] = { dev->ifindex, 0 };
1353         struct netevent_redirect netevent;
1354         struct net *net;
1355
1356         if (!in_dev)
1357                 return;
1358
1359         net = dev_net(dev);
1360         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1361             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1362             ipv4_is_zeronet(new_gw))
1363                 goto reject_redirect;
1364
1365         if (!rt_caching(net))
1366                 goto reject_redirect;
1367
1368         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1369                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1370                         goto reject_redirect;
1371                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1372                         goto reject_redirect;
1373         } else {
1374                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1375                         goto reject_redirect;
1376         }
1377
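        /*
         * Scan all four combinations of {saddr, 0} and {dev->ifindex, 0}:
         * cached entries for this destination may have been created from
         * lookups with an unspecified (zero) source address or output
         * interface, so the wildcard keys need to be checked as well.
         */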
1378         for (i = 0; i < 2; i++) {
1379                 for (k = 0; k < 2; k++) {
1380                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1381                                                 rt_genid(net));
1382
1383                         rthp=&rt_hash_table[hash].chain;
1384
1385                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1386                                 struct rtable *rt;
1387
1388                                 if (rth->fl.fl4_dst != daddr ||
1389                                     rth->fl.fl4_src != skeys[i] ||
1390                                     rth->fl.oif != ikeys[k] ||
1391                                     rth->fl.iif != 0 ||
1392                                     rt_is_expired(rth) ||
1393                                     !net_eq(dev_net(rth->dst.dev), net)) {
1394                                         rthp = &rth->dst.rt_next;
1395                                         continue;
1396                                 }
1397
1398                                 if (rth->rt_dst != daddr ||
1399                                     rth->rt_src != saddr ||
1400                                     rth->dst.error ||
1401                                     rth->rt_gateway != old_gw ||
1402                                     rth->dst.dev != dev)
1403                                         break;
1404
1405                                 dst_hold(&rth->dst);
1406
1407                                 rt = dst_alloc(&ipv4_dst_ops);
1408                                 if (rt == NULL) {
1409                                         ip_rt_put(rth);
1410                                         return;
1411                                 }
1412
1413                                 /* Copy all the information. */
1414                                 *rt = *rth;
1415                                 rt->dst.__use           = 1;
1416                                 atomic_set(&rt->dst.__refcnt, 1);
1417                                 rt->dst.child           = NULL;
1418                                 if (rt->dst.dev)
1419                                         dev_hold(rt->dst.dev);
1420                                 if (rt->idev)
1421                                         in_dev_hold(rt->idev);
1422                                 rt->dst.obsolete        = -1;
1423                                 rt->dst.lastuse = jiffies;
1424                                 rt->dst.path            = &rt->dst;
1425                                 rt->dst.neighbour       = NULL;
1426                                 rt->dst.hh              = NULL;
1427 #ifdef CONFIG_XFRM
1428                                 rt->dst.xfrm            = NULL;
1429 #endif
1430                                 rt->rt_genid            = rt_genid(net);
1431                                 rt->rt_flags            |= RTCF_REDIRECTED;
1432
1433                                 /* Gateway is different ... */
1434                                 rt->rt_gateway          = new_gw;
1435
1436                                 /* Redirect received -> path was valid */
1437                                 dst_confirm(&rth->dst);
1438
1439                                 if (rt->peer)
1440                                         atomic_inc(&rt->peer->refcnt);
1441
1442                                 if (arp_bind_neighbour(&rt->dst) ||
1443                                     !(rt->dst.neighbour->nud_state &
1444                                             NUD_VALID)) {
1445                                         if (rt->dst.neighbour)
1446                                                 neigh_event_send(rt->dst.neighbour, NULL);
1447                                         ip_rt_put(rth);
1448                                         rt_drop(rt);
1449                                         goto do_next;
1450                                 }
1451
1452                                 netevent.old = &rth->dst;
1453                                 netevent.new = &rt->dst;
1454                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1455                                                         &netevent);
1456
1457                                 rt_del(hash, rth);
1458                                 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1459                                         ip_rt_put(rt);
1460                                 goto do_next;
1461                         }
1462                 do_next:
1463                         ;
1464                 }
1465         }
1466         return;
1467
1468 reject_redirect:
1469 #ifdef CONFIG_IP_ROUTE_VERBOSE
1470         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1471                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1472                         "  Advised path = %pI4 -> %pI4\n",
1473                        &old_gw, dev->name, &new_gw,
1474                        &saddr, &daddr);
1475 #endif
1476         ;
1477 }
1478
1479 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1480 {
1481         struct rtable *rt = (struct rtable *)dst;
1482         struct dst_entry *ret = dst;
1483
1484         if (rt) {
1485                 if (dst->obsolete > 0) {
1486                         ip_rt_put(rt);
1487                         ret = NULL;
1488                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1489                            (rt->dst.expires &&
1490                             time_after_eq(jiffies, rt->dst.expires))) {
1491                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1492                                                 rt->fl.oif,
1493                                                 rt_genid(dev_net(dst->dev)));
1494 #if RT_CACHE_DEBUG >= 1
1495                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1496                                 &rt->rt_dst, rt->fl.fl4_tos);
1497 #endif
1498                         rt_del(hash, rt);
1499                         ret = NULL;
1500                 }
1501         }
1502         return ret;
1503 }
1504
1505 /*
1506  * Algorithm:
1507  *      1. The first ip_rt_redirect_number redirects are sent
1508  *         with exponential backoff, after which we stop sending them altogether,
1509  *         assuming that the host ignores our redirects.
1510  *      2. If we do not see packets requiring redirects
1511  *         during ip_rt_redirect_silence, we assume that the host has
1512  *         forgotten the redirected route and start sending redirects again.
1513  *
1514  * This algorithm is much cheaper and more intelligent than dumb load limiting
1515  * in icmp.c.
1516  *
1517  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1518  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1519  */
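
/*
 * Illustrative worked example (not part of the original source; it assumes
 * the defaults for ip_rt_redirect_load and ip_rt_redirect_number set
 * earlier in this file, taken here as HZ/50 and 9 respectively).
 * With rate_tokens counting redirects already sent, the code below emits a
 * redirect when rate_tokens is 0, or once
 *
 *      jiffies > rate_last + (ip_rt_redirect_load << rate_tokens)
 *
 * so the required gaps grow as 2*HZ/50, 4*HZ/50, 8*HZ/50, ...  After the
 * 9th redirect (rate_tokens == ip_rt_redirect_number) nothing more is sent
 * until a quiet period of ip_rt_redirect_silence resets rate_tokens to 0.
 */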
1520
1521 void ip_rt_send_redirect(struct sk_buff *skb)
1522 {
1523         struct rtable *rt = skb_rtable(skb);
1524         struct in_device *in_dev;
1525         int log_martians;
1526
1527         rcu_read_lock();
1528         in_dev = __in_dev_get_rcu(rt->dst.dev);
1529         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1530                 rcu_read_unlock();
1531                 return;
1532         }
1533         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1534         rcu_read_unlock();
1535
1536         /* No redirected packets during ip_rt_redirect_silence;
1537          * reset the algorithm.
1538          */
1539         if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
1540                 rt->dst.rate_tokens = 0;
1541
1542         /* Too many ignored redirects; do not send anything.
1543          * Set dst.rate_last to the last seen redirected packet.
1544          */
1545         if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
1546                 rt->dst.rate_last = jiffies;
1547                 return;
1548         }
1549
1550         /* Check for load limit; set rate_last to the latest sent
1551          * redirect.
1552          */
1553         if (rt->dst.rate_tokens == 0 ||
1554             time_after(jiffies,
1555                        (rt->dst.rate_last +
1556                         (ip_rt_redirect_load << rt->dst.rate_tokens)))) {
1557                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1558                 rt->dst.rate_last = jiffies;
1559                 ++rt->dst.rate_tokens;
1560 #ifdef CONFIG_IP_ROUTE_VERBOSE
1561                 if (log_martians &&
1562                     rt->dst.rate_tokens == ip_rt_redirect_number &&
1563                     net_ratelimit())
1564                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1565                                 &rt->rt_src, rt->rt_iif,
1566                                 &rt->rt_dst, &rt->rt_gateway);
1567 #endif
1568         }
1569 }
1570
1571 static int ip_error(struct sk_buff *skb)
1572 {
1573         struct rtable *rt = skb_rtable(skb);
1574         unsigned long now;
1575         int code;
1576
1577         switch (rt->dst.error) {
1578                 case EINVAL:
1579                 default:
1580                         goto out;
1581                 case EHOSTUNREACH:
1582                         code = ICMP_HOST_UNREACH;
1583                         break;
1584                 case ENETUNREACH:
1585                         code = ICMP_NET_UNREACH;
1586                         IP_INC_STATS_BH(dev_net(rt->dst.dev),
1587                                         IPSTATS_MIB_INNOROUTES);
1588                         break;
1589                 case EACCES:
1590                         code = ICMP_PKT_FILTERED;
1591                         break;
1592         }
1593
1594         now = jiffies;
1595         rt->dst.rate_tokens += now - rt->dst.rate_last;
1596         if (rt->dst.rate_tokens > ip_rt_error_burst)
1597                 rt->dst.rate_tokens = ip_rt_error_burst;
1598         rt->dst.rate_last = now;
1599         if (rt->dst.rate_tokens >= ip_rt_error_cost) {
1600                 rt->dst.rate_tokens -= ip_rt_error_cost;
1601                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1602         }
1603
1604 out:    kfree_skb(skb);
1605         return 0;
1606 }
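
/*
 * Illustrative note (not part of the original source): the rate limiting
 * above is a simple token bucket.  Tokens accrue at one per jiffy since
 * rate_last, are capped at ip_rt_error_burst, and each ICMP_DEST_UNREACH
 * sent costs ip_rt_error_cost tokens.  Assuming the defaults defined
 * earlier in this file are ip_rt_error_cost = HZ and ip_rt_error_burst =
 * 5 * HZ, this works out to roughly one error per second in the steady
 * state, with a burst of up to five after a quiet spell.
 */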
1607
1608 /*
1609  *      The last two values are not from the RFC but
1610  *      are needed for AMPRnet AX.25 paths.
1611  */
1612
1613 static const unsigned short mtu_plateau[] =
1614 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1615
1616 static inline unsigned short guess_mtu(unsigned short old_mtu)
1617 {
1618         int i;
1619
1620         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1621                 if (old_mtu > mtu_plateau[i])
1622                         return mtu_plateau[i];
1623         return 68;
1624 }
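
/*
 * Worked examples (illustrative, derived from the plateau table above):
 *      guess_mtu(1500) -> 1492   first plateau strictly below 1500
 *      guess_mtu(576)  -> 296    576 is not strictly greater than 576
 *      guess_mtu(68)   -> 68     below every plateau, fall back to the
 *                                IPv4 minimum of 68
 */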
1625
1626 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1627                                  unsigned short new_mtu,
1628                                  struct net_device *dev)
1629 {
1630         int i, k;
1631         unsigned short old_mtu = ntohs(iph->tot_len);
1632         struct rtable *rth;
1633         int  ikeys[2] = { dev->ifindex, 0 };
1634         __be32  skeys[2] = { iph->saddr, 0, };
1635         __be32  daddr = iph->daddr;
1636         unsigned short est_mtu = 0;
1637
1638         for (k = 0; k < 2; k++) {
1639                 for (i = 0; i < 2; i++) {
1640                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1641                                                 rt_genid(net));
1642
1643                         rcu_read_lock();
1644                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1645                              rth = rcu_dereference(rth->dst.rt_next)) {
1646                                 unsigned short mtu = new_mtu;
1647
1648                                 if (rth->fl.fl4_dst != daddr ||
1649                                     rth->fl.fl4_src != skeys[i] ||
1650                                     rth->rt_dst != daddr ||
1651                                     rth->rt_src != iph->saddr ||
1652                                     rth->fl.oif != ikeys[k] ||
1653                                     rth->fl.iif != 0 ||
1654                                     dst_metric_locked(&rth->dst, RTAX_MTU) ||
1655                                     !net_eq(dev_net(rth->dst.dev), net) ||
1656                                     rt_is_expired(rth))
1657                                         continue;
1658
1659                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1660
1661                                         /* BSD 4.2 compatibility hack :-( */
1662                                         if (mtu == 0 &&
1663                                             old_mtu >= dst_mtu(&rth->dst) &&
1664                                             old_mtu >= 68 + (iph->ihl << 2))
1665                                                 old_mtu -= iph->ihl << 2;
1666
1667                                         mtu = guess_mtu(old_mtu);
1668                                 }
1669                                 if (mtu <= dst_mtu(&rth->dst)) {
1670                                         if (mtu < dst_mtu(&rth->dst)) {
1671                                                 dst_confirm(&rth->dst);
1672                                                 if (mtu < ip_rt_min_pmtu) {
1673                                                         mtu = ip_rt_min_pmtu;
1674                                                         rth->dst.metrics[RTAX_LOCK-1] |=
1675                                                                 (1 << RTAX_MTU);
1676                                                 }
1677                                                 rth->dst.metrics[RTAX_MTU-1] = mtu;
1678                                                 dst_set_expires(&rth->dst,
1679                                                         ip_rt_mtu_expires);
1680                                         }
1681                                         est_mtu = mtu;
1682                                 }
1683                         }
1684                         rcu_read_unlock();
1685                 }
1686         }
1687         return est_mtu ? : new_mtu;
1688 }
1689
1690 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1691 {
1692         if (dst_mtu(dst) > mtu && mtu >= 68 &&
1693             !(dst_metric_locked(dst, RTAX_MTU))) {
1694                 if (mtu < ip_rt_min_pmtu) {
1695                         mtu = ip_rt_min_pmtu;
1696                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1697                 }
1698                 dst->metrics[RTAX_MTU-1] = mtu;
1699                 dst_set_expires(dst, ip_rt_mtu_expires);
1700                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1701         }
1702 }
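
/*
 * Note (illustrative, not from the original source): this function is
 * presumably wired up as the .update_pmtu hook of ipv4_dst_ops earlier in
 * this file, so transports shrink a cached route's PMTU through the
 * generic dst operation, e.g.:
 *
 *      dst->ops->update_pmtu(dst, reported_mtu);
 *
 * The value is clamped to at least ip_rt_min_pmtu, and when that clamp
 * kicks in the RTAX_MTU metric is locked, which blocks any further
 * updates on this entry.
 */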
1703
1704 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1705 {
1706         if (rt_is_expired((struct rtable *)dst))
1707                 return NULL;
1708         return dst;
1709 }
1710
1711 static void ipv4_dst_destroy(struct dst_entry *dst)
1712 {
1713         struct rtable *rt = (struct rtable *) dst;
1714         struct inet_peer *peer = rt->peer;
1715         struct in_device *idev = rt->idev;
1716
1717         if (peer) {
1718                 rt->peer = NULL;
1719                 inet_putpeer(peer);
1720         }
1721
1722         if (idev) {
1723                 rt->idev = NULL;
1724                 in_dev_put(idev);
1725         }
1726 }
1727
1728 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1729                             int how)
1730 {
1731         struct rtable *rt = (struct rtable *) dst;
1732         struct in_device *idev = rt->idev;
1733         if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1734                 struct in_device *loopback_idev =
1735                         in_dev_get(dev_net(dev)->loopback_dev);
1736                 if (loopback_idev) {
1737                         rt->idev = loopback_idev;
1738                         in_dev_put(idev);
1739                 }
1740         }
1741 }
1742
1743 static void ipv4_link_failure(struct sk_buff *skb)
1744 {
1745         struct rtable *rt;
1746
1747         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1748
1749         rt = skb_rtable(skb);
1750         if (rt)
1751                 dst_set_expires(&rt->dst, 0);
1752 }
1753
1754 static int ip_rt_bug(struct sk_buff *skb)
1755 {
1756         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1757                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1758                 skb->dev ? skb->dev->name : "?");
1759         kfree_skb(skb);
1760         return 0;
1761 }
1762
1763 /*
1764    We do not cache the source address of the outgoing interface,
1765    because it is used only by the IP RR, TS and SRR options,
1766    so it is out of the fast path.
1767
1768    BTW remember: "addr" may be unaligned
1769    in IP options!
1770  */
1771
1772 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1773 {
1774         __be32 src;
1775         struct fib_result res;
1776
1777         if (rt->fl.iif == 0)
1778                 src = rt->rt_src;
1779         else {
1780                 rcu_read_lock();
1781                 if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
1782                         src = FIB_RES_PREFSRC(res);
1783                 else
1784                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1785                                         RT_SCOPE_UNIVERSE);
1786                 rcu_read_unlock();
1787         }
1788         memcpy(addr, &src, 4);
1789 }
1790
1791 #ifdef CONFIG_NET_CLS_ROUTE
1792 static void set_class_tag(struct rtable *rt, u32 tag)
1793 {
1794         if (!(rt->dst.tclassid & 0xFFFF))
1795                 rt->dst.tclassid |= tag & 0xFFFF;
1796         if (!(rt->dst.tclassid & 0xFFFF0000))
1797                 rt->dst.tclassid |= tag & 0xFFFF0000;
1798 }
1799 #endif
1800
1801 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1802 {
1803         struct fib_info *fi = res->fi;
1804
1805         if (fi) {
1806                 if (FIB_RES_GW(*res) &&
1807                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1808                         rt->rt_gateway = FIB_RES_GW(*res);
1809                 memcpy(rt->dst.metrics, fi->fib_metrics,
1810                        sizeof(rt->dst.metrics));
1811                 if (fi->fib_mtu == 0) {
1812                         rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
1813                         if (dst_metric_locked(&rt->dst, RTAX_MTU) &&
1814                             rt->rt_gateway != rt->rt_dst &&
1815                             rt->dst.dev->mtu > 576)
1816                                 rt->dst.metrics[RTAX_MTU-1] = 576;
1817                 }
1818 #ifdef CONFIG_NET_CLS_ROUTE
1819                 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1820 #endif
1821         } else
1822                 rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu;
1823
1824         if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
1825                 rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1826         if (dst_mtu(&rt->dst) > IP_MAX_MTU)
1827                 rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1828         if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0)
1829                 rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40,
1830                                        ip_rt_min_advmss);
1831         if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40)
1832                 rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1833
1834 #ifdef CONFIG_NET_CLS_ROUTE
1835 #ifdef CONFIG_IP_MULTIPLE_TABLES
1836         set_class_tag(rt, fib_rules_tclass(res));
1837 #endif
1838         set_class_tag(rt, itag);
1839 #endif
1840         rt->rt_type = res->type;
1841 }
1842
1843 /* called in rcu_read_lock() section */
1844 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1845                                 u8 tos, struct net_device *dev, int our)
1846 {
1847         unsigned int hash;
1848         struct rtable *rth;
1849         __be32 spec_dst;
1850         struct in_device *in_dev = __in_dev_get_rcu(dev);
1851         u32 itag = 0;
1852         int err;
1853
1854         /* Primary sanity checks. */
1855
1856         if (in_dev == NULL)
1857                 return -EINVAL;
1858
1859         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1860             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1861                 goto e_inval;
1862
1863         if (ipv4_is_zeronet(saddr)) {
1864                 if (!ipv4_is_local_multicast(daddr))
1865                         goto e_inval;
1866                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1867         } else {
1868                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1869                                           &itag, 0);
1870                 if (err < 0)
1871                         goto e_err;
1872         }
1873         rth = dst_alloc(&ipv4_dst_ops);
1874         if (!rth)
1875                 goto e_nobufs;
1876
1877         rth->dst.output = ip_rt_bug;
1878         rth->dst.obsolete = -1;
1879
1880         atomic_set(&rth->dst.__refcnt, 1);
1881         rth->dst.flags= DST_HOST;
1882         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1883                 rth->dst.flags |= DST_NOPOLICY;
1884         rth->fl.fl4_dst = daddr;
1885         rth->rt_dst     = daddr;
1886         rth->fl.fl4_tos = tos;
1887         rth->fl.mark    = skb->mark;
1888         rth->fl.fl4_src = saddr;
1889         rth->rt_src     = saddr;
1890 #ifdef CONFIG_NET_CLS_ROUTE
1891         rth->dst.tclassid = itag;
1892 #endif
1893         rth->rt_iif     =
1894         rth->fl.iif     = dev->ifindex;
1895         rth->dst.dev    = init_net.loopback_dev;
1896         dev_hold(rth->dst.dev);
1897         rth->idev       = in_dev_get(rth->dst.dev);
1898         rth->fl.oif     = 0;
1899         rth->rt_gateway = daddr;
1900         rth->rt_spec_dst= spec_dst;
1901         rth->rt_genid   = rt_genid(dev_net(dev));
1902         rth->rt_flags   = RTCF_MULTICAST;
1903         rth->rt_type    = RTN_MULTICAST;
1904         if (our) {
1905                 rth->dst.input= ip_local_deliver;
1906                 rth->rt_flags |= RTCF_LOCAL;
1907         }
1908
1909 #ifdef CONFIG_IP_MROUTE
1910         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1911                 rth->dst.input = ip_mr_input;
1912 #endif
1913         RT_CACHE_STAT_INC(in_slow_mc);
1914
1915         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1916         return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1917
1918 e_nobufs:
1919         return -ENOBUFS;
1920 e_inval:
1921         return -EINVAL;
1922 e_err:
1923         return err;
1924 }
1925
1926
1927 static void ip_handle_martian_source(struct net_device *dev,
1928                                      struct in_device *in_dev,
1929                                      struct sk_buff *skb,
1930                                      __be32 daddr,
1931                                      __be32 saddr)
1932 {
1933         RT_CACHE_STAT_INC(in_martian_src);
1934 #ifdef CONFIG_IP_ROUTE_VERBOSE
1935         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1936                 /*
1937                  *      RFC1812 recommendation: if the source is martian,
1938                  *      the only hint is the MAC header.
1939                  */
1940                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1941                         &daddr, &saddr, dev->name);
1942                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1943                         int i;
1944                         const unsigned char *p = skb_mac_header(skb);
1945                         printk(KERN_WARNING "ll header: ");
1946                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1947                                 printk("%02x", *p);
1948                                 if (i < (dev->hard_header_len - 1))
1949                                         printk(":");
1950                         }
1951                         printk("\n");
1952                 }
1953         }
1954 #endif
1955 }
1956
1957 /* called in rcu_read_lock() section */
1958 static int __mkroute_input(struct sk_buff *skb,
1959                            struct fib_result *res,
1960                            struct in_device *in_dev,
1961                            __be32 daddr, __be32 saddr, u32 tos,
1962                            struct rtable **result)
1963 {
1964         struct rtable *rth;
1965         int err;
1966         struct in_device *out_dev;
1967         unsigned int flags = 0;
1968         __be32 spec_dst;
1969         u32 itag;
1970
1971         /* get a working reference to the output device */
1972         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1973         if (out_dev == NULL) {
1974                 if (net_ratelimit())
1975                         printk(KERN_CRIT "Bug in ip_route_input" \
1976                                "_slow(). Please, report\n");
1977                 return -EINVAL;
1978         }
1979
1980
1981         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1982                                   in_dev->dev, &spec_dst, &itag, skb->mark);
1983         if (err < 0) {
1984                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1985                                          saddr);
1986
1987                 goto cleanup;
1988         }
1989
1990         if (err)
1991                 flags |= RTCF_DIRECTSRC;
1992
1993         if (out_dev == in_dev && err &&
1994             (IN_DEV_SHARED_MEDIA(out_dev) ||
1995              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1996                 flags |= RTCF_DOREDIRECT;
1997
1998         if (skb->protocol != htons(ETH_P_IP)) {
1999                 /* Not IP (i.e. ARP). Do not create a route if it is
2000                  * invalid for proxy ARP. DNAT routes are always valid.
2001                  *
2002                  * The proxy ARP feature has been extended to allow ARP
2003                  * replies back out the same interface, to support
2004                  * Private VLAN switch technologies. See arp.c.
2005                  */
2006                 if (out_dev == in_dev &&
2007                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2008                         err = -EINVAL;
2009                         goto cleanup;
2010                 }
2011         }
2012
2013
2014         rth = dst_alloc(&ipv4_dst_ops);
2015         if (!rth) {
2016                 err = -ENOBUFS;
2017                 goto cleanup;
2018         }
2019
2020         atomic_set(&rth->dst.__refcnt, 1);
2021         rth->dst.flags= DST_HOST;
2022         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2023                 rth->dst.flags |= DST_NOPOLICY;
2024         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2025                 rth->dst.flags |= DST_NOXFRM;
2026         rth->fl.fl4_dst = daddr;
2027         rth->rt_dst     = daddr;
2028         rth->fl.fl4_tos = tos;
2029         rth->fl.mark    = skb->mark;
2030         rth->fl.fl4_src = saddr;
2031         rth->rt_src     = saddr;
2032         rth->rt_gateway = daddr;
2033         rth->rt_iif     =
2034                 rth->fl.iif     = in_dev->dev->ifindex;
2035         rth->dst.dev    = (out_dev)->dev;
2036         dev_hold(rth->dst.dev);
2037         rth->idev       = in_dev_get(rth->dst.dev);
2038         rth->fl.oif     = 0;
2039         rth->rt_spec_dst= spec_dst;
2040
2041         rth->dst.obsolete = -1;
2042         rth->dst.input = ip_forward;
2043         rth->dst.output = ip_output;
2044         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2045
2046         rt_set_nexthop(rth, res, itag);
2047
2048         rth->rt_flags = flags;
2049
2050         *result = rth;
2051         err = 0;
2052  cleanup:
2053         return err;
2054 }
2055
2056 static int ip_mkroute_input(struct sk_buff *skb,
2057                             struct fib_result *res,
2058                             const struct flowi *fl,
2059                             struct in_device *in_dev,
2060                             __be32 daddr, __be32 saddr, u32 tos)
2061 {
2062         struct rtable* rth = NULL;
2063         int err;
2064         unsigned hash;
2065
2066 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2067         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2068                 fib_select_multipath(fl, res);
2069 #endif
2070
2071         /* create a routing cache entry */
2072         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2073         if (err)
2074                 return err;
2075
2076         /* put it into the cache */
2077         hash = rt_hash(daddr, saddr, fl->iif,
2078                        rt_genid(dev_net(rth->dst.dev)));
2079         return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2080 }
2081
2082 /*
2083  *      NOTE. We drop all packets that have a local source
2084  *      address, because every properly looped-back packet
2085  *      must already have the correct destination attached by the output routine.
2086  *
2087  *      This approach solves two big problems:
2088  *      1. Non-simplex devices are handled properly.
2089  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2090  *      called with rcu_read_lock()
2091  */
2092
2093 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2094                                u8 tos, struct net_device *dev)
2095 {
2096         struct fib_result res;
2097         struct in_device *in_dev = __in_dev_get_rcu(dev);
2098         struct flowi fl = { .nl_u = { .ip4_u =
2099                                       { .daddr = daddr,
2100                                         .saddr = saddr,
2101                                         .tos = tos,
2102                                         .scope = RT_SCOPE_UNIVERSE,
2103                                       } },
2104                             .mark = skb->mark,
2105                             .iif = dev->ifindex };
2106         unsigned        flags = 0;
2107         u32             itag = 0;
2108         struct rtable * rth;
2109         unsigned        hash;
2110         __be32          spec_dst;
2111         int             err = -EINVAL;
2112         struct net    * net = dev_net(dev);
2113
2114         /* IP on this device is disabled. */
2115
2116         if (!in_dev)
2117                 goto out;
2118
2119         /* Check for the weirdest martians, which cannot be detected
2120            by fib_lookup.
2121          */
2122
2123         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2124             ipv4_is_loopback(saddr))
2125                 goto martian_source;
2126
2127         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2128                 goto brd_input;
2129
2130         /* Accept zero addresses only to limited broadcast;
2131          * it is unclear whether this should be fixed. Waiting for complaints :-)
2132          */
2133         if (ipv4_is_zeronet(saddr))
2134                 goto martian_source;
2135
2136         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2137                 goto martian_destination;
2138
2139         /*
2140          *      Now we are ready to route the packet.
2141          */
2142         err = fib_lookup(net, &fl, &res);
2143         if (err != 0) {
2144                 if (!IN_DEV_FORWARD(in_dev))
2145                         goto e_hostunreach;
2146                 goto no_route;
2147         }
2148
2149         RT_CACHE_STAT_INC(in_slow_tot);
2150
2151         if (res.type == RTN_BROADCAST)
2152                 goto brd_input;
2153
2154         if (res.type == RTN_LOCAL) {
2155                 err = fib_validate_source(saddr, daddr, tos,
2156                                           net->loopback_dev->ifindex,
2157                                           dev, &spec_dst, &itag, skb->mark);
2158                 if (err < 0)
2159                         goto martian_source_keep_err;
2160                 if (err)
2161                         flags |= RTCF_DIRECTSRC;
2162                 spec_dst = daddr;
2163                 goto local_input;
2164         }
2165
2166         if (!IN_DEV_FORWARD(in_dev))
2167                 goto e_hostunreach;
2168         if (res.type != RTN_UNICAST)
2169                 goto martian_destination;
2170
2171         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2172 out:    return err;
2173
2174 brd_input:
2175         if (skb->protocol != htons(ETH_P_IP))
2176                 goto e_inval;
2177
2178         if (ipv4_is_zeronet(saddr))
2179                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2180         else {
2181                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2182                                           &itag, skb->mark);
2183                 if (err < 0)
2184                         goto martian_source_keep_err;
2185                 if (err)
2186                         flags |= RTCF_DIRECTSRC;
2187         }
2188         flags |= RTCF_BROADCAST;
2189         res.type = RTN_BROADCAST;
2190         RT_CACHE_STAT_INC(in_brd);
2191
2192 local_input:
2193         rth = dst_alloc(&ipv4_dst_ops);
2194         if (!rth)
2195                 goto e_nobufs;
2196
2197         rth->dst.output= ip_rt_bug;
2198         rth->dst.obsolete = -1;
2199         rth->rt_genid = rt_genid(net);
2200
2201         atomic_set(&rth->dst.__refcnt, 1);
2202         rth->dst.flags= DST_HOST;
2203         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2204                 rth->dst.flags |= DST_NOPOLICY;
2205         rth->fl.fl4_dst = daddr;
2206         rth->rt_dst     = daddr;
2207         rth->fl.fl4_tos = tos;
2208         rth->fl.mark    = skb->mark;
2209         rth->fl.fl4_src = saddr;
2210         rth->rt_src     = saddr;
2211 #ifdef CONFIG_NET_CLS_ROUTE
2212         rth->dst.tclassid = itag;
2213 #endif
2214         rth->rt_iif     =
2215         rth->fl.iif     = dev->ifindex;
2216         rth->dst.dev    = net->loopback_dev;
2217         dev_hold(rth->dst.dev);
2218         rth->idev       = in_dev_get(rth->dst.dev);
2219         rth->rt_gateway = daddr;
2220         rth->rt_spec_dst= spec_dst;
2221         rth->dst.input= ip_local_deliver;
2222         rth->rt_flags   = flags|RTCF_LOCAL;
2223         if (res.type == RTN_UNREACHABLE) {
2224                 rth->dst.input= ip_error;
2225                 rth->dst.error= -err;
2226                 rth->rt_flags   &= ~RTCF_LOCAL;
2227         }
2228         rth->rt_type    = res.type;
2229         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2230         err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2231         goto out;
2232
2233 no_route:
2234         RT_CACHE_STAT_INC(in_no_route);
2235         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2236         res.type = RTN_UNREACHABLE;
2237         if (err == -ESRCH)
2238                 err = -ENETUNREACH;
2239         goto local_input;
2240
2241         /*
2242          *      Do not cache martian addresses: they should be logged (RFC1812)
2243          */
2244 martian_destination:
2245         RT_CACHE_STAT_INC(in_martian_dst);
2246 #ifdef CONFIG_IP_ROUTE_VERBOSE
2247         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2248                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2249                         &daddr, &saddr, dev->name);
2250 #endif
2251
2252 e_hostunreach:
2253         err = -EHOSTUNREACH;
2254         goto out;
2255
2256 e_inval:
2257         err = -EINVAL;
2258         goto out;
2259
2260 e_nobufs:
2261         err = -ENOBUFS;
2262         goto out;
2263
2264 martian_source:
2265         err = -EINVAL;
2266 martian_source_keep_err:
2267         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2268         goto out;
2269 }
2270
2271 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2272                            u8 tos, struct net_device *dev, bool noref)
2273 {
2274         struct rtable * rth;
2275         unsigned        hash;
2276         int iif = dev->ifindex;
2277         struct net *net;
2278         int res;
2279
2280         net = dev_net(dev);
2281
2282         rcu_read_lock();
2283
2284         if (!rt_caching(net))
2285                 goto skip_cache;
2286
2287         tos &= IPTOS_RT_MASK;
2288         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2289
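        /*
         * Added note: the comparison below XORs each cached key field with
         * the packet's value and ORs the results together, so the entry
         * matches only when the whole expression is zero.  rth->fl.oif is
         * included without an XOR because cached input routes always have
         * oif == 0, which keeps output-route entries from matching here.
         */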
2290         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2291              rth = rcu_dereference(rth->dst.rt_next)) {
2292                 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2293                      ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2294                      (rth->fl.iif ^ iif) |
2295                      rth->fl.oif |
2296                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2297                     rth->fl.mark == skb->mark &&
2298                     net_eq(dev_net(rth->dst.dev), net) &&
2299                     !rt_is_expired(rth)) {
2300                         if (noref) {
2301                                 dst_use_noref(&rth->dst, jiffies);
2302                                 skb_dst_set_noref(skb, &rth->dst);
2303                         } else {
2304                                 dst_use(&rth->dst, jiffies);
2305                                 skb_dst_set(skb, &rth->dst);
2306                         }
2307                         RT_CACHE_STAT_INC(in_hit);
2308                         rcu_read_unlock();
2309                         return 0;
2310                 }
2311                 RT_CACHE_STAT_INC(in_hlist_search);
2312         }
2313
2314 skip_cache:
2315         /* Multicast recognition logic has been moved from the route cache to here.
2316            The problem was that too many Ethernet cards have broken/missing
2317            hardware multicast filters :-( As a result, a host on a multicast
2318            network acquires a lot of useless route cache entries, e.g. from
2319            SDR messages from all over the world. Now we try to get rid of them.
2320            Provided the software IP multicast filter is organized
2321            reasonably (at least, hashed), this does not cause a slowdown
2322            compared with route cache reject entries.
2323            Note that multicast routers are not affected, because
2324            a route cache entry is created eventually.
2325          */
2326         if (ipv4_is_multicast(daddr)) {
2327                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2328
2329                 if (in_dev) {
2330                         int our = ip_check_mc(in_dev, daddr, saddr,
2331                                               ip_hdr(skb)->protocol);
2332                         if (our
2333 #ifdef CONFIG_IP_MROUTE
2334                                 ||
2335                             (!ipv4_is_local_multicast(daddr) &&
2336                              IN_DEV_MFORWARD(in_dev))
2337 #endif
2338                            ) {
2339                                 int res = ip_route_input_mc(skb, daddr, saddr,
2340                                                             tos, dev, our);
2341                                 rcu_read_unlock();
2342                                 return res;
2343                         }
2344                 }
2345                 rcu_read_unlock();
2346                 return -EINVAL;
2347         }
2348         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2349         rcu_read_unlock();
2350         return res;
2351 }
2352 EXPORT_SYMBOL(ip_route_input_common);
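
/*
 * Usage sketch (illustrative, not part of the original source): the receive
 * path typically resolves the input route along these lines, where iph
 * points at the packet's IP header; the ip_route_input()/
 * ip_route_input_noref() wrappers in route.h are assumed to forward here
 * with noref false/true respectively:
 *
 *      err = ip_route_input_common(skb, iph->daddr, iph->saddr,
 *                                  iph->tos, skb->dev, true);
 *      if (err)
 *              goto drop;
 */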
2353
2354 /* called with rcu_read_lock() */
2355 static int __mkroute_output(struct rtable **result,
2356                             struct fib_result *res,
2357                             const struct flowi *fl,
2358                             const struct flowi *oldflp,
2359                             struct net_device *dev_out,
2360                             unsigned flags)
2361 {
2362         struct rtable *rth;
2363         struct in_device *in_dev;
2364         u32 tos = RT_FL_TOS(oldflp);
2365
2366         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
2367                 return -EINVAL;
2368
2369         if (ipv4_is_lbcast(fl->fl4_dst))
2370                 res->type = RTN_BROADCAST;
2371         else if (ipv4_is_multicast(fl->fl4_dst))
2372                 res->type = RTN_MULTICAST;
2373         else if (ipv4_is_zeronet(fl->fl4_dst))
2374                 return -EINVAL;
2375
2376         if (dev_out->flags & IFF_LOOPBACK)
2377                 flags |= RTCF_LOCAL;
2378
2379         in_dev = __in_dev_get_rcu(dev_out);
2380         if (!in_dev)
2381                 return -EINVAL;
2382
2383         if (res->type == RTN_BROADCAST) {
2384                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2385                 res->fi = NULL;
2386         } else if (res->type == RTN_MULTICAST) {
2387                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2388                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2389                                  oldflp->proto))
2390                         flags &= ~RTCF_LOCAL;
2391                 /* If a multicast route does not exist, use the
2392                  * default one, but do not gateway in this case.
2393                  * Yes, it is a hack.
2394                  */
2395                 if (res->fi && res->prefixlen < 4)
2396                         res->fi = NULL;
2397         }
2398
2399
2400         rth = dst_alloc(&ipv4_dst_ops);
2401         if (!rth)
2402                 return -ENOBUFS;
2403
2404         in_dev_hold(in_dev);
2405         rth->idev = in_dev;
2406
2407         atomic_set(&rth->dst.__refcnt, 1);
2408         rth->dst.flags= DST_HOST;
2409         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2410                 rth->dst.flags |= DST_NOXFRM;
2411         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2412                 rth->dst.flags |= DST_NOPOLICY;
2413
2414         rth->fl.fl4_dst = oldflp->fl4_dst;
2415         rth->fl.fl4_tos = tos;
2416         rth->fl.fl4_src = oldflp->fl4_src;
2417         rth->fl.oif     = oldflp->oif;
2418         rth->fl.mark    = oldflp->mark;
2419         rth->rt_dst     = fl->fl4_dst;
2420         rth->rt_src     = fl->fl4_src;
2421         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2422         /* get references to the devices that are to be held by the routing
2423            cache entry */
2424         rth->dst.dev    = dev_out;
2425         dev_hold(dev_out);
2426         rth->rt_gateway = fl->fl4_dst;
2427         rth->rt_spec_dst= fl->fl4_src;
2428
2429         rth->dst.output=ip_output;
2430         rth->dst.obsolete = -1;
2431         rth->rt_genid = rt_genid(dev_net(dev_out));
2432
2433         RT_CACHE_STAT_INC(out_slow_tot);
2434
2435         if (flags & RTCF_LOCAL) {
2436                 rth->dst.input = ip_local_deliver;
2437                 rth->rt_spec_dst = fl->fl4_dst;
2438         }
2439         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2440                 rth->rt_spec_dst = fl->fl4_src;
2441                 if (flags & RTCF_LOCAL &&
2442                     !(dev_out->flags & IFF_LOOPBACK)) {
2443                         rth->dst.output = ip_mc_output;
2444                         RT_CACHE_STAT_INC(out_slow_mc);
2445                 }
2446 #ifdef CONFIG_IP_MROUTE
2447                 if (res->type == RTN_MULTICAST) {
2448                         if (IN_DEV_MFORWARD(in_dev) &&
2449                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2450                                 rth->dst.input = ip_mr_input;
2451                                 rth->dst.output = ip_mc_output;
2452                         }
2453                 }
2454 #endif
2455         }
2456
2457         rt_set_nexthop(rth, res, 0);
2458
2459         rth->rt_flags = flags;
2460         *result = rth;
2461         return 0;
2462 }
2463
2464 /* called with rcu_read_lock() */
2465 static int ip_mkroute_output(struct rtable **rp,
2466                              struct fib_result *res,
2467                              const struct flowi *fl,
2468                              const struct flowi *oldflp,
2469                              struct net_device *dev_out,
2470                              unsigned flags)
2471 {
2472         struct rtable *rth = NULL;
2473         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2474         unsigned hash;
2475         if (err == 0) {
2476                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2477                                rt_genid(dev_net(dev_out)));
2478                 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2479         }
2480
2481         return err;
2482 }
2483
2484 /*
2485  * Major route resolver routine.
2486  * called with rcu_read_lock();
2487  */
2488
2489 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2490                                 const struct flowi *oldflp)
2491 {
2492         u32 tos = RT_FL_TOS(oldflp);
2493         struct flowi fl = { .nl_u = { .ip4_u =
2494                                       { .daddr = oldflp->fl4_dst,
2495                                         .saddr = oldflp->fl4_src,
2496                                         .tos = tos & IPTOS_RT_MASK,
2497                                         .scope = ((tos & RTO_ONLINK) ?
2498                                                   RT_SCOPE_LINK :
2499                                                   RT_SCOPE_UNIVERSE),
2500                                       } },
2501                             .mark = oldflp->mark,
2502                             .iif = net->loopback_dev->ifindex,
2503                             .oif = oldflp->oif };
2504         struct fib_result res;
2505         unsigned int flags = 0;
2506         struct net_device *dev_out = NULL;
2507         int err;
2508
2509
2510         res.fi          = NULL;
2511 #ifdef CONFIG_IP_MULTIPLE_TABLES
2512         res.r           = NULL;
2513 #endif
2514
2515         if (oldflp->fl4_src) {
2516                 err = -EINVAL;
2517                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2518                     ipv4_is_lbcast(oldflp->fl4_src) ||
2519                     ipv4_is_zeronet(oldflp->fl4_src))
2520                         goto out;
2521
2522                 /* I removed check for oif == dev_out->oif here.
2523                    It was wrong for two reasons:
2524                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2525                       is assigned to multiple interfaces.
2526                    2. Moreover, we are allowed to send packets with saddr
2527                       of another iface. --ANK
2528                  */
2529
2530                 if (oldflp->oif == 0 &&
2531                     (ipv4_is_multicast(oldflp->fl4_dst) ||
2532                      ipv4_is_lbcast(oldflp->fl4_dst))) {
2533                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2534                         dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
2535                         if (dev_out == NULL)
2536                                 goto out;
2537
2538                         /* Special hack: the user can direct multicasts
2539                            and limited broadcast via the desired interface
2540                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2541                            This hack is not just for fun, it allows
2542                            vic, vat and friends to work.
2543                            They bind the socket to loopback, set ttl to zero
2544                            and expect that it will work.
2545                            From the viewpoint of the routing cache they are broken,
2546                            because we are not allowed to build a multicast path
2547                            with a loopback source addr (the routing cache
2548                            cannot know that ttl is zero, so the packet
2549                            will not leave this host and the route is valid).
2550                            Luckily, this hack is a good workaround.
2551                          */
2552
2553                         fl.oif = dev_out->ifindex;
2554                         goto make_route;
2555                 }
2556
2557                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2558                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2559                         if (!__ip_dev_find(net, oldflp->fl4_src, false))
2560                                 goto out;
2561                 }
2562         }
2563
2564
2565         if (oldflp->oif) {
2566                 dev_out = dev_get_by_index_rcu(net, oldflp->oif);
2567                 err = -ENODEV;
2568                 if (dev_out == NULL)
2569                         goto out;
2570
2571                 /* RACE: Check return value of inet_select_addr instead. */
2572                 if (rcu_dereference(dev_out->ip_ptr) == NULL)
2573                         goto out;       /* Wrong error code */
2574
2575                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2576                     ipv4_is_lbcast(oldflp->fl4_dst)) {
2577                         if (!fl.fl4_src)
2578                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2579                                                               RT_SCOPE_LINK);
2580                         goto make_route;
2581                 }
2582                 if (!fl.fl4_src) {
2583                         if (ipv4_is_multicast(oldflp->fl4_dst))
2584                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2585                                                               fl.fl4_scope);
2586                         else if (!oldflp->fl4_dst)
2587                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2588                                                               RT_SCOPE_HOST);
2589                 }
2590         }
2591
2592         if (!fl.fl4_dst) {
2593                 fl.fl4_dst = fl.fl4_src;
2594                 if (!fl.fl4_dst)
2595                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2596                 dev_out = net->loopback_dev;
2597                 fl.oif = net->loopback_dev->ifindex;
2598                 res.type = RTN_LOCAL;
2599                 flags |= RTCF_LOCAL;
2600                 goto make_route;
2601         }
2602
2603         if (fib_lookup(net, &fl, &res)) {
2604                 res.fi = NULL;
2605                 if (oldflp->oif) {
2606                         /* Apparently, the routing tables are wrong. Assume
2607                            that the destination is on-link.
2608
2609                            WHY? DW.
2610                            Because we are allowed to send to an iface
2611                            even if it has NO routes and NO assigned
2612                            addresses. When oif is specified, the routing
2613                            tables are looked up with only one purpose:
2614                            to check whether the destination is gatewayed,
2615                            rather than direct. Moreover, if MSG_DONTROUTE is set,
2616                            we send the packet, ignoring both routing tables
2617                            and ifaddr state. --ANK
2618
2619
2620                            We could do this even if oif is unknown
2621                            (as IPv6 likely does), but we do not.
2622                          */
2623
2624                         if (fl.fl4_src == 0)
2625                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2626                                                               RT_SCOPE_LINK);
2627                         res.type = RTN_UNICAST;
2628                         goto make_route;
2629                 }
2630                 err = -ENETUNREACH;
2631                 goto out;
2632         }
2633
2634         if (res.type == RTN_LOCAL) {
2635                 if (!fl.fl4_src)
2636                         fl.fl4_src = fl.fl4_dst;
2637                 dev_out = net->loopback_dev;
2638                 fl.oif = dev_out->ifindex;
2639                 res.fi = NULL;
2640                 flags |= RTCF_LOCAL;
2641                 goto make_route;
2642         }
2643
2644 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2645         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2646                 fib_select_multipath(&fl, &res);
2647         else
2648 #endif
2649         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2650                 fib_select_default(net, &fl, &res);
2651
2652         if (!fl.fl4_src)
2653                 fl.fl4_src = FIB_RES_PREFSRC(res);
2654
2655         dev_out = FIB_RES_DEV(res);
2656         fl.oif = dev_out->ifindex;
2657
2658
2659 make_route:
2660         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2661
2662 out:    return err;
2663 }
2664
2665 int __ip_route_output_key(struct net *net, struct rtable **rp,
2666                           const struct flowi *flp)
2667 {
2668         unsigned int hash;
2669         int res;
2670         struct rtable *rth;
2671
2672         if (!rt_caching(net))
2673                 goto slow_output;
2674
2675         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2676
2677         rcu_read_lock_bh();
2678         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2679                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2680                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2681                     rth->fl.fl4_src == flp->fl4_src &&
2682                     rth->fl.iif == 0 &&
2683                     rth->fl.oif == flp->oif &&
2684                     rth->fl.mark == flp->mark &&
2685                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2686                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2687                     net_eq(dev_net(rth->dst.dev), net) &&
2688                     !rt_is_expired(rth)) {
2689                         dst_use(&rth->dst, jiffies);
2690                         RT_CACHE_STAT_INC(out_hit);
2691                         rcu_read_unlock_bh();
2692                         *rp = rth;
2693                         return 0;
2694                 }
2695                 RT_CACHE_STAT_INC(out_hlist_search);
2696         }
2697         rcu_read_unlock_bh();
2698
2699 slow_output:
2700         rcu_read_lock();
2701         res = ip_route_output_slow(net, rp, flp);
2702         rcu_read_unlock();
2703         return res;
2704 }
2705 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2706
2707 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2708 {
2709         return NULL;
2710 }
2711
2712 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2713 {
2714 }
2715
2716 static struct dst_ops ipv4_dst_blackhole_ops = {
2717         .family                 =       AF_INET,
2718         .protocol               =       cpu_to_be16(ETH_P_IP),
2719         .destroy                =       ipv4_dst_destroy,
2720         .check                  =       ipv4_blackhole_dst_check,
2721         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2722 };
2723
2724
2725 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2726 {
2727         struct rtable *ort = *rp;
2728         struct rtable *rt = (struct rtable *)
2729                 dst_alloc(&ipv4_dst_blackhole_ops);
2730
2731         if (rt) {
2732                 struct dst_entry *new = &rt->dst;
2733
2734                 atomic_set(&new->__refcnt, 1);
2735                 new->__use = 1;
2736                 new->input = dst_discard;
2737                 new->output = dst_discard;
2738                 memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
2739
2740                 new->dev = ort->dst.dev;
2741                 if (new->dev)
2742                         dev_hold(new->dev);
2743
2744                 rt->fl = ort->fl;
2745
2746                 rt->idev = ort->idev;
2747                 if (rt->idev)
2748                         in_dev_hold(rt->idev);
2749                 rt->rt_genid = rt_genid(net);
2750                 rt->rt_flags = ort->rt_flags;
2751                 rt->rt_type = ort->rt_type;
2752                 rt->rt_dst = ort->rt_dst;
2753                 rt->rt_src = ort->rt_src;
2754                 rt->rt_iif = ort->rt_iif;
2755                 rt->rt_gateway = ort->rt_gateway;
2756                 rt->rt_spec_dst = ort->rt_spec_dst;
2757                 rt->peer = ort->peer;
2758                 if (rt->peer)
2759                         atomic_inc(&rt->peer->refcnt);
2760
2761                 dst_free(new);
2762         }
2763
2764         dst_release(&(*rp)->dst);
2765         *rp = rt;
2766         return rt ? 0 : -ENOMEM;
2767 }
2768
2769 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2770                          struct sock *sk, int flags)
2771 {
2772         int err;
2773
2774         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2775                 return err;
2776
2777         if (flp->proto) {
2778                 if (!flp->fl4_src)
2779                         flp->fl4_src = (*rp)->rt_src;
2780                 if (!flp->fl4_dst)
2781                         flp->fl4_dst = (*rp)->rt_dst;
2782                 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2783                                     flags ? XFRM_LOOKUP_WAIT : 0);
2784                 if (err == -EREMOTE)
2785                         err = ipv4_dst_blackhole(net, rp, flp);
2786
2787                 return err;
2788         }
2789
2790         return 0;
2791 }
2792 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2793
2794 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2795 {
2796         return ip_route_output_flow(net, rp, flp, NULL, 0);
2797 }
2798 EXPORT_SYMBOL(ip_route_output_key);
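
/*
 * Usage sketch (illustrative, not part of the original source): a caller
 * that only has addresses and an optional output interface can resolve a
 * route with an on-stack flowi, mirroring the initializers used earlier in
 * this file.  dst_ip, src_ip, tos, oif and net stand in for the caller's
 * own values:
 *
 *      struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst_ip,
 *                                               .saddr = src_ip,
 *                                               .tos   = RT_TOS(tos) } },
 *                          .oif = oif };
 *      struct rtable *rt;
 *      int err = ip_route_output_key(net, &rt, &fl);
 *
 *      if (err)
 *              return err;
 *      ...use rt->dst...
 *      ip_rt_put(rt);
 */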
2799
2800 static int rt_fill_info(struct net *net,
2801                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2802                         int nowait, unsigned int flags)
2803 {
2804         struct rtable *rt = skb_rtable(skb);
2805         struct rtmsg *r;
2806         struct nlmsghdr *nlh;
2807         long expires;
2808         u32 id = 0, ts = 0, tsage = 0, error;
2809
2810         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2811         if (nlh == NULL)
2812                 return -EMSGSIZE;
2813
2814         r = nlmsg_data(nlh);
2815         r->rtm_family    = AF_INET;
2816         r->rtm_dst_len  = 32;
2817         r->rtm_src_len  = 0;
2818         r->rtm_tos      = rt->fl.fl4_tos;
2819         r->rtm_table    = RT_TABLE_MAIN;
2820         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2821         r->rtm_type     = rt->rt_type;
2822         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2823         r->rtm_protocol = RTPROT_UNSPEC;
2824         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2825         if (rt->rt_flags & RTCF_NOTIFY)
2826                 r->rtm_flags |= RTM_F_NOTIFY;
2827
2828         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2829
2830         if (rt->fl.fl4_src) {
2831                 r->rtm_src_len = 32;
2832                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2833         }
2834         if (rt->dst.dev)
2835                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2836 #ifdef CONFIG_NET_CLS_ROUTE
2837         if (rt->dst.tclassid)
2838                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2839 #endif
2840         if (rt->fl.iif)
2841                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2842         else if (rt->rt_src != rt->fl.fl4_src)
2843                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2844
2845         if (rt->rt_dst != rt->rt_gateway)
2846                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2847
2848         if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
2849                 goto nla_put_failure;
2850
2851         if (rt->fl.mark)
2852                 NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
2853
2854         error = rt->dst.error;
2855         expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
2856         if (rt->peer) {
2857                 inet_peer_refcheck(rt->peer);
2858                 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2859                 if (rt->peer->tcp_ts_stamp) {
2860                         ts = rt->peer->tcp_ts;
2861                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2862                 }
2863         }
2864
2865         if (rt->fl.iif) {
2866 #ifdef CONFIG_IP_MROUTE
2867                 __be32 dst = rt->rt_dst;
2868
2869                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2870                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2871                         int err = ipmr_get_route(net, skb, r, nowait);
2872                         if (err <= 0) {
2873                                 if (!nowait) {
2874                                         if (err == 0)
2875                                                 return 0;
2876                                         goto nla_put_failure;
2877                                 } else {
2878                                         if (err == -EMSGSIZE)
2879                                                 goto nla_put_failure;
2880                                         error = err;
2881                                 }
2882                         }
2883                 } else
2884 #endif
2885                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2886         }
2887
2888         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2889                                expires, error) < 0)
2890                 goto nla_put_failure;
2891
2892         return nlmsg_end(skb, nlh);
2893
2894 nla_put_failure:
2895         nlmsg_cancel(skb, nlh);
2896         return -EMSGSIZE;
2897 }
2898
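/*
 * Editor's note on the NLA_PUT_*() macros used by rt_fill_info() above:
 * in this kernel generation they come from include/net/netlink.h and jump
 * to the local nla_put_failure label when the attribute does not fit,
 * roughly along the lines of (sketch, not the exact definition):
 *
 *	#define NLA_PUT_U32(skb, attrtype, value)			\
 *		do {							\
 *			if (nla_put_u32(skb, attrtype, value) < 0)	\
 *				goto nla_put_failure;			\
 *		} while (0)
 *
 * which is why the function unwinds with nlmsg_cancel() under that label.
 */
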
2899 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2900 {
2901         struct net *net = sock_net(in_skb->sk);
2902         struct rtmsg *rtm;
2903         struct nlattr *tb[RTA_MAX+1];
2904         struct rtable *rt = NULL;
2905         __be32 dst = 0;
2906         __be32 src = 0;
2907         u32 iif;
2908         int err;
2909         int mark;
2910         struct sk_buff *skb;
2911
2912         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2913         if (err < 0)
2914                 goto errout;
2915
2916         rtm = nlmsg_data(nlh);
2917
2918         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2919         if (skb == NULL) {
2920                 err = -ENOBUFS;
2921                 goto errout;
2922         }
2923
2924         /* Reserve room for dummy headers; this skb can pass
2925            through a good chunk of the routing engine.
2926          */
2927         skb_reset_mac_header(skb);
2928         skb_reset_network_header(skb);
2929
2930         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2931         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2932         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2933
2934         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2935         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2936         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2937         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2938
2939         if (iif) {
2940                 struct net_device *dev;
2941
2942                 dev = __dev_get_by_index(net, iif);
2943                 if (dev == NULL) {
2944                         err = -ENODEV;
2945                         goto errout_free;
2946                 }
2947
2948                 skb->protocol   = htons(ETH_P_IP);
2949                 skb->dev        = dev;
2950                 skb->mark       = mark;
2951                 local_bh_disable();
2952                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2953                 local_bh_enable();
2954
2955                 rt = skb_rtable(skb);
2956                 if (err == 0 && rt->dst.error)
2957                         err = -rt->dst.error;
2958         } else {
2959                 struct flowi fl = {
2960                         .nl_u = {
2961                                 .ip4_u = {
2962                                         .daddr = dst,
2963                                         .saddr = src,
2964                                         .tos = rtm->rtm_tos,
2965                                 },
2966                         },
2967                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2968                         .mark = mark,
2969                 };
2970                 err = ip_route_output_key(net, &rt, &fl);
2971         }
2972
2973         if (err)
2974                 goto errout_free;
2975
2976         skb_dst_set(skb, &rt->dst);
2977         if (rtm->rtm_flags & RTM_F_NOTIFY)
2978                 rt->rt_flags |= RTCF_NOTIFY;
2979
2980         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2981                            RTM_NEWROUTE, 0, 0);
2982         if (err <= 0)
2983                 goto errout_free;
2984
2985         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2986 errout:
2987         return err;
2988
2989 errout_free:
2990         kfree_skb(skb);
2991         goto errout;
2992 }
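
/*
 * Editor's note (illustrative): inet_rtm_getroute() is what services
 * "ip route get <addr>" from iproute2.  Userspace sends an RTM_GETROUTE
 * request carrying RTA_DST (and optionally RTA_SRC, RTA_IIF, RTA_OIF and
 * RTA_MARK); the kernel resolves it through ip_route_input() or
 * ip_route_output_key() and answers with a single RTM_NEWROUTE message
 * built by rt_fill_info().
 */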
2993
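/*
 * Dump the routing cache to netlink.  Walks every rt_hash_table chain
 * under rcu_read_lock_bh() and uses cb->args[0]/cb->args[1] (hash bucket
 * and index within the chain) to resume where the previous dump callback
 * left off.
 */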
2994 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2995 {
2996         struct rtable *rt;
2997         int h, s_h;
2998         int idx, s_idx;
2999         struct net *net;
3000
3001         net = sock_net(skb->sk);
3002
3003         s_h = cb->args[0];
3004         if (s_h < 0)
3005                 s_h = 0;
3006         s_idx = idx = cb->args[1];
3007         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3008                 if (!rt_hash_table[h].chain)
3009                         continue;
3010                 rcu_read_lock_bh();
3011                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3012                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3013                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3014                                 continue;
3015                         if (rt_is_expired(rt))
3016                                 continue;
3017                         skb_dst_set_noref(skb, &rt->dst);
3018                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3019                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3020                                          1, NLM_F_MULTI) <= 0) {
3021                                 skb_dst_drop(skb);
3022                                 rcu_read_unlock_bh();
3023                                 goto done;
3024                         }
3025                         skb_dst_drop(skb);
3026                 }
3027                 rcu_read_unlock_bh();
3028         }
3029
3030 done:
3031         cb->args[0] = h;
3032         cb->args[1] = idx;
3033         return skb->len;
3034 }
3035
3036 void ip_rt_multicast_event(struct in_device *in_dev)
3037 {
3038         rt_cache_flush(dev_net(in_dev->dev), 0);
3039 }
3040
3041 #ifdef CONFIG_SYSCTL
3042 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3043                                         void __user *buffer,
3044                                         size_t *lenp, loff_t *ppos)
3045 {
3046         if (write) {
3047                 int flush_delay;
3048                 ctl_table ctl;
3049                 struct net *net;
3050
3051                 memcpy(&ctl, __ctl, sizeof(ctl));
3052                 ctl.data = &flush_delay;
3053                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3054
3055                 net = (struct net *)__ctl->extra1;
3056                 rt_cache_flush(net, flush_delay);
3057                 return 0;
3058         }
3059
3060         return -EINVAL;
3061 }
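
/*
 * Editor's note: together with ipv4_route_path and ipv4_route_flush_table
 * below, this handler backs /proc/sys/net/ipv4/route/flush.  Writing an
 * integer there (interpreted as the flush delay handed to rt_cache_flush())
 * drops the routing cache, e.g.:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * Reads fail with -EINVAL because the file is write-only (mode 0200).
 */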
3062
3063 static ctl_table ipv4_route_table[] = {
3064         {
3065                 .procname       = "gc_thresh",
3066                 .data           = &ipv4_dst_ops.gc_thresh,
3067                 .maxlen         = sizeof(int),
3068                 .mode           = 0644,
3069                 .proc_handler   = proc_dointvec,
3070         },
3071         {
3072                 .procname       = "max_size",
3073                 .data           = &ip_rt_max_size,
3074                 .maxlen         = sizeof(int),
3075                 .mode           = 0644,
3076                 .proc_handler   = proc_dointvec,
3077         },
3078         {
3079                 /*  Deprecated. Use gc_min_interval_ms */
3080
3081                 .procname       = "gc_min_interval",
3082                 .data           = &ip_rt_gc_min_interval,
3083                 .maxlen         = sizeof(int),
3084                 .mode           = 0644,
3085                 .proc_handler   = proc_dointvec_jiffies,
3086         },
3087         {
3088                 .procname       = "gc_min_interval_ms",
3089                 .data           = &ip_rt_gc_min_interval,
3090                 .maxlen         = sizeof(int),
3091                 .mode           = 0644,
3092                 .proc_handler   = proc_dointvec_ms_jiffies,
3093         },
3094         {
3095                 .procname       = "gc_timeout",
3096                 .data           = &ip_rt_gc_timeout,
3097                 .maxlen         = sizeof(int),
3098                 .mode           = 0644,
3099                 .proc_handler   = proc_dointvec_jiffies,
3100         },
3101         {
3102                 .procname       = "gc_interval",
3103                 .data           = &ip_rt_gc_interval,
3104                 .maxlen         = sizeof(int),
3105                 .mode           = 0644,
3106                 .proc_handler   = proc_dointvec_jiffies,
3107         },
3108         {
3109                 .procname       = "redirect_load",
3110                 .data           = &ip_rt_redirect_load,
3111                 .maxlen         = sizeof(int),
3112                 .mode           = 0644,
3113                 .proc_handler   = proc_dointvec,
3114         },
3115         {
3116                 .procname       = "redirect_number",
3117                 .data           = &ip_rt_redirect_number,
3118                 .maxlen         = sizeof(int),
3119                 .mode           = 0644,
3120                 .proc_handler   = proc_dointvec,
3121         },
3122         {
3123                 .procname       = "redirect_silence",
3124                 .data           = &ip_rt_redirect_silence,
3125                 .maxlen         = sizeof(int),
3126                 .mode           = 0644,
3127                 .proc_handler   = proc_dointvec,
3128         },
3129         {
3130                 .procname       = "error_cost",
3131                 .data           = &ip_rt_error_cost,
3132                 .maxlen         = sizeof(int),
3133                 .mode           = 0644,
3134                 .proc_handler   = proc_dointvec,
3135         },
3136         {
3137                 .procname       = "error_burst",
3138                 .data           = &ip_rt_error_burst,
3139                 .maxlen         = sizeof(int),
3140                 .mode           = 0644,
3141                 .proc_handler   = proc_dointvec,
3142         },
3143         {
3144                 .procname       = "gc_elasticity",
3145                 .data           = &ip_rt_gc_elasticity,
3146                 .maxlen         = sizeof(int),
3147                 .mode           = 0644,
3148                 .proc_handler   = proc_dointvec,
3149         },
3150         {
3151                 .procname       = "mtu_expires",
3152                 .data           = &ip_rt_mtu_expires,
3153                 .maxlen         = sizeof(int),
3154                 .mode           = 0644,
3155                 .proc_handler   = proc_dointvec_jiffies,
3156         },
3157         {
3158                 .procname       = "min_pmtu",
3159                 .data           = &ip_rt_min_pmtu,
3160                 .maxlen         = sizeof(int),
3161                 .mode           = 0644,
3162                 .proc_handler   = proc_dointvec,
3163         },
3164         {
3165                 .procname       = "min_adv_mss",
3166                 .data           = &ip_rt_min_advmss,
3167                 .maxlen         = sizeof(int),
3168                 .mode           = 0644,
3169                 .proc_handler   = proc_dointvec,
3170         },
3171         { }
3172 };
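
/*
 * Editor's note: the table above is exposed as /proc/sys/net/ipv4/route/
 * (gc_thresh, max_size, ...); it is hooked up through ipv4_skeleton and
 * ipv4_path in ip_static_sysctl_init() at the bottom of this file.
 */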
3173
3174 static struct ctl_table empty[1];
3175
3176 static struct ctl_table ipv4_skeleton[] =
3177 {
3178         { .procname = "route",
3179           .mode = 0555, .child = ipv4_route_table },
3180         { .procname = "neigh",
3181           .mode = 0555, .child = empty },
3182         { }
3183 };
3184
3185 static __net_initdata struct ctl_path ipv4_path[] = {
3186         { .procname = "net", },
3187         { .procname = "ipv4", },
3188         { },
3189 };
3190
3191 static struct ctl_table ipv4_route_flush_table[] = {
3192         {
3193                 .procname       = "flush",
3194                 .maxlen         = sizeof(int),
3195                 .mode           = 0200,
3196                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3197         },
3198         { },
3199 };
3200
3201 static __net_initdata struct ctl_path ipv4_route_path[] = {
3202         { .procname = "net", },
3203         { .procname = "ipv4", },
3204         { .procname = "route", },
3205         { },
3206 };
3207
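/*
 * Editor's note: for every namespace other than init_net the per-netns
 * init below duplicates ipv4_route_flush_table with kmemdup() so that the
 * copy's .extra1 can point at that namespace's struct net; the exit path
 * frees only duplicated tables (the BUG_ON guards the static one).
 */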
3208 static __net_init int sysctl_route_net_init(struct net *net)
3209 {
3210         struct ctl_table *tbl;
3211
3212         tbl = ipv4_route_flush_table;
3213         if (!net_eq(net, &init_net)) {
3214                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3215                 if (tbl == NULL)
3216                         goto err_dup;
3217         }
3218         tbl[0].extra1 = net;
3219
3220         net->ipv4.route_hdr =
3221                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3222         if (net->ipv4.route_hdr == NULL)
3223                 goto err_reg;
3224         return 0;
3225
3226 err_reg:
3227         if (tbl != ipv4_route_flush_table)
3228                 kfree(tbl);
3229 err_dup:
3230         return -ENOMEM;
3231 }
3232
3233 static __net_exit void sysctl_route_net_exit(struct net *net)
3234 {
3235         struct ctl_table *tbl;
3236
3237         tbl = net->ipv4.route_hdr->ctl_table_arg;
3238         unregister_net_sysctl_table(net->ipv4.route_hdr);
3239         BUG_ON(tbl == ipv4_route_flush_table);
3240         kfree(tbl);
3241 }
3242
3243 static __net_initdata struct pernet_operations sysctl_route_ops = {
3244         .init = sysctl_route_net_init,
3245         .exit = sysctl_route_net_exit,
3246 };
3247 #endif
3248
3249 static __net_init int rt_genid_init(struct net *net)
3250 {
3251         get_random_bytes(&net->ipv4.rt_genid,
3252                          sizeof(net->ipv4.rt_genid));
3253         return 0;
3254 }
3255
3256 static __net_initdata struct pernet_operations rt_genid_ops = {
3257         .init = rt_genid_init,
3258 };
3259
3260
3261 #ifdef CONFIG_NET_CLS_ROUTE
3262 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3263 #endif /* CONFIG_NET_CLS_ROUTE */
3264
3265 static __initdata unsigned long rhash_entries;
3266 static int __init set_rhash_entries(char *str)
3267 {
3268         if (!str)
3269                 return 0;
3270         rhash_entries = simple_strtoul(str, &str, 0);
3271         return 1;
3272 }
3273 __setup("rhash_entries=", set_rhash_entries);
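
/*
 * Editor's note: "rhash_entries=" is a kernel boot parameter; for example
 * booting with rhash_entries=262144 forces the number of route cache hash
 * buckets, otherwise alloc_large_system_hash() in ip_rt_init() sizes the
 * table from the amount of RAM.
 */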
3274
3275 int __init ip_rt_init(void)
3276 {
3277         int rc = 0;
3278
3279 #ifdef CONFIG_NET_CLS_ROUTE
3280         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3281         if (!ip_rt_acct)
3282                 panic("IP: failed to allocate ip_rt_acct\n");
3283 #endif
3284
3285         ipv4_dst_ops.kmem_cachep =
3286                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3287                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3288
3289         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3290
3291         if (dst_entries_init(&ipv4_dst_ops) < 0)
3292                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3293
3294         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3295                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3296
3297         rt_hash_table = (struct rt_hash_bucket *)
3298                 alloc_large_system_hash("IP route cache",
3299                                         sizeof(struct rt_hash_bucket),
3300                                         rhash_entries,
3301                                         (totalram_pages >= 128 * 1024) ?
3302                                         15 : 17,
3303                                         0,
3304                                         &rt_hash_log,
3305                                         &rt_hash_mask,
3306                                         rhash_entries ? 0 : 512 * 1024);
3307         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3308         rt_hash_lock_init();
3309
3310         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3311         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3312
3313         devinet_init();
3314         ip_fib_init();
3315
3316         /* All the timers started at system startup tend to
3317            synchronize.  Perturb the first run with a random
3318            delay in [ip_rt_gc_interval, 2 * ip_rt_gc_interval). */
3319         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3320         expires_ljiffies = jiffies;
3321         schedule_delayed_work(&expires_work,
3322                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3323
3324         if (ip_rt_proc_init())
3325                 printk(KERN_ERR "Unable to create route proc files\n");
3326 #ifdef CONFIG_XFRM
3327         xfrm_init();
3328         xfrm4_init(ip_rt_max_size);
3329 #endif
3330         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3331
3332 #ifdef CONFIG_SYSCTL
3333         register_pernet_subsys(&sysctl_route_ops);
3334 #endif
3335         register_pernet_subsys(&rt_genid_ops);
3336         return rc;
3337 }
3338
3339 #ifdef CONFIG_SYSCTL
3340 /*
3341  * We really need to sanitize the damn ipv4 init order; then all
3342  * this nonsense will go away.
3343  */
3344 void __init ip_static_sysctl_init(void)
3345 {
3346         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3347 }
3348 #endif