net-next-2.6.git: net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD
37  *                                      though our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112
113 #define RT_FL_TOS(oldflp) \
114     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116 #define IP_MAX_MTU      0xFFF0
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval            = 60 * HZ;
123 static int ip_rt_gc_min_interval        = HZ / 2;
124 static int ip_rt_redirect_number        = 9;
125 static int ip_rt_redirect_load          = HZ / 50;
126 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost             = HZ;
128 static int ip_rt_error_burst            = 5 * HZ;
129 static int ip_rt_gc_elasticity          = 8;
130 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu               = 512 + 20 + 20;
132 static int ip_rt_min_advmss             = 256;
133 static int ip_rt_secret_interval        = 10 * 60 * HZ;
134
135 #define RTprint(a...)   printk(KERN_DEBUG a)
136
137 static void rt_worker_func(struct work_struct *work);
138 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
139 static struct timer_list rt_secret_timer;
140
141 /*
142  *      Interface to generic destination cache.
143  */
144
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static void              ipv4_dst_destroy(struct dst_entry *dst);
147 static void              ipv4_dst_ifdown(struct dst_entry *dst,
148                                          struct net_device *dev, int how);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void              ipv4_link_failure(struct sk_buff *skb);
151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(struct dst_ops *ops);
153
154
155 static struct dst_ops ipv4_dst_ops = {
156         .family =               AF_INET,
157         .protocol =             __constant_htons(ETH_P_IP),
158         .gc =                   rt_garbage_collect,
159         .check =                ipv4_dst_check,
160         .destroy =              ipv4_dst_destroy,
161         .ifdown =               ipv4_dst_ifdown,
162         .negative_advice =      ipv4_negative_advice,
163         .link_failure =         ipv4_link_failure,
164         .update_pmtu =          ip_rt_update_pmtu,
165         .local_out =            ip_local_out,
166         .entry_size =           sizeof(struct rtable),
167         .entries =              ATOMIC_INIT(0),
168 };
169
170 #define ECN_OR_COST(class)      TC_PRIO_##class
171
172 const __u8 ip_tos2prio[16] = {
173         TC_PRIO_BESTEFFORT,
174         ECN_OR_COST(FILLER),
175         TC_PRIO_BESTEFFORT,
176         ECN_OR_COST(BESTEFFORT),
177         TC_PRIO_BULK,
178         ECN_OR_COST(BULK),
179         TC_PRIO_BULK,
180         ECN_OR_COST(BULK),
181         TC_PRIO_INTERACTIVE,
182         ECN_OR_COST(INTERACTIVE),
183         TC_PRIO_INTERACTIVE,
184         ECN_OR_COST(INTERACTIVE),
185         TC_PRIO_INTERACTIVE_BULK,
186         ECN_OR_COST(INTERACTIVE_BULK),
187         TC_PRIO_INTERACTIVE_BULK,
188         ECN_OR_COST(INTERACTIVE_BULK)
189 };
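/*
 * The sixteen entries above are indexed by the TOS nibble: rt_tos2priority()
 * in <net/route.h> does ip_tos2prio[IPTOS_TOS(tos) >> 1] to map a TOS value
 * onto a packet scheduler priority band.
 */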
190
191
192 /*
193  * Route cache.
194  */
195
196 /* The locking scheme is rather straightforward:
197  *
198  * 1) Read-Copy Update protects the buckets of the central route hash.
199  * 2) Only writers remove entries, and they hold the lock
200  *    as they look at rtable reference counts.
201  * 3) Only readers acquire references to rtable entries,
202  *    they do so with atomic increments and with the
203  *    lock held.
204  */
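/*
 * Concretely, in this file: readers such as rt_cache_get_first() walk a
 * chain under rcu_read_lock_bh()/rcu_dereference(), while writers such as
 * rt_intern_hash() and rt_del() take the per-bucket spinlock returned by
 * rt_hash_lock_addr() before unlinking entries.
 */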
205
206 struct rt_hash_bucket {
207         struct rtable   *chain;
208 };
209 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
210         defined(CONFIG_PROVE_LOCKING)
211 /*
212  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
213  * The size of this table is a power of two and depends on the number of CPUs.
214  * (on lockdep we have quite a big spinlock_t, so keep the size down there)
215  */
216 #ifdef CONFIG_LOCKDEP
217 # define RT_HASH_LOCK_SZ        256
218 #else
219 # if NR_CPUS >= 32
220 #  define RT_HASH_LOCK_SZ       4096
221 # elif NR_CPUS >= 16
222 #  define RT_HASH_LOCK_SZ       2048
223 # elif NR_CPUS >= 8
224 #  define RT_HASH_LOCK_SZ       1024
225 # elif NR_CPUS >= 4
226 #  define RT_HASH_LOCK_SZ       512
227 # else
228 #  define RT_HASH_LOCK_SZ       256
229 # endif
230 #endif
231
232 static spinlock_t       *rt_hash_locks;
233 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
234
235 static __init void rt_hash_lock_init(void)
236 {
237         int i;
238
239         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
240                         GFP_KERNEL);
241         if (!rt_hash_locks)
242                 panic("IP: failed to allocate rt_hash_locks\n");
243
244         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
245                 spin_lock_init(&rt_hash_locks[i]);
246 }
247 #else
248 # define rt_hash_lock_addr(slot) NULL
249
250 static inline void rt_hash_lock_init(void)
251 {
252 }
253 #endif
254
255 static struct rt_hash_bucket    *rt_hash_table;
256 static unsigned                 rt_hash_mask;
257 static unsigned int             rt_hash_log;
258 static atomic_t                 rt_genid;
259
260 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
261 #define RT_CACHE_STAT_INC(field) \
262         (__raw_get_cpu_var(rt_cache_stat).field++)
263
264 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
265 {
266         return jhash_2words(daddr, saddr, atomic_read(&rt_genid))
267                 & rt_hash_mask;
268 }
269
270 #define rt_hash(daddr, saddr, idx) \
271         rt_hash_code((__force u32)(__be32)(daddr),\
272                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
273
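/*
 * Minimal usage sketch (hypothetical helper, not part of the original
 * file): callers later in this file, e.g. ip_rt_redirect() and
 * ip_rt_frag_needed(), compute a bucket index exactly like this, mixing
 * destination, source and interface index with the rt_genid salt.
 */
static inline unsigned rt_hash_example(__be32 daddr, __be32 saddr, int oif)
{
        return rt_hash(daddr, saddr, oif);
}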
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state {
276         int bucket;
277         int genid;
278 };
279
280 static struct rtable *rt_cache_get_first(struct rt_cache_iter_state *st)
281 {
282         struct rtable *r = NULL;
283
284         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
285                 rcu_read_lock_bh();
286                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
287                 while (r) {
288                         if (r->rt_genid == st->genid)
289                                 return r;
290                         r = rcu_dereference(r->u.dst.rt_next);
291                 }
292                 rcu_read_unlock_bh();
293         }
294         return r;
295 }
296
297 static struct rtable *rt_cache_get_next(struct rt_cache_iter_state *st, struct rtable *r)
298 {
299         r = r->u.dst.rt_next;
300         while (!r) {
301                 rcu_read_unlock_bh();
302                 if (--st->bucket < 0)
303                         break;
304                 rcu_read_lock_bh();
305                 r = rt_hash_table[st->bucket].chain;
306         }
307         return rcu_dereference(r);
308 }
309
310 static struct rtable *rt_cache_get_idx(struct rt_cache_iter_state *st, loff_t pos)
311 {
312         struct rtable *r = rt_cache_get_first(st);
313
314         if (r)
315                 while (pos && (r = rt_cache_get_next(st, r))) {
316                         if (r->rt_genid != st->genid)
317                                 continue;
318                         --pos;
319                 }
320         return pos ? NULL : r;
321 }
322
323 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
324 {
325         struct rt_cache_iter_state *st = seq->private;
326
327         if (*pos)
328                 return rt_cache_get_idx(st, *pos - 1);
329         st->genid = atomic_read(&rt_genid);
330         return SEQ_START_TOKEN;
331 }
332
333 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
334 {
335         struct rtable *r;
336         struct rt_cache_iter_state *st = seq->private;
337
338         if (v == SEQ_START_TOKEN)
339                 r = rt_cache_get_first(st);
340         else
341                 r = rt_cache_get_next(st, v);
342         ++*pos;
343         return r;
344 }
345
346 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
347 {
348         if (v && v != SEQ_START_TOKEN)
349                 rcu_read_unlock_bh();
350 }
351
352 static int rt_cache_seq_show(struct seq_file *seq, void *v)
353 {
354         if (v == SEQ_START_TOKEN)
355                 seq_printf(seq, "%-127s\n",
356                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
357                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
358                            "HHUptod\tSpecDst");
359         else {
360                 struct rtable *r = v;
361                 char temp[256];
362
363                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
364                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
365                         r->u.dst.dev ? r->u.dst.dev->name : "*",
366                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
367                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
368                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
369                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
370                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
371                         dst_metric(&r->u.dst, RTAX_WINDOW),
372                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
373                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
374                         r->fl.fl4_tos,
375                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
376                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
377                                        dev_queue_xmit) : 0,
378                         r->rt_spec_dst);
379                 seq_printf(seq, "%-127s\n", temp);
380         }
381         return 0;
382 }
383
384 static const struct seq_operations rt_cache_seq_ops = {
385         .start  = rt_cache_seq_start,
386         .next   = rt_cache_seq_next,
387         .stop   = rt_cache_seq_stop,
388         .show   = rt_cache_seq_show,
389 };
390
391 static int rt_cache_seq_open(struct inode *inode, struct file *file)
392 {
393         return seq_open_private(file, &rt_cache_seq_ops,
394                         sizeof(struct rt_cache_iter_state));
395 }
396
397 static const struct file_operations rt_cache_seq_fops = {
398         .owner   = THIS_MODULE,
399         .open    = rt_cache_seq_open,
400         .read    = seq_read,
401         .llseek  = seq_lseek,
402         .release = seq_release_private,
403 };
404
405
406 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
407 {
408         int cpu;
409
410         if (*pos == 0)
411                 return SEQ_START_TOKEN;
412
413         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
414                 if (!cpu_possible(cpu))
415                         continue;
416                 *pos = cpu+1;
417                 return &per_cpu(rt_cache_stat, cpu);
418         }
419         return NULL;
420 }
421
422 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
423 {
424         int cpu;
425
426         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
427                 if (!cpu_possible(cpu))
428                         continue;
429                 *pos = cpu+1;
430                 return &per_cpu(rt_cache_stat, cpu);
431         }
432         return NULL;
433
434 }
435
436 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
437 {
438
439 }
440
441 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
442 {
443         struct rt_cache_stat *st = v;
444
445         if (v == SEQ_START_TOKEN) {
446                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
447                 return 0;
448         }
449
450         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
451                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
452                    atomic_read(&ipv4_dst_ops.entries),
453                    st->in_hit,
454                    st->in_slow_tot,
455                    st->in_slow_mc,
456                    st->in_no_route,
457                    st->in_brd,
458                    st->in_martian_dst,
459                    st->in_martian_src,
460
461                    st->out_hit,
462                    st->out_slow_tot,
463                    st->out_slow_mc,
464
465                    st->gc_total,
466                    st->gc_ignored,
467                    st->gc_goal_miss,
468                    st->gc_dst_overflow,
469                    st->in_hlist_search,
470                    st->out_hlist_search
471                 );
472         return 0;
473 }
474
475 static const struct seq_operations rt_cpu_seq_ops = {
476         .start  = rt_cpu_seq_start,
477         .next   = rt_cpu_seq_next,
478         .stop   = rt_cpu_seq_stop,
479         .show   = rt_cpu_seq_show,
480 };
481
482
483 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
484 {
485         return seq_open(file, &rt_cpu_seq_ops);
486 }
487
488 static const struct file_operations rt_cpu_seq_fops = {
489         .owner   = THIS_MODULE,
490         .open    = rt_cpu_seq_open,
491         .read    = seq_read,
492         .llseek  = seq_lseek,
493         .release = seq_release,
494 };
495
496 #ifdef CONFIG_NET_CLS_ROUTE
497 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
498                            int length, int *eof, void *data)
499 {
500         unsigned int i;
501
502         if ((offset & 3) || (length & 3))
503                 return -EIO;
504
505         if (offset >= sizeof(struct ip_rt_acct) * 256) {
506                 *eof = 1;
507                 return 0;
508         }
509
510         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
511                 length = sizeof(struct ip_rt_acct) * 256 - offset;
512                 *eof = 1;
513         }
514
515         offset /= sizeof(u32);
516
517         if (length > 0) {
518                 u32 *dst = (u32 *) buffer;
519
520                 *start = buffer;
521                 memset(dst, 0, length);
522
523                 for_each_possible_cpu(i) {
524                         unsigned int j;
525                         u32 *src;
526
527                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
528                         for (j = 0; j < length/4; j++)
529                                 dst[j] += src[j];
530                 }
531         }
532         return length;
533 }
534 #endif
535
536 static __init int ip_rt_proc_init(struct net *net)
537 {
538         struct proc_dir_entry *pde;
539
540         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
541                         &rt_cache_seq_fops);
542         if (!pde)
543                 goto err1;
544
545         pde = proc_create("rt_cache", S_IRUGO,
546                           net->proc_net_stat, &rt_cpu_seq_fops);
547         if (!pde)
548                 goto err2;
549
550 #ifdef CONFIG_NET_CLS_ROUTE
551         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
552                         ip_rt_acct_read, NULL);
553         if (!pde)
554                 goto err3;
555 #endif
556         return 0;
557
558 #ifdef CONFIG_NET_CLS_ROUTE
559 err3:
560         remove_proc_entry("rt_cache", net->proc_net_stat);
561 #endif
562 err2:
563         remove_proc_entry("rt_cache", net->proc_net);
564 err1:
565         return -ENOMEM;
566 }
567 #else
568 static inline int ip_rt_proc_init(struct net *net)
569 {
570         return 0;
571 }
572 #endif /* CONFIG_PROC_FS */
573
574 static __inline__ void rt_free(struct rtable *rt)
575 {
576         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
577 }
578
579 static __inline__ void rt_drop(struct rtable *rt)
580 {
581         ip_rt_put(rt);
582         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
583 }
584
585 static __inline__ int rt_fast_clean(struct rtable *rth)
586 {
587         /* Kill broadcast/multicast entries very aggressively, if they
588            collide in the hash table with more useful entries */
589         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
590                 rth->fl.iif && rth->u.dst.rt_next;
591 }
592
593 static __inline__ int rt_valuable(struct rtable *rth)
594 {
595         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
596                 rth->u.dst.expires;
597 }
598
599 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
600 {
601         unsigned long age;
602         int ret = 0;
603
604         if (atomic_read(&rth->u.dst.__refcnt))
605                 goto out;
606
607         ret = 1;
608         if (rth->u.dst.expires &&
609             time_after_eq(jiffies, rth->u.dst.expires))
610                 goto out;
611
612         age = jiffies - rth->u.dst.lastuse;
613         ret = 0;
614         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
615             (age <= tmo2 && rt_valuable(rth)))
616                 goto out;
617         ret = 1;
618 out:    return ret;
619 }
620
621 /* Bits of score are:
622  * 31: very valuable
623  * 30: not quite useless
624  * 29..0: usage counter
625  */
626 static inline u32 rt_score(struct rtable *rt)
627 {
628         u32 score = jiffies - rt->u.dst.lastuse;
629
630         score = ~score & ~(3<<30);
631
632         if (rt_valuable(rt))
633                 score |= (1<<31);
634
635         if (!rt->fl.iif ||
636             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
637                 score |= (1<<30);
638
639         return score;
640 }
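/*
 * Net effect of rt_score() on eviction: entries with neither the
 * "valuable" bit (31) nor the "output/unicast" bit (30) sort lowest, and
 * among equals the entry idle longest scores lowest, so a stale
 * broadcast/multicast input route is the first candidate rt_intern_hash()
 * drops from an overlong chain.
 */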
641
642 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
643 {
644         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
645                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
646                 (fl1->mark ^ fl2->mark) |
647                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
648                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
649                 (fl1->oif ^ fl2->oif) |
650                 (fl1->iif ^ fl2->iif)) == 0;
651 }
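/*
 * Note: the u16 load starting at ->tos deliberately folds the byte that
 * follows it in struct flowi (the scope field) into the same comparison,
 * so two keys match only if daddr, saddr, mark, tos, scope, oif and iif
 * are all identical.
 */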
652
653 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
654 {
655         return rt1->u.dst.dev->nd_net == rt2->u.dst.dev->nd_net;
656 }
657
658 /*
659  * Perform a full scan of the hash table and free all entries.
660  * Can be called by a softirq or a process.
661  * In the latter case, we want to reschedule if necessary.
662  */
663 static void rt_do_flush(int process_context)
664 {
665         unsigned int i;
666         struct rtable *rth, *next;
667
668         for (i = 0; i <= rt_hash_mask; i++) {
669                 if (process_context && need_resched())
670                         cond_resched();
671                 rth = rt_hash_table[i].chain;
672                 if (!rth)
673                         continue;
674
675                 spin_lock_bh(rt_hash_lock_addr(i));
676                 rth = rt_hash_table[i].chain;
677                 rt_hash_table[i].chain = NULL;
678                 spin_unlock_bh(rt_hash_lock_addr(i));
679
680                 for (; rth; rth = next) {
681                         next = rth->u.dst.rt_next;
682                         rt_free(rth);
683                 }
684         }
685 }
686
687 static void rt_check_expire(void)
688 {
689         static unsigned int rover;
690         unsigned int i = rover, goal;
691         struct rtable *rth, **rthp;
692         u64 mult;
693
694         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
695         if (ip_rt_gc_timeout > 1)
696                 do_div(mult, ip_rt_gc_timeout);
697         goal = (unsigned int)mult;
698         if (goal > rt_hash_mask)
699                 goal = rt_hash_mask + 1;
700         for (; goal > 0; goal--) {
701                 unsigned long tmo = ip_rt_gc_timeout;
702
703                 i = (i + 1) & rt_hash_mask;
704                 rthp = &rt_hash_table[i].chain;
705
706                 if (need_resched())
707                         cond_resched();
708
709                 if (*rthp == NULL)
710                         continue;
711                 spin_lock_bh(rt_hash_lock_addr(i));
712                 while ((rth = *rthp) != NULL) {
713                         if (rth->rt_genid != atomic_read(&rt_genid)) {
714                                 *rthp = rth->u.dst.rt_next;
715                                 rt_free(rth);
716                                 continue;
717                         }
718                         if (rth->u.dst.expires) {
719                                 /* Entry is expired even if it is in use */
720                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
721                                         tmo >>= 1;
722                                         rthp = &rth->u.dst.rt_next;
723                                         continue;
724                                 }
725                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
726                                 tmo >>= 1;
727                                 rthp = &rth->u.dst.rt_next;
728                                 continue;
729                         }
730
731                         /* Clean up aged-off entries. */
732                         *rthp = rth->u.dst.rt_next;
733                         rt_free(rth);
734                 }
735                 spin_unlock_bh(rt_hash_lock_addr(i));
736         }
737         rover = i;
738 }
739
740 /*
741  * rt_worker_func() is run in process context.
742  * We call rt_check_expire() to scan part of the hash table.
743  */
744 static void rt_worker_func(struct work_struct *work)
745 {
746         rt_check_expire();
747         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
748 }
749
750 /*
751  * Perturbation of rt_genid by a small quantity [1..256].
752  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
753  * many times (2^24) before a recent rt_genid value can recur.
754  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
755  */
756 static void rt_cache_invalidate(void)
757 {
758         unsigned char shuffle;
759
760         get_random_bytes(&shuffle, sizeof(shuffle));
761         atomic_add(shuffle + 1U, &rt_genid);
762 }
763
764 /*
765  * delay < 0  : invalidate cache (fast : entries will be deleted later)
766  * delay >= 0 : invalidate & flush cache (can be long)
767  */
768 void rt_cache_flush(int delay)
769 {
770         rt_cache_invalidate();
771         if (delay >= 0)
772                 rt_do_flush(!in_softirq());
773 }
774
775 /*
776  * We change rt_genid and let gc do the cleanup
777  */
778 static void rt_secret_rebuild(unsigned long dummy)
779 {
780         rt_cache_invalidate();
781         mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
782 }
783
784 /*
785    Short description of GC goals.
786
787    We want to build an algorithm which keeps the routing cache
788    at an equilibrium point, where the number of aged-off entries
789    stays approximately equal to the number of newly generated ones.
790
791    The current expiration strength is the variable "expire".
792    We adjust it dynamically, so that when the network is idle
793    "expire" is large enough to keep plenty of warm entries,
794    and when load increases it shrinks to limit the cache size.
795  */
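/*
 * For scale: with the default ip_rt_gc_elasticity of 8, the goal computed
 * below only becomes positive once the cache holds more than about eight
 * entries per hash bucket (entries > 8 << rt_hash_log), so gc stays idle
 * until roughly that point.
 */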
796
797 static int rt_garbage_collect(struct dst_ops *ops)
798 {
799         static unsigned long expire = RT_GC_TIMEOUT;
800         static unsigned long last_gc;
801         static int rover;
802         static int equilibrium;
803         struct rtable *rth, **rthp;
804         unsigned long now = jiffies;
805         int goal;
806
807         /*
808          * Garbage collection is pretty expensive,
809          * do not make it too frequently.
810          */
811
812         RT_CACHE_STAT_INC(gc_total);
813
814         if (now - last_gc < ip_rt_gc_min_interval &&
815             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
816                 RT_CACHE_STAT_INC(gc_ignored);
817                 goto out;
818         }
819
820         /* Calculate the number of entries which we want to expire now. */
821         goal = atomic_read(&ipv4_dst_ops.entries) -
822                 (ip_rt_gc_elasticity << rt_hash_log);
823         if (goal <= 0) {
824                 if (equilibrium < ipv4_dst_ops.gc_thresh)
825                         equilibrium = ipv4_dst_ops.gc_thresh;
826                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
827                 if (goal > 0) {
828                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
829                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
830                 }
831         } else {
832                 /* We are in a dangerous area. Try to reduce the cache really
833                  * aggressively.
834                  */
835                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
836                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
837         }
838
839         if (now - last_gc >= ip_rt_gc_min_interval)
840                 last_gc = now;
841
842         if (goal <= 0) {
843                 equilibrium += goal;
844                 goto work_done;
845         }
846
847         do {
848                 int i, k;
849
850                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
851                         unsigned long tmo = expire;
852
853                         k = (k + 1) & rt_hash_mask;
854                         rthp = &rt_hash_table[k].chain;
855                         spin_lock_bh(rt_hash_lock_addr(k));
856                         while ((rth = *rthp) != NULL) {
857                                 if (rth->rt_genid == atomic_read(&rt_genid) &&
858                                         !rt_may_expire(rth, tmo, expire)) {
859                                         tmo >>= 1;
860                                         rthp = &rth->u.dst.rt_next;
861                                         continue;
862                                 }
863                                 *rthp = rth->u.dst.rt_next;
864                                 rt_free(rth);
865                                 goal--;
866                         }
867                         spin_unlock_bh(rt_hash_lock_addr(k));
868                         if (goal <= 0)
869                                 break;
870                 }
871                 rover = k;
872
873                 if (goal <= 0)
874                         goto work_done;
875
876                 /* Goal is not achieved. We stop the process if:
877
878                    - expire has been reduced to zero; otherwise expire is halved.
879                    - the table is not full.
880                    - we are called from interrupt context.
881                    - the jiffies check is just a fallback/debug loop breaker;
882                      we will not spin here for a long time in any case.
883                  */
884
885                 RT_CACHE_STAT_INC(gc_goal_miss);
886
887                 if (expire == 0)
888                         break;
889
890                 expire >>= 1;
891 #if RT_CACHE_DEBUG >= 2
892                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
893                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
894 #endif
895
896                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
897                         goto out;
898         } while (!in_softirq() && time_before_eq(jiffies, now));
899
900         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
901                 goto out;
902         if (net_ratelimit())
903                 printk(KERN_WARNING "dst cache overflow\n");
904         RT_CACHE_STAT_INC(gc_dst_overflow);
905         return 1;
906
907 work_done:
908         expire += ip_rt_gc_min_interval;
909         if (expire > ip_rt_gc_timeout ||
910             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
911                 expire = ip_rt_gc_timeout;
912 #if RT_CACHE_DEBUG >= 2
913         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
914                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
915 #endif
916 out:    return 0;
917 }
918
919 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
920 {
921         struct rtable   *rth, **rthp;
922         unsigned long   now;
923         struct rtable *cand, **candp;
924         u32             min_score;
925         int             chain_length;
926         int attempts = !in_softirq();
927
928 restart:
929         chain_length = 0;
930         min_score = ~(u32)0;
931         cand = NULL;
932         candp = NULL;
933         now = jiffies;
934
935         rthp = &rt_hash_table[hash].chain;
936
937         spin_lock_bh(rt_hash_lock_addr(hash));
938         while ((rth = *rthp) != NULL) {
939                 if (rth->rt_genid != atomic_read(&rt_genid)) {
940                         *rthp = rth->u.dst.rt_next;
941                         rt_free(rth);
942                         continue;
943                 }
944                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
945                         /* Put it first */
946                         *rthp = rth->u.dst.rt_next;
947                         /*
948                          * Since lookup is lockfree, the deletion
949                          * must be visible to another weakly ordered CPU before
950                          * the insertion at the start of the hash chain.
951                          */
952                         rcu_assign_pointer(rth->u.dst.rt_next,
953                                            rt_hash_table[hash].chain);
954                         /*
955                          * Since lookup is lockfree, the update writes
956                          * must be ordered for consistency on SMP.
957                          */
958                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
959
960                         dst_use(&rth->u.dst, now);
961                         spin_unlock_bh(rt_hash_lock_addr(hash));
962
963                         rt_drop(rt);
964                         *rp = rth;
965                         return 0;
966                 }
967
968                 if (!atomic_read(&rth->u.dst.__refcnt)) {
969                         u32 score = rt_score(rth);
970
971                         if (score <= min_score) {
972                                 cand = rth;
973                                 candp = rthp;
974                                 min_score = score;
975                         }
976                 }
977
978                 chain_length++;
979
980                 rthp = &rth->u.dst.rt_next;
981         }
982
983         if (cand) {
984                 /* ip_rt_gc_elasticity used to be the average chain length;
985                  * when it is exceeded, gc becomes really aggressive.
986                  *
987                  * The second limit is less certain. At the moment it allows
988                  * only 2 entries per bucket. We will see.
989                  */
990                 if (chain_length > ip_rt_gc_elasticity) {
991                         *candp = cand->u.dst.rt_next;
992                         rt_free(cand);
993                 }
994         }
995
996         /* Try to bind the route to an ARP neighbour only if it is an
997            output route or on the unicast forwarding path.
998          */
999         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1000                 int err = arp_bind_neighbour(&rt->u.dst);
1001                 if (err) {
1002                         spin_unlock_bh(rt_hash_lock_addr(hash));
1003
1004                         if (err != -ENOBUFS) {
1005                                 rt_drop(rt);
1006                                 return err;
1007                         }
1008
1009                         /* Neighbour tables are full and nothing
1010                            can be released. Try to shrink the route cache;
1011                            it most likely holds some neighbour records.
1012                          */
1013                         if (attempts-- > 0) {
1014                                 int saved_elasticity = ip_rt_gc_elasticity;
1015                                 int saved_int = ip_rt_gc_min_interval;
1016                                 ip_rt_gc_elasticity     = 1;
1017                                 ip_rt_gc_min_interval   = 0;
1018                                 rt_garbage_collect(&ipv4_dst_ops);
1019                                 ip_rt_gc_min_interval   = saved_int;
1020                                 ip_rt_gc_elasticity     = saved_elasticity;
1021                                 goto restart;
1022                         }
1023
1024                         if (net_ratelimit())
1025                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1026                         rt_drop(rt);
1027                         return -ENOBUFS;
1028                 }
1029         }
1030
1031         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1032 #if RT_CACHE_DEBUG >= 2
1033         if (rt->u.dst.rt_next) {
1034                 struct rtable *trt;
1035                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1036                        NIPQUAD(rt->rt_dst));
1037                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1038                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1039                 printk("\n");
1040         }
1041 #endif
1042         rt_hash_table[hash].chain = rt;
1043         spin_unlock_bh(rt_hash_lock_addr(hash));
1044         *rp = rt;
1045         return 0;
1046 }
1047
1048 void rt_bind_peer(struct rtable *rt, int create)
1049 {
1050         static DEFINE_SPINLOCK(rt_peer_lock);
1051         struct inet_peer *peer;
1052
1053         peer = inet_getpeer(rt->rt_dst, create);
1054
1055         spin_lock_bh(&rt_peer_lock);
1056         if (rt->peer == NULL) {
1057                 rt->peer = peer;
1058                 peer = NULL;
1059         }
1060         spin_unlock_bh(&rt_peer_lock);
1061         if (peer)
1062                 inet_putpeer(peer);
1063 }
1064
1065 /*
1066  * Peer allocation may fail only in serious out-of-memory conditions.  However
1067  * we can still generate some output.
1068  * Random ID selection looks a bit dangerous because we have no chance of
1069  * selecting an ID that is unique over a reasonable period of time.
1070  * But a broken packet identifier may be better than no packet at all.
1071  */
1072 static void ip_select_fb_ident(struct iphdr *iph)
1073 {
1074         static DEFINE_SPINLOCK(ip_fb_id_lock);
1075         static u32 ip_fallback_id;
1076         u32 salt;
1077
1078         spin_lock_bh(&ip_fb_id_lock);
1079         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1080         iph->id = htons(salt & 0xFFFF);
1081         ip_fallback_id = salt;
1082         spin_unlock_bh(&ip_fb_id_lock);
1083 }
1084
1085 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1086 {
1087         struct rtable *rt = (struct rtable *) dst;
1088
1089         if (rt) {
1090                 if (rt->peer == NULL)
1091                         rt_bind_peer(rt, 1);
1092
1093                 /* If a peer is attached to the destination, it is never detached,
1094                    so we need not grab a lock to dereference it.
1095                  */
1096                 if (rt->peer) {
1097                         iph->id = htons(inet_getid(rt->peer, more));
1098                         return;
1099                 }
1100         } else
1101                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1102                        __builtin_return_address(0));
1103
1104         ip_select_fb_ident(iph);
1105 }
1106
1107 static void rt_del(unsigned hash, struct rtable *rt)
1108 {
1109         struct rtable **rthp, *aux;
1110
1111         rthp = &rt_hash_table[hash].chain;
1112         spin_lock_bh(rt_hash_lock_addr(hash));
1113         ip_rt_put(rt);
1114         while ((aux = *rthp) != NULL) {
1115                 if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1116                         *rthp = aux->u.dst.rt_next;
1117                         rt_free(aux);
1118                         continue;
1119                 }
1120                 rthp = &aux->u.dst.rt_next;
1121         }
1122         spin_unlock_bh(rt_hash_lock_addr(hash));
1123 }
1124
1125 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1126                     __be32 saddr, struct net_device *dev)
1127 {
1128         int i, k;
1129         struct in_device *in_dev = in_dev_get(dev);
1130         struct rtable *rth, **rthp;
1131         __be32  skeys[2] = { saddr, 0 };
1132         int  ikeys[2] = { dev->ifindex, 0 };
1133         struct netevent_redirect netevent;
1134
1135         if (!in_dev)
1136                 return;
1137
1138         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1139             || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1140             || ipv4_is_zeronet(new_gw))
1141                 goto reject_redirect;
1142
1143         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1144                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1145                         goto reject_redirect;
1146                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1147                         goto reject_redirect;
1148         } else {
1149                 if (inet_addr_type(&init_net, new_gw) != RTN_UNICAST)
1150                         goto reject_redirect;
1151         }
1152
1153         for (i = 0; i < 2; i++) {
1154                 for (k = 0; k < 2; k++) {
1155                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1156
1157                         rthp=&rt_hash_table[hash].chain;
1158
1159                         rcu_read_lock();
1160                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1161                                 struct rtable *rt;
1162
1163                                 if (rth->fl.fl4_dst != daddr ||
1164                                     rth->fl.fl4_src != skeys[i] ||
1165                                     rth->fl.oif != ikeys[k] ||
1166                                     rth->fl.iif != 0 ||
1167                                     rth->rt_genid != atomic_read(&rt_genid)) {
1168                                         rthp = &rth->u.dst.rt_next;
1169                                         continue;
1170                                 }
1171
1172                                 if (rth->rt_dst != daddr ||
1173                                     rth->rt_src != saddr ||
1174                                     rth->u.dst.error ||
1175                                     rth->rt_gateway != old_gw ||
1176                                     rth->u.dst.dev != dev)
1177                                         break;
1178
1179                                 dst_hold(&rth->u.dst);
1180                                 rcu_read_unlock();
1181
1182                                 rt = dst_alloc(&ipv4_dst_ops);
1183                                 if (rt == NULL) {
1184                                         ip_rt_put(rth);
1185                                         in_dev_put(in_dev);
1186                                         return;
1187                                 }
1188
1189                                 /* Copy all the information. */
1190                                 *rt = *rth;
1191                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1192                                 rt->u.dst.__use         = 1;
1193                                 atomic_set(&rt->u.dst.__refcnt, 1);
1194                                 rt->u.dst.child         = NULL;
1195                                 if (rt->u.dst.dev)
1196                                         dev_hold(rt->u.dst.dev);
1197                                 if (rt->idev)
1198                                         in_dev_hold(rt->idev);
1199                                 rt->u.dst.obsolete      = 0;
1200                                 rt->u.dst.lastuse       = jiffies;
1201                                 rt->u.dst.path          = &rt->u.dst;
1202                                 rt->u.dst.neighbour     = NULL;
1203                                 rt->u.dst.hh            = NULL;
1204                                 rt->u.dst.xfrm          = NULL;
1205                                 rt->rt_genid            = atomic_read(&rt_genid);
1206                                 rt->rt_flags            |= RTCF_REDIRECTED;
1207
1208                                 /* Gateway is different ... */
1209                                 rt->rt_gateway          = new_gw;
1210
1211                                 /* Redirect received -> path was valid */
1212                                 dst_confirm(&rth->u.dst);
1213
1214                                 if (rt->peer)
1215                                         atomic_inc(&rt->peer->refcnt);
1216
1217                                 if (arp_bind_neighbour(&rt->u.dst) ||
1218                                     !(rt->u.dst.neighbour->nud_state &
1219                                             NUD_VALID)) {
1220                                         if (rt->u.dst.neighbour)
1221                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1222                                         ip_rt_put(rth);
1223                                         rt_drop(rt);
1224                                         goto do_next;
1225                                 }
1226
1227                                 netevent.old = &rth->u.dst;
1228                                 netevent.new = &rt->u.dst;
1229                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1230                                                         &netevent);
1231
1232                                 rt_del(hash, rth);
1233                                 if (!rt_intern_hash(hash, rt, &rt))
1234                                         ip_rt_put(rt);
1235                                 goto do_next;
1236                         }
1237                         rcu_read_unlock();
1238                 do_next:
1239                         ;
1240                 }
1241         }
1242         in_dev_put(in_dev);
1243         return;
1244
1245 reject_redirect:
1246 #ifdef CONFIG_IP_ROUTE_VERBOSE
1247         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1248                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1249                         "%u.%u.%u.%u ignored.\n"
1250                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1251                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1252                        NIPQUAD(saddr), NIPQUAD(daddr));
1253 #endif
1254         in_dev_put(in_dev);
1255 }
1256
1257 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1258 {
1259         struct rtable *rt = (struct rtable*)dst;
1260         struct dst_entry *ret = dst;
1261
1262         if (rt) {
1263                 if (dst->obsolete) {
1264                         ip_rt_put(rt);
1265                         ret = NULL;
1266                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1267                            rt->u.dst.expires) {
1268                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1269                                                 rt->fl.oif);
1270 #if RT_CACHE_DEBUG >= 1
1271                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1272                                           "%u.%u.%u.%u/%02x dropped\n",
1273                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1274 #endif
1275                         rt_del(hash, rt);
1276                         ret = NULL;
1277                 }
1278         }
1279         return ret;
1280 }
1281
1282 /*
1283  * Algorithm:
1284  *      1. The first ip_rt_redirect_number redirects are sent
1285  *         with exponential backoff, then we stop sending them at all,
1286  *         assuming that the host ignores our redirects.
1287  *      2. If we did not see packets requiring redirects
1288  *         during ip_rt_redirect_silence, we assume that the host
1289  *         forgot the redirected route and start sending redirects again.
1290  *
1291  * This algorithm is much cheaper and more intelligent than dumb load limiting
1292  * in icmp.c.
1293  *
1294  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1295  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1296  */
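/*
 * With the defaults above (ip_rt_redirect_number = 9,
 * ip_rt_redirect_load = HZ/50, ip_rt_redirect_silence = (HZ/50) << 10),
 * each further redirect requires a quiet period of
 * (HZ/50) << rate_tokens jiffies, and once nine redirects have been
 * ignored we stop entirely until about (HZ/50) << 10 jiffies (~20 s at
 * HZ=1000) pass without redirect-worthy traffic.
 */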
1297
1298 void ip_rt_send_redirect(struct sk_buff *skb)
1299 {
1300         struct rtable *rt = (struct rtable*)skb->dst;
1301         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1302
1303         if (!in_dev)
1304                 return;
1305
1306         if (!IN_DEV_TX_REDIRECTS(in_dev))
1307                 goto out;
1308
1309         /* No redirected packets during ip_rt_redirect_silence;
1310          * reset the algorithm.
1311          */
1312         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1313                 rt->u.dst.rate_tokens = 0;
1314
1315         /* Too many ignored redirects; do not send anything,
1316          * set u.dst.rate_last to the last seen redirected packet.
1317          */
1318         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1319                 rt->u.dst.rate_last = jiffies;
1320                 goto out;
1321         }
1322
1323         /* Check for load limit; set rate_last to the latest sent
1324          * redirect.
1325          */
1326         if (rt->u.dst.rate_tokens == 0 ||
1327             time_after(jiffies,
1328                        (rt->u.dst.rate_last +
1329                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1330                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1331                 rt->u.dst.rate_last = jiffies;
1332                 ++rt->u.dst.rate_tokens;
1333 #ifdef CONFIG_IP_ROUTE_VERBOSE
1334                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1335                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1336                     net_ratelimit())
1337                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1338                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1339                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1340                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1341 #endif
1342         }
1343 out:
1344         in_dev_put(in_dev);
1345 }
1346
1347 static int ip_error(struct sk_buff *skb)
1348 {
1349         struct rtable *rt = (struct rtable*)skb->dst;
1350         unsigned long now;
1351         int code;
1352
1353         switch (rt->u.dst.error) {
1354                 case EINVAL:
1355                 default:
1356                         goto out;
1357                 case EHOSTUNREACH:
1358                         code = ICMP_HOST_UNREACH;
1359                         break;
1360                 case ENETUNREACH:
1361                         code = ICMP_NET_UNREACH;
1362                         IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1363                         break;
1364                 case EACCES:
1365                         code = ICMP_PKT_FILTERED;
1366                         break;
1367         }
1368
1369         now = jiffies;
1370         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1371         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1372                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1373         rt->u.dst.rate_last = now;
1374         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1375                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1376                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1377         }
1378
1379 out:    kfree_skb(skb);
1380         return 0;
1381 }
1382
1383 /*
1384  *      The last two values are not from the RFC but
1385  *      are needed for AMPRnet AX.25 paths.
1386  */
1387
1388 static const unsigned short mtu_plateau[] =
1389 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1390
1391 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1392 {
1393         int i;
1394
1395         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1396                 if (old_mtu > mtu_plateau[i])
1397                         return mtu_plateau[i];
1398         return 68;
1399 }
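/*
 * Example: if the embedded header says the offending packet was 1500 bytes
 * and the ICMP message carries no usable next-hop MTU, guess_mtu(1500)
 * returns the next lower plateau, 1492; an old_mtu below every plateau
 * falls back to 68, the minimum IPv4 MTU.
 */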
1400
1401 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1402                                  unsigned short new_mtu)
1403 {
1404         int i;
1405         unsigned short old_mtu = ntohs(iph->tot_len);
1406         struct rtable *rth;
1407         __be32  skeys[2] = { iph->saddr, 0, };
1408         __be32  daddr = iph->daddr;
1409         unsigned short est_mtu = 0;
1410
1411         if (ipv4_config.no_pmtu_disc)
1412                 return 0;
1413
1414         for (i = 0; i < 2; i++) {
1415                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1416
1417                 rcu_read_lock();
1418                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1419                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1420                         if (rth->fl.fl4_dst == daddr &&
1421                             rth->fl.fl4_src == skeys[i] &&
1422                             rth->rt_dst  == daddr &&
1423                             rth->rt_src  == iph->saddr &&
1424                             rth->fl.iif == 0 &&
1425                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
1426                             rth->u.dst.dev->nd_net == net &&
1427                             rth->rt_genid == atomic_read(&rt_genid)) {
1428                                 unsigned short mtu = new_mtu;
1429
1430                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1431
1432                                         /* BSD 4.2 compatibility hack :-( */
1433                                         if (mtu == 0 &&
1434                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1435                                             old_mtu >= 68 + (iph->ihl << 2))
1436                                                 old_mtu -= iph->ihl << 2;
1437
1438                                         mtu = guess_mtu(old_mtu);
1439                                 }
1440                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1441                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1442                                                 dst_confirm(&rth->u.dst);
1443                                                 if (mtu < ip_rt_min_pmtu) {
1444                                                         mtu = ip_rt_min_pmtu;
1445                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1446                                                                 (1 << RTAX_MTU);
1447                                                 }
1448                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1449                                                 dst_set_expires(&rth->u.dst,
1450                                                         ip_rt_mtu_expires);
1451                                         }
1452                                         est_mtu = mtu;
1453                                 }
1454                         }
1455                 }
1456                 rcu_read_unlock();
1457         }
1458         return est_mtu ? : new_mtu;
1459 }
1460
1461 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1462 {
1463         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1464             !(dst_metric_locked(dst, RTAX_MTU))) {
1465                 if (mtu < ip_rt_min_pmtu) {
1466                         mtu = ip_rt_min_pmtu;
1467                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1468                 }
1469                 dst->metrics[RTAX_MTU-1] = mtu;
1470                 dst_set_expires(dst, ip_rt_mtu_expires);
1471                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1472         }
1473 }
1474
1475 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1476 {
1477         return NULL;
1478 }
1479
1480 static void ipv4_dst_destroy(struct dst_entry *dst)
1481 {
1482         struct rtable *rt = (struct rtable *) dst;
1483         struct inet_peer *peer = rt->peer;
1484         struct in_device *idev = rt->idev;
1485
1486         if (peer) {
1487                 rt->peer = NULL;
1488                 inet_putpeer(peer);
1489         }
1490
1491         if (idev) {
1492                 rt->idev = NULL;
1493                 in_dev_put(idev);
1494         }
1495 }
1496
1497 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1498                             int how)
1499 {
1500         struct rtable *rt = (struct rtable *) dst;
1501         struct in_device *idev = rt->idev;
1502         if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) {
1503                 struct in_device *loopback_idev =
1504                         in_dev_get(dev->nd_net->loopback_dev);
1505                 if (loopback_idev) {
1506                         rt->idev = loopback_idev;
1507                         in_dev_put(idev);
1508                 }
1509         }
1510 }
1511
1512 static void ipv4_link_failure(struct sk_buff *skb)
1513 {
1514         struct rtable *rt;
1515
1516         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1517
1518         rt = (struct rtable *) skb->dst;
1519         if (rt)
1520                 dst_set_expires(&rt->u.dst, 0);
1521 }
1522
1523 static int ip_rt_bug(struct sk_buff *skb)
1524 {
1525         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1526                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1527                 skb->dev ? skb->dev->name : "?");
1528         kfree_skb(skb);
1529         return 0;
1530 }
1531
1532 /*
1533    We do not cache the source address of the outgoing interface,
1534    because it is used only by the IP RR, TS and SRR options,
1535    so it is out of the fast path.
1536
1537    BTW remember: "addr" is allowed to be unaligned
1538    in IP options!
1539  */
1540
1541 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1542 {
1543         __be32 src;
1544         struct fib_result res;
1545
1546         if (rt->fl.iif == 0)
1547                 src = rt->rt_src;
1548         else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) {
1549                 src = FIB_RES_PREFSRC(res);
1550                 fib_res_put(&res);
1551         } else
1552                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1553                                         RT_SCOPE_UNIVERSE);
1554         memcpy(addr, &src, 4);
1555 }
1556
1557 #ifdef CONFIG_NET_CLS_ROUTE
1558 static void set_class_tag(struct rtable *rt, u32 tag)
1559 {
1560         if (!(rt->u.dst.tclassid & 0xFFFF))
1561                 rt->u.dst.tclassid |= tag & 0xFFFF;
1562         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1563                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1564 }
1565 #endif
1566
1567 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1568 {
1569         struct fib_info *fi = res->fi;
1570
1571         if (fi) {
1572                 if (FIB_RES_GW(*res) &&
1573                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1574                         rt->rt_gateway = FIB_RES_GW(*res);
1575                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1576                        sizeof(rt->u.dst.metrics));
1577                 if (fi->fib_mtu == 0) {
1578                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1579                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1580                             rt->rt_gateway != rt->rt_dst &&
1581                             rt->u.dst.dev->mtu > 576)
1582                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1583                 }
1584 #ifdef CONFIG_NET_CLS_ROUTE
1585                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1586 #endif
1587         } else
1588                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1589
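        /*
         * Fill in defaults for any metrics the route did not provide: hop
         * limit from sysctl_ip_default_ttl, MTU clamped to IP_MAX_MTU, and
         * an advertised MSS of the device MTU minus 40 bytes of IP + TCP
         * headers, bounded by ip_rt_min_advmss below and 65535 - 40 above.
         */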
1590         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1591                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1592         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1593                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1594         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1595                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1596                                        ip_rt_min_advmss);
1597         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1598                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1599
1600 #ifdef CONFIG_NET_CLS_ROUTE
1601 #ifdef CONFIG_IP_MULTIPLE_TABLES
1602         set_class_tag(rt, fib_rules_tclass(res));
1603 #endif
1604         set_class_tag(rt, itag);
1605 #endif
1606         rt->rt_type = res->type;
1607 }
1608
1609 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1610                                 u8 tos, struct net_device *dev, int our)
1611 {
1612         unsigned hash;
1613         struct rtable *rth;
1614         __be32 spec_dst;
1615         struct in_device *in_dev = in_dev_get(dev);
1616         u32 itag = 0;
1617
1618         /* Primary sanity checks. */
1619
1620         if (in_dev == NULL)
1621                 return -EINVAL;
1622
1623         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1624             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1625                 goto e_inval;
1626
1627         if (ipv4_is_zeronet(saddr)) {
1628                 if (!ipv4_is_local_multicast(daddr))
1629                         goto e_inval;
1630                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1631         } else if (fib_validate_source(saddr, 0, tos, 0,
1632                                         dev, &spec_dst, &itag) < 0)
1633                 goto e_inval;
1634
1635         rth = dst_alloc(&ipv4_dst_ops);
1636         if (!rth)
1637                 goto e_nobufs;
1638
1639         rth->u.dst.output= ip_rt_bug;
1640
1641         atomic_set(&rth->u.dst.__refcnt, 1);
1642         rth->u.dst.flags= DST_HOST;
1643         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1644                 rth->u.dst.flags |= DST_NOPOLICY;
1645         rth->fl.fl4_dst = daddr;
1646         rth->rt_dst     = daddr;
1647         rth->fl.fl4_tos = tos;
1648         rth->fl.mark    = skb->mark;
1649         rth->fl.fl4_src = saddr;
1650         rth->rt_src     = saddr;
1651 #ifdef CONFIG_NET_CLS_ROUTE
1652         rth->u.dst.tclassid = itag;
1653 #endif
1654         rth->rt_iif     =
1655         rth->fl.iif     = dev->ifindex;
1656         rth->u.dst.dev  = init_net.loopback_dev;
1657         dev_hold(rth->u.dst.dev);
1658         rth->idev       = in_dev_get(rth->u.dst.dev);
1659         rth->fl.oif     = 0;
1660         rth->rt_gateway = daddr;
1661         rth->rt_spec_dst= spec_dst;
1662         rth->rt_genid   = atomic_read(&rt_genid);
1663         rth->rt_flags   = RTCF_MULTICAST;
1664         rth->rt_type    = RTN_MULTICAST;
1665         if (our) {
1666                 rth->u.dst.input= ip_local_deliver;
1667                 rth->rt_flags |= RTCF_LOCAL;
1668         }
1669
1670 #ifdef CONFIG_IP_MROUTE
1671         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1672                 rth->u.dst.input = ip_mr_input;
1673 #endif
1674         RT_CACHE_STAT_INC(in_slow_mc);
1675
1676         in_dev_put(in_dev);
1677         hash = rt_hash(daddr, saddr, dev->ifindex);
1678         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1679
1680 e_nobufs:
1681         in_dev_put(in_dev);
1682         return -ENOBUFS;
1683
1684 e_inval:
1685         in_dev_put(in_dev);
1686         return -EINVAL;
1687 }
1688
1689
1690 static void ip_handle_martian_source(struct net_device *dev,
1691                                      struct in_device *in_dev,
1692                                      struct sk_buff *skb,
1693                                      __be32 daddr,
1694                                      __be32 saddr)
1695 {
1696         RT_CACHE_STAT_INC(in_martian_src);
1697 #ifdef CONFIG_IP_ROUTE_VERBOSE
1698         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1699                 /*
1700                  *      RFC1812 recommendation: if the source is martian,
1701                  *      the only hint is the MAC header.
1702                  */
1703                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1704                         "%u.%u.%u.%u, on dev %s\n",
1705                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1706                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1707                         int i;
1708                         const unsigned char *p = skb_mac_header(skb);
1709                         printk(KERN_WARNING "ll header: ");
1710                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1711                                 printk("%02x", *p);
1712                                 if (i < (dev->hard_header_len - 1))
1713                                         printk(":");
1714                         }
1715                         printk("\n");
1716                 }
1717         }
1718 #endif
1719 }
1720
1721 static inline int __mkroute_input(struct sk_buff *skb,
1722                                   struct fib_result* res,
1723                                   struct in_device *in_dev,
1724                                   __be32 daddr, __be32 saddr, u32 tos,
1725                                   struct rtable **result)
1726 {
1727
1728         struct rtable *rth;
1729         int err;
1730         struct in_device *out_dev;
1731         unsigned flags = 0;
1732         __be32 spec_dst;
1733         u32 itag;
1734
1735         /* get a working reference to the output device */
1736         out_dev = in_dev_get(FIB_RES_DEV(*res));
1737         if (out_dev == NULL) {
1738                 if (net_ratelimit())
1739                         printk(KERN_CRIT "Bug in ip_route_input" \
1740                                "_slow(). Please, report\n");
1741                 return -EINVAL;
1742         }
1743
1744
1745         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1746                                   in_dev->dev, &spec_dst, &itag);
1747         if (err < 0) {
1748                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1749                                          saddr);
1750
1751                 err = -EINVAL;
1752                 goto cleanup;
1753         }
1754
1755         if (err)
1756                 flags |= RTCF_DIRECTSRC;
1757
1758         if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1759             (IN_DEV_SHARED_MEDIA(out_dev) ||
1760              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1761                 flags |= RTCF_DOREDIRECT;
1762
1763         if (skb->protocol != htons(ETH_P_IP)) {
1764                 /* Not IP (i.e. ARP). Do not create a route if it is
1765                  * invalid for proxy arp. DNAT routes are always valid.
1766                  */
1767                 if (out_dev == in_dev) {
1768                         err = -EINVAL;
1769                         goto cleanup;
1770                 }
1771         }
1772
1773
1774         rth = dst_alloc(&ipv4_dst_ops);
1775         if (!rth) {
1776                 err = -ENOBUFS;
1777                 goto cleanup;
1778         }
1779
1780         atomic_set(&rth->u.dst.__refcnt, 1);
1781         rth->u.dst.flags= DST_HOST;
1782         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1783                 rth->u.dst.flags |= DST_NOPOLICY;
1784         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1785                 rth->u.dst.flags |= DST_NOXFRM;
1786         rth->fl.fl4_dst = daddr;
1787         rth->rt_dst     = daddr;
1788         rth->fl.fl4_tos = tos;
1789         rth->fl.mark    = skb->mark;
1790         rth->fl.fl4_src = saddr;
1791         rth->rt_src     = saddr;
1792         rth->rt_gateway = daddr;
1793         rth->rt_iif     =
1794                 rth->fl.iif     = in_dev->dev->ifindex;
1795         rth->u.dst.dev  = (out_dev)->dev;
1796         dev_hold(rth->u.dst.dev);
1797         rth->idev       = in_dev_get(rth->u.dst.dev);
1798         rth->fl.oif     = 0;
1799         rth->rt_spec_dst= spec_dst;
1800
1801         rth->u.dst.input = ip_forward;
1802         rth->u.dst.output = ip_output;
1803         rth->rt_genid = atomic_read(&rt_genid);
1804
1805         rt_set_nexthop(rth, res, itag);
1806
1807         rth->rt_flags = flags;
1808
1809         *result = rth;
1810         err = 0;
1811  cleanup:
1812         /* release the working reference to the output device */
1813         in_dev_put(out_dev);
1814         return err;
1815 }
1816
1817 static inline int ip_mkroute_input(struct sk_buff *skb,
1818                                    struct fib_result* res,
1819                                    const struct flowi *fl,
1820                                    struct in_device *in_dev,
1821                                    __be32 daddr, __be32 saddr, u32 tos)
1822 {
1823         struct rtable* rth = NULL;
1824         int err;
1825         unsigned hash;
1826
1827 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1828         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1829                 fib_select_multipath(fl, res);
1830 #endif
1831
1832         /* create a routing cache entry */
1833         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1834         if (err)
1835                 return err;
1836
1837         /* put it into the cache */
1838         hash = rt_hash(daddr, saddr, fl->iif);
1839         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1840 }
1841
1842 /*
1843  *      NOTE. We drop all packets that have local source
1844  *      addresses, because every properly looped back packet
1845  *      must already have the correct destination attached by the output routine.
1846  *
1847  *      This approach solves two big problems:
1848  *      1. Non-simplex devices are handled properly.
1849  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1850  */
1851
1852 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1853                                u8 tos, struct net_device *dev)
1854 {
1855         struct fib_result res;
1856         struct in_device *in_dev = in_dev_get(dev);
1857         struct flowi fl = { .nl_u = { .ip4_u =
1858                                       { .daddr = daddr,
1859                                         .saddr = saddr,
1860                                         .tos = tos,
1861                                         .scope = RT_SCOPE_UNIVERSE,
1862                                       } },
1863                             .mark = skb->mark,
1864                             .iif = dev->ifindex };
1865         unsigned        flags = 0;
1866         u32             itag = 0;
1867         struct rtable * rth;
1868         unsigned        hash;
1869         __be32          spec_dst;
1870         int             err = -EINVAL;
1871         int             free_res = 0;
1872         struct net    * net = dev->nd_net;
1873
1874         /* IP on this device is disabled. */
1875
1876         if (!in_dev)
1877                 goto out;
1878
1879         /* Check for the weirdest martians, which cannot be detected
1880            by fib_lookup.
1881          */
1882
1883         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1884             ipv4_is_loopback(saddr))
1885                 goto martian_source;
1886
1887         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1888                 goto brd_input;
1889
1890         /* Accept zero addresses only for limited broadcast;
1891          * I do not even know whether to fix this or not. Waiting for complaints :-)
1892          */
1893         if (ipv4_is_zeronet(saddr))
1894                 goto martian_source;
1895
1896         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1897             ipv4_is_loopback(daddr))
1898                 goto martian_destination;
1899
1900         /*
1901          *      Now we are ready to route the packet.
1902          */
1903         if ((err = fib_lookup(net, &fl, &res)) != 0) {
1904                 if (!IN_DEV_FORWARD(in_dev))
1905                         goto e_hostunreach;
1906                 goto no_route;
1907         }
1908         free_res = 1;
1909
1910         RT_CACHE_STAT_INC(in_slow_tot);
1911
1912         if (res.type == RTN_BROADCAST)
1913                 goto brd_input;
1914
1915         if (res.type == RTN_LOCAL) {
1916                 int result;
1917                 result = fib_validate_source(saddr, daddr, tos,
1918                                              net->loopback_dev->ifindex,
1919                                              dev, &spec_dst, &itag);
1920                 if (result < 0)
1921                         goto martian_source;
1922                 if (result)
1923                         flags |= RTCF_DIRECTSRC;
1924                 spec_dst = daddr;
1925                 goto local_input;
1926         }
1927
1928         if (!IN_DEV_FORWARD(in_dev))
1929                 goto e_hostunreach;
1930         if (res.type != RTN_UNICAST)
1931                 goto martian_destination;
1932
1933         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1934 done:
1935         in_dev_put(in_dev);
1936         if (free_res)
1937                 fib_res_put(&res);
1938 out:    return err;
1939
1940 brd_input:
1941         if (skb->protocol != htons(ETH_P_IP))
1942                 goto e_inval;
1943
1944         if (ipv4_is_zeronet(saddr))
1945                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1946         else {
1947                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1948                                           &itag);
1949                 if (err < 0)
1950                         goto martian_source;
1951                 if (err)
1952                         flags |= RTCF_DIRECTSRC;
1953         }
1954         flags |= RTCF_BROADCAST;
1955         res.type = RTN_BROADCAST;
1956         RT_CACHE_STAT_INC(in_brd);
1957
1958 local_input:
1959         rth = dst_alloc(&ipv4_dst_ops);
1960         if (!rth)
1961                 goto e_nobufs;
1962
1963         rth->u.dst.output= ip_rt_bug;
1964         rth->rt_genid = atomic_read(&rt_genid);
1965
1966         atomic_set(&rth->u.dst.__refcnt, 1);
1967         rth->u.dst.flags= DST_HOST;
1968         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1969                 rth->u.dst.flags |= DST_NOPOLICY;
1970         rth->fl.fl4_dst = daddr;
1971         rth->rt_dst     = daddr;
1972         rth->fl.fl4_tos = tos;
1973         rth->fl.mark    = skb->mark;
1974         rth->fl.fl4_src = saddr;
1975         rth->rt_src     = saddr;
1976 #ifdef CONFIG_NET_CLS_ROUTE
1977         rth->u.dst.tclassid = itag;
1978 #endif
1979         rth->rt_iif     =
1980         rth->fl.iif     = dev->ifindex;
1981         rth->u.dst.dev  = net->loopback_dev;
1982         dev_hold(rth->u.dst.dev);
1983         rth->idev       = in_dev_get(rth->u.dst.dev);
1984         rth->rt_gateway = daddr;
1985         rth->rt_spec_dst= spec_dst;
1986         rth->u.dst.input= ip_local_deliver;
1987         rth->rt_flags   = flags|RTCF_LOCAL;
1988         if (res.type == RTN_UNREACHABLE) {
1989                 rth->u.dst.input= ip_error;
1990                 rth->u.dst.error= -err;
1991                 rth->rt_flags   &= ~RTCF_LOCAL;
1992         }
1993         rth->rt_type    = res.type;
1994         hash = rt_hash(daddr, saddr, fl.iif);
1995         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1996         goto done;
1997
1998 no_route:
1999         RT_CACHE_STAT_INC(in_no_route);
2000         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2001         res.type = RTN_UNREACHABLE;
2002         if (err == -ESRCH)
2003                 err = -ENETUNREACH;
2004         goto local_input;
2005
2006         /*
2007          *      Do not cache martian addresses: they should be logged (RFC1812)
2008          */
2009 martian_destination:
2010         RT_CACHE_STAT_INC(in_martian_dst);
2011 #ifdef CONFIG_IP_ROUTE_VERBOSE
2012         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2013                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2014                         "%u.%u.%u.%u, dev %s\n",
2015                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2016 #endif
2017
2018 e_hostunreach:
2019         err = -EHOSTUNREACH;
2020         goto done;
2021
2022 e_inval:
2023         err = -EINVAL;
2024         goto done;
2025
2026 e_nobufs:
2027         err = -ENOBUFS;
2028         goto done;
2029
2030 martian_source:
2031         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2032         goto e_inval;
2033 }
2034
2035 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2036                    u8 tos, struct net_device *dev)
2037 {
2038         struct rtable * rth;
2039         unsigned        hash;
2040         int iif = dev->ifindex;
2041         struct net *net;
2042
2043         net = dev->nd_net;
2044         tos &= IPTOS_RT_MASK;
2045         hash = rt_hash(daddr, saddr, iif);
2046
2047         rcu_read_lock();
2048         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2049              rth = rcu_dereference(rth->u.dst.rt_next)) {
2050                 if (rth->fl.fl4_dst == daddr &&
2051                     rth->fl.fl4_src == saddr &&
2052                     rth->fl.iif == iif &&
2053                     rth->fl.oif == 0 &&
2054                     rth->fl.mark == skb->mark &&
2055                     rth->fl.fl4_tos == tos &&
2056                     rth->u.dst.dev->nd_net == net &&
2057                     rth->rt_genid == atomic_read(&rt_genid)) {
2058                         dst_use(&rth->u.dst, jiffies);
2059                         RT_CACHE_STAT_INC(in_hit);
2060                         rcu_read_unlock();
2061                         skb->dst = (struct dst_entry*)rth;
2062                         return 0;
2063                 }
2064                 RT_CACHE_STAT_INC(in_hlist_search);
2065         }
2066         rcu_read_unlock();
2067
2068         /* Multicast recognition logic is moved from the route cache to here.
2069            The problem was that too many Ethernet cards have broken/missing
2070            hardware multicast filters :-( As a result, a host on a multicast
2071            network acquires a lot of useless route cache entries, e.g. for
2072            SDR messages from all over the world. Now we try to get rid of them.
2073            Really, provided the software IP multicast filter is organized
2074            reasonably (at least, hashed), this does not result in a slowdown
2075            compared with route cache reject entries.
2076            Note that multicast routers are not affected, because a
2077            route cache entry is created for them eventually.
2078          */
2079         if (ipv4_is_multicast(daddr)) {
2080                 struct in_device *in_dev;
2081
2082                 rcu_read_lock();
2083                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2084                         int our = ip_check_mc(in_dev, daddr, saddr,
2085                                 ip_hdr(skb)->protocol);
2086                         if (our
2087 #ifdef CONFIG_IP_MROUTE
2088                             || (!ipv4_is_local_multicast(daddr) &&
2089                                 IN_DEV_MFORWARD(in_dev))
2090 #endif
2091                             ) {
2092                                 rcu_read_unlock();
2093                                 return ip_route_input_mc(skb, daddr, saddr,
2094                                                          tos, dev, our);
2095                         }
2096                 }
2097                 rcu_read_unlock();
2098                 return -EINVAL;
2099         }
2100         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2101 }
2102
2103 static inline int __mkroute_output(struct rtable **result,
2104                                    struct fib_result* res,
2105                                    const struct flowi *fl,
2106                                    const struct flowi *oldflp,
2107                                    struct net_device *dev_out,
2108                                    unsigned flags)
2109 {
2110         struct rtable *rth;
2111         struct in_device *in_dev;
2112         u32 tos = RT_FL_TOS(oldflp);
2113         int err = 0;
2114
2115         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2116                 return -EINVAL;
2117
2118         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2119                 res->type = RTN_BROADCAST;
2120         else if (ipv4_is_multicast(fl->fl4_dst))
2121                 res->type = RTN_MULTICAST;
2122         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2123                 return -EINVAL;
2124
2125         if (dev_out->flags & IFF_LOOPBACK)
2126                 flags |= RTCF_LOCAL;
2127
2128         /* get work reference to inet device */
2129         in_dev = in_dev_get(dev_out);
2130         if (!in_dev)
2131                 return -EINVAL;
2132
2133         if (res->type == RTN_BROADCAST) {
2134                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2135                 if (res->fi) {
2136                         fib_info_put(res->fi);
2137                         res->fi = NULL;
2138                 }
2139         } else if (res->type == RTN_MULTICAST) {
2140                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2141                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2142                                  oldflp->proto))
2143                         flags &= ~RTCF_LOCAL;
2144                 /* If a multicast route does not exist, use the
2145                    default one, but do not gateway in this case.
2146                    Yes, it is a hack.
2147                  */
2148                 if (res->fi && res->prefixlen < 4) {
2149                         fib_info_put(res->fi);
2150                         res->fi = NULL;
2151                 }
2152         }
2153
2154
2155         rth = dst_alloc(&ipv4_dst_ops);
2156         if (!rth) {
2157                 err = -ENOBUFS;
2158                 goto cleanup;
2159         }
2160
2161         atomic_set(&rth->u.dst.__refcnt, 1);
2162         rth->u.dst.flags= DST_HOST;
2163         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2164                 rth->u.dst.flags |= DST_NOXFRM;
2165         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2166                 rth->u.dst.flags |= DST_NOPOLICY;
2167
2168         rth->fl.fl4_dst = oldflp->fl4_dst;
2169         rth->fl.fl4_tos = tos;
2170         rth->fl.fl4_src = oldflp->fl4_src;
2171         rth->fl.oif     = oldflp->oif;
2172         rth->fl.mark    = oldflp->mark;
2173         rth->rt_dst     = fl->fl4_dst;
2174         rth->rt_src     = fl->fl4_src;
2175         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2176         /* get references to the devices that are to be held by the routing
2177            cache entry */
2178         rth->u.dst.dev  = dev_out;
2179         dev_hold(dev_out);
2180         rth->idev       = in_dev_get(dev_out);
2181         rth->rt_gateway = fl->fl4_dst;
2182         rth->rt_spec_dst= fl->fl4_src;
2183
2184         rth->u.dst.output=ip_output;
2185         rth->rt_genid = atomic_read(&rt_genid);
2186
2187         RT_CACHE_STAT_INC(out_slow_tot);
2188
2189         if (flags & RTCF_LOCAL) {
2190                 rth->u.dst.input = ip_local_deliver;
2191                 rth->rt_spec_dst = fl->fl4_dst;
2192         }
2193         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2194                 rth->rt_spec_dst = fl->fl4_src;
2195                 if (flags & RTCF_LOCAL &&
2196                     !(dev_out->flags & IFF_LOOPBACK)) {
2197                         rth->u.dst.output = ip_mc_output;
2198                         RT_CACHE_STAT_INC(out_slow_mc);
2199                 }
2200 #ifdef CONFIG_IP_MROUTE
2201                 if (res->type == RTN_MULTICAST) {
2202                         if (IN_DEV_MFORWARD(in_dev) &&
2203                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2204                                 rth->u.dst.input = ip_mr_input;
2205                                 rth->u.dst.output = ip_mc_output;
2206                         }
2207                 }
2208 #endif
2209         }
2210
2211         rt_set_nexthop(rth, res, 0);
2212
2213         rth->rt_flags = flags;
2214
2215         *result = rth;
2216  cleanup:
2217         /* release work reference to inet device */
2218         in_dev_put(in_dev);
2219
2220         return err;
2221 }
2222
2223 static inline int ip_mkroute_output(struct rtable **rp,
2224                                     struct fib_result* res,
2225                                     const struct flowi *fl,
2226                                     const struct flowi *oldflp,
2227                                     struct net_device *dev_out,
2228                                     unsigned flags)
2229 {
2230         struct rtable *rth = NULL;
2231         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2232         unsigned hash;
2233         if (err == 0) {
2234                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2235                 err = rt_intern_hash(hash, rth, rp);
2236         }
2237
2238         return err;
2239 }
2240
2241 /*
2242  * Major route resolver routine.
2243  */
2244
2245 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2246                                 const struct flowi *oldflp)
2247 {
2248         u32 tos = RT_FL_TOS(oldflp);
2249         struct flowi fl = { .nl_u = { .ip4_u =
2250                                       { .daddr = oldflp->fl4_dst,
2251                                         .saddr = oldflp->fl4_src,
2252                                         .tos = tos & IPTOS_RT_MASK,
2253                                         .scope = ((tos & RTO_ONLINK) ?
2254                                                   RT_SCOPE_LINK :
2255                                                   RT_SCOPE_UNIVERSE),
2256                                       } },
2257                             .mark = oldflp->mark,
2258                             .iif = net->loopback_dev->ifindex,
2259                             .oif = oldflp->oif };
2260         struct fib_result res;
2261         unsigned flags = 0;
2262         struct net_device *dev_out = NULL;
2263         int free_res = 0;
2264         int err;
2265
2266
2267         res.fi          = NULL;
2268 #ifdef CONFIG_IP_MULTIPLE_TABLES
2269         res.r           = NULL;
2270 #endif
2271
2272         if (oldflp->fl4_src) {
2273                 err = -EINVAL;
2274                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2275                     ipv4_is_lbcast(oldflp->fl4_src) ||
2276                     ipv4_is_zeronet(oldflp->fl4_src))
2277                         goto out;
2278
2279                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2280                 dev_out = ip_dev_find(net, oldflp->fl4_src);
2281                 if (dev_out == NULL)
2282                         goto out;
2283
2284                 /* I removed check for oif == dev_out->oif here.
2285                    It was wrong for two reasons:
2286                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2287                       is assigned to multiple interfaces.
2288                    2. Moreover, we are allowed to send packets with saddr
2289                       of another iface. --ANK
2290                  */
2291
2292                 if (oldflp->oif == 0
2293                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
2294                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2295                         /* Special hack: the user can direct multicasts
2296                            and limited broadcast via the necessary interface
2297                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2298                            This hack is not just for fun, it allows
2299                            vic, vat and friends to work.
2300                            They bind the socket to loopback, set the ttl to zero
2301                            and expect that it will work.
2302                            From the viewpoint of the routing cache they are broken,
2303                            because we are not allowed to build a multicast path
2304                            with a loopback source addr (look, the routing cache
2305                            cannot know that the ttl is zero, so the packet
2306                            will not leave this host and the route is valid).
2307                            Luckily, this hack is a good workaround.
2308                          */
2309
2310                         fl.oif = dev_out->ifindex;
2311                         goto make_route;
2312                 }
2313                 if (dev_out)
2314                         dev_put(dev_out);
2315                 dev_out = NULL;
2316         }
2317
2318
2319         if (oldflp->oif) {
2320                 dev_out = dev_get_by_index(net, oldflp->oif);
2321                 err = -ENODEV;
2322                 if (dev_out == NULL)
2323                         goto out;
2324
2325                 /* RACE: Check return value of inet_select_addr instead. */
2326                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2327                         dev_put(dev_out);
2328                         goto out;       /* Wrong error code */
2329                 }
2330
2331                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2332                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2333                         if (!fl.fl4_src)
2334                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2335                                                               RT_SCOPE_LINK);
2336                         goto make_route;
2337                 }
2338                 if (!fl.fl4_src) {
2339                         if (ipv4_is_multicast(oldflp->fl4_dst))
2340                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2341                                                               fl.fl4_scope);
2342                         else if (!oldflp->fl4_dst)
2343                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2344                                                               RT_SCOPE_HOST);
2345                 }
2346         }
2347
2348         if (!fl.fl4_dst) {
2349                 fl.fl4_dst = fl.fl4_src;
2350                 if (!fl.fl4_dst)
2351                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2352                 if (dev_out)
2353                         dev_put(dev_out);
2354                 dev_out = net->loopback_dev;
2355                 dev_hold(dev_out);
2356                 fl.oif = net->loopback_dev->ifindex;
2357                 res.type = RTN_LOCAL;
2358                 flags |= RTCF_LOCAL;
2359                 goto make_route;
2360         }
2361
2362         if (fib_lookup(net, &fl, &res)) {
2363                 res.fi = NULL;
2364                 if (oldflp->oif) {
2365                         /* Apparently, the routing tables are wrong. Assume
2366                            that the destination is on-link.
2367
2368                            WHY? DW.
2369                            Because we are allowed to send to an iface
2370                            even if it has NO routes and NO assigned
2371                            addresses. When oif is specified, the routing
2372                            tables are looked up with only one purpose:
2373                            to catch whether the destination is gatewayed, rather than
2374                            direct. Moreover, if MSG_DONTROUTE is set,
2375                            we send the packet, ignoring both the routing tables
2376                            and the ifaddr state. --ANK
2377
2378
2379                            We could do this even when oif is unknown,
2380                            as IPv6 likely does, but we do not.
2381                          */
2382
2383                         if (fl.fl4_src == 0)
2384                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2385                                                               RT_SCOPE_LINK);
2386                         res.type = RTN_UNICAST;
2387                         goto make_route;
2388                 }
2389                 if (dev_out)
2390                         dev_put(dev_out);
2391                 err = -ENETUNREACH;
2392                 goto out;
2393         }
2394         free_res = 1;
2395
2396         if (res.type == RTN_LOCAL) {
2397                 if (!fl.fl4_src)
2398                         fl.fl4_src = fl.fl4_dst;
2399                 if (dev_out)
2400                         dev_put(dev_out);
2401                 dev_out = net->loopback_dev;
2402                 dev_hold(dev_out);
2403                 fl.oif = dev_out->ifindex;
2404                 if (res.fi)
2405                         fib_info_put(res.fi);
2406                 res.fi = NULL;
2407                 flags |= RTCF_LOCAL;
2408                 goto make_route;
2409         }
2410
2411 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2412         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2413                 fib_select_multipath(&fl, &res);
2414         else
2415 #endif
2416         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2417                 fib_select_default(net, &fl, &res);
2418
2419         if (!fl.fl4_src)
2420                 fl.fl4_src = FIB_RES_PREFSRC(res);
2421
2422         if (dev_out)
2423                 dev_put(dev_out);
2424         dev_out = FIB_RES_DEV(res);
2425         dev_hold(dev_out);
2426         fl.oif = dev_out->ifindex;
2427
2428
2429 make_route:
2430         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2431
2432
2433         if (free_res)
2434                 fib_res_put(&res);
2435         if (dev_out)
2436                 dev_put(dev_out);
2437 out:    return err;
2438 }
2439
2440 int __ip_route_output_key(struct net *net, struct rtable **rp,
2441                           const struct flowi *flp)
2442 {
2443         unsigned hash;
2444         struct rtable *rth;
2445
2446         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2447
2448         rcu_read_lock_bh();
2449         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2450                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2451                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2452                     rth->fl.fl4_src == flp->fl4_src &&
2453                     rth->fl.iif == 0 &&
2454                     rth->fl.oif == flp->oif &&
2455                     rth->fl.mark == flp->mark &&
2456                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2457                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2458                     rth->u.dst.dev->nd_net == net &&
2459                     rth->rt_genid == atomic_read(&rt_genid)) {
2460                         dst_use(&rth->u.dst, jiffies);
2461                         RT_CACHE_STAT_INC(out_hit);
2462                         rcu_read_unlock_bh();
2463                         *rp = rth;
2464                         return 0;
2465                 }
2466                 RT_CACHE_STAT_INC(out_hlist_search);
2467         }
2468         rcu_read_unlock_bh();
2469
2470         return ip_route_output_slow(net, rp, flp);
2471 }
2472
2473 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2474
2475 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2476 {
2477 }
2478
2479 static struct dst_ops ipv4_dst_blackhole_ops = {
2480         .family                 =       AF_INET,
2481         .protocol               =       __constant_htons(ETH_P_IP),
2482         .destroy                =       ipv4_dst_destroy,
2483         .check                  =       ipv4_dst_check,
2484         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2485         .entry_size             =       sizeof(struct rtable),
2486         .entries                =       ATOMIC_INIT(0),
2487 };
2488
2489
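/*
 * Make a throw-away copy of a route whose input and output handlers simply
 * discard packets.  ip_route_output_flow() below substitutes it when
 * __xfrm_lookup() returns -EREMOTE, so the caller gets a usable dst that
 * silently drops traffic (e.g. while transformation state resolution is
 * still in progress) instead of an error.
 */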
2490 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2491 {
2492         struct rtable *ort = *rp;
2493         struct rtable *rt = (struct rtable *)
2494                 dst_alloc(&ipv4_dst_blackhole_ops);
2495
2496         if (rt) {
2497                 struct dst_entry *new = &rt->u.dst;
2498
2499                 atomic_set(&new->__refcnt, 1);
2500                 new->__use = 1;
2501                 new->input = dst_discard;
2502                 new->output = dst_discard;
2503                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2504
2505                 new->dev = ort->u.dst.dev;
2506                 if (new->dev)
2507                         dev_hold(new->dev);
2508
2509                 rt->fl = ort->fl;
2510
2511                 rt->idev = ort->idev;
2512                 if (rt->idev)
2513                         in_dev_hold(rt->idev);
2514                 rt->rt_genid = atomic_read(&rt_genid);
2515                 rt->rt_flags = ort->rt_flags;
2516                 rt->rt_type = ort->rt_type;
2517                 rt->rt_dst = ort->rt_dst;
2518                 rt->rt_src = ort->rt_src;
2519                 rt->rt_iif = ort->rt_iif;
2520                 rt->rt_gateway = ort->rt_gateway;
2521                 rt->rt_spec_dst = ort->rt_spec_dst;
2522                 rt->peer = ort->peer;
2523                 if (rt->peer)
2524                         atomic_inc(&rt->peer->refcnt);
2525
2526                 dst_free(new);
2527         }
2528
2529         dst_release(&(*rp)->u.dst);
2530         *rp = rt;
2531         return (rt ? 0 : -ENOMEM);
2532 }
2533
2534 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2535                          struct sock *sk, int flags)
2536 {
2537         int err;
2538
2539         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2540                 return err;
2541
2542         if (flp->proto) {
2543                 if (!flp->fl4_src)
2544                         flp->fl4_src = (*rp)->rt_src;
2545                 if (!flp->fl4_dst)
2546                         flp->fl4_dst = (*rp)->rt_dst;
2547                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2548                                     flags ? XFRM_LOOKUP_WAIT : 0);
2549                 if (err == -EREMOTE)
2550                         err = ipv4_dst_blackhole(rp, flp, sk);
2551
2552                 return err;
2553         }
2554
2555         return 0;
2556 }
2557
2558 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2559
2560 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2561 {
2562         return ip_route_output_flow(net, rp, flp, NULL, 0);
2563 }
2564
2565 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2566                         int nowait, unsigned int flags)
2567 {
2568         struct rtable *rt = (struct rtable*)skb->dst;
2569         struct rtmsg *r;
2570         struct nlmsghdr *nlh;
2571         long expires;
2572         u32 id = 0, ts = 0, tsage = 0, error;
2573
2574         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2575         if (nlh == NULL)
2576                 return -EMSGSIZE;
2577
2578         r = nlmsg_data(nlh);
2579         r->rtm_family    = AF_INET;
2580         r->rtm_dst_len  = 32;
2581         r->rtm_src_len  = 0;
2582         r->rtm_tos      = rt->fl.fl4_tos;
2583         r->rtm_table    = RT_TABLE_MAIN;
2584         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2585         r->rtm_type     = rt->rt_type;
2586         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2587         r->rtm_protocol = RTPROT_UNSPEC;
2588         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2589         if (rt->rt_flags & RTCF_NOTIFY)
2590                 r->rtm_flags |= RTM_F_NOTIFY;
2591
2592         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2593
2594         if (rt->fl.fl4_src) {
2595                 r->rtm_src_len = 32;
2596                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2597         }
2598         if (rt->u.dst.dev)
2599                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2600 #ifdef CONFIG_NET_CLS_ROUTE
2601         if (rt->u.dst.tclassid)
2602                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2603 #endif
2604         if (rt->fl.iif)
2605                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2606         else if (rt->rt_src != rt->fl.fl4_src)
2607                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2608
2609         if (rt->rt_dst != rt->rt_gateway)
2610                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2611
2612         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2613                 goto nla_put_failure;
2614
2615         error = rt->u.dst.error;
2616         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2617         if (rt->peer) {
2618                 id = rt->peer->ip_id_count;
2619                 if (rt->peer->tcp_ts_stamp) {
2620                         ts = rt->peer->tcp_ts;
2621                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2622                 }
2623         }
2624
2625         if (rt->fl.iif) {
2626 #ifdef CONFIG_IP_MROUTE
2627                 __be32 dst = rt->rt_dst;
2628
2629                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2630                     IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2631                         int err = ipmr_get_route(skb, r, nowait);
2632                         if (err <= 0) {
2633                                 if (!nowait) {
2634                                         if (err == 0)
2635                                                 return 0;
2636                                         goto nla_put_failure;
2637                                 } else {
2638                                         if (err == -EMSGSIZE)
2639                                                 goto nla_put_failure;
2640                                         error = err;
2641                                 }
2642                         }
2643                 } else
2644 #endif
2645                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2646         }
2647
2648         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2649                                expires, error) < 0)
2650                 goto nla_put_failure;
2651
2652         return nlmsg_end(skb, nlh);
2653
2654 nla_put_failure:
2655         nlmsg_cancel(skb, nlh);
2656         return -EMSGSIZE;
2657 }
2658
2659 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2660 {
2661         struct net *net = in_skb->sk->sk_net;
2662         struct rtmsg *rtm;
2663         struct nlattr *tb[RTA_MAX+1];
2664         struct rtable *rt = NULL;
2665         __be32 dst = 0;
2666         __be32 src = 0;
2667         u32 iif;
2668         int err;
2669         struct sk_buff *skb;
2670
2671         if (net != &init_net)
2672                 return -EINVAL;
2673
2674         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2675         if (err < 0)
2676                 goto errout;
2677
2678         rtm = nlmsg_data(nlh);
2679
2680         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2681         if (skb == NULL) {
2682                 err = -ENOBUFS;
2683                 goto errout;
2684         }
2685
2686         /* Reserve room for dummy headers; this skb can pass
2687            through a good chunk of the routing engine.
2688          */
2689         skb_reset_mac_header(skb);
2690         skb_reset_network_header(skb);
2691
2692         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2693         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2694         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2695
2696         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2697         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2698         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2699
2700         if (iif) {
2701                 struct net_device *dev;
2702
2703                 dev = __dev_get_by_index(&init_net, iif);
2704                 if (dev == NULL) {
2705                         err = -ENODEV;
2706                         goto errout_free;
2707                 }
2708
2709                 skb->protocol   = htons(ETH_P_IP);
2710                 skb->dev        = dev;
2711                 local_bh_disable();
2712                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2713                 local_bh_enable();
2714
2715                 rt = (struct rtable*) skb->dst;
2716                 if (err == 0 && rt->u.dst.error)
2717                         err = -rt->u.dst.error;
2718         } else {
2719                 struct flowi fl = {
2720                         .nl_u = {
2721                                 .ip4_u = {
2722                                         .daddr = dst,
2723                                         .saddr = src,
2724                                         .tos = rtm->rtm_tos,
2725                                 },
2726                         },
2727                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2728                 };
2729                 err = ip_route_output_key(&init_net, &rt, &fl);
2730         }
2731
2732         if (err)
2733                 goto errout_free;
2734
2735         skb->dst = &rt->u.dst;
2736         if (rtm->rtm_flags & RTM_F_NOTIFY)
2737                 rt->rt_flags |= RTCF_NOTIFY;
2738
2739         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2740                                 RTM_NEWROUTE, 0, 0);
2741         if (err <= 0)
2742                 goto errout_free;
2743
2744         err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
2745 errout:
2746         return err;
2747
2748 errout_free:
2749         kfree_skb(skb);
2750         goto errout;
2751 }
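/*
 * Illustration, not part of route.c: a minimal user-space sketch that
 * exercises the RTM_GETROUTE handler above over rtnetlink.  The queried
 * destination, buffer sizes and variable names are illustrative
 * assumptions; error handling is reduced to bare exits.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
        struct {
                struct nlmsghdr nlh;
                struct rtmsg    rtm;
                char            attrs[64];
        } req;
        struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
        struct rtattr *rta;
        char buf[4096];
        int fd, len;

        fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
        if (fd < 0)
                return 1;

        memset(&req, 0, sizeof(req));
        req.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(struct rtmsg));
        req.nlh.nlmsg_type  = RTM_GETROUTE;
        req.nlh.nlmsg_flags = NLM_F_REQUEST;
        req.rtm.rtm_family  = AF_INET;
        req.rtm.rtm_dst_len = 32;

        /* Append an RTA_DST attribute holding the queried destination. */
        rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
        rta->rta_type = RTA_DST;
        rta->rta_len  = RTA_LENGTH(4);
        inet_pton(AF_INET, "192.0.2.1", RTA_DATA(rta));
        req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_LENGTH(4);

        if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
                   (struct sockaddr *)&kernel, sizeof(kernel)) < 0)
                return 1;

        /* The reply is an RTM_NEWROUTE message built by rt_fill_info(). */
        len = recv(fd, buf, sizeof(buf), 0);
        printf("received %d bytes\n", len);
        close(fd);
        return 0;
}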
2752
2753 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2754 {
2755         struct rtable *rt;
2756         int h, s_h;
2757         int idx, s_idx;
2758
2759         s_h = cb->args[0];
2760         if (s_h < 0)
2761                 s_h = 0;
2762         s_idx = idx = cb->args[1];
2763         for (h = s_h; h <= rt_hash_mask; h++) {
2764                 rcu_read_lock_bh();
2765                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2766                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2767                         if (idx < s_idx)
2768                                 continue;
2769                         if (rt->rt_genid != atomic_read(&rt_genid))
2770                                 continue;
2771                         skb->dst = dst_clone(&rt->u.dst);
2772                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2773                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2774                                          1, NLM_F_MULTI) <= 0) {
2775                                 dst_release(xchg(&skb->dst, NULL));
2776                                 rcu_read_unlock_bh();
2777                                 goto done;
2778                         }
2779                         dst_release(xchg(&skb->dst, NULL));
2780                 }
2781                 rcu_read_unlock_bh();
2782                 s_idx = 0;
2783         }
2784
2785 done:
2786         cb->args[0] = h;
2787         cb->args[1] = idx;
2788         return skb->len;
2789 }
2790
2791 void ip_rt_multicast_event(struct in_device *in_dev)
2792 {
2793         rt_cache_flush(0);
2794 }
2795
2796 #ifdef CONFIG_SYSCTL
2797 static int flush_delay;
2798
2799 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2800                                         struct file *filp, void __user *buffer,
2801                                         size_t *lenp, loff_t *ppos)
2802 {
2803         if (write) {
2804                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2805                 rt_cache_flush(flush_delay);
2806                 return 0;
2807         }
2808
2809         return -EINVAL;
2810 }
2811
2812 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2813                                                 int __user *name,
2814                                                 int nlen,
2815                                                 void __user *oldval,
2816                                                 size_t __user *oldlenp,
2817                                                 void __user *newval,
2818                                                 size_t newlen)
2819 {
2820         int delay;
2821         if (newlen != sizeof(int))
2822                 return -EINVAL;
2823         if (get_user(delay, (int __user *)newval))
2824                 return -EFAULT;
2825         rt_cache_flush(delay);
2826         return 0;
2827 }
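/*
 * Illustration, not part of route.c: flushing the routing cache from user
 * space by writing to the "flush" entry registered in ipv4_route_table
 * below.  The /proc path assumes the usual net.ipv4.route placement of this
 * table; the written value is handed to rt_cache_flush() as the flush delay.
 */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/net/ipv4/route/flush", "w");

        if (!f)
                return 1;
        fputs("0\n", f);        /* a delay of 0 requests an immediate flush */
        fclose(f);
        return 0;
}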
2828
2829 ctl_table ipv4_route_table[] = {
2830         {
2831                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2832                 .procname       = "flush",
2833                 .data           = &flush_delay,
2834                 .maxlen         = sizeof(int),
2835                 .mode           = 0200,
2836                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2837                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2838         },
2839         {
2840                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2841                 .procname       = "gc_thresh",
2842                 .data           = &ipv4_dst_ops.gc_thresh,
2843                 .maxlen         = sizeof(int),
2844                 .mode           = 0644,
2845                 .proc_handler   = &proc_dointvec,
2846         },
2847         {
2848                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2849                 .procname       = "max_size",
2850                 .data           = &ip_rt_max_size,
2851                 .maxlen         = sizeof(int),
2852                 .mode           = 0644,
2853                 .proc_handler   = &proc_dointvec,
2854         },
2855         {
2856                 /*  Deprecated. Use gc_min_interval_ms */
2857
2858                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2859                 .procname       = "gc_min_interval",
2860                 .data           = &ip_rt_gc_min_interval,
2861                 .maxlen         = sizeof(int),
2862                 .mode           = 0644,
2863                 .proc_handler   = &proc_dointvec_jiffies,
2864                 .strategy       = &sysctl_jiffies,
2865         },
2866         {
2867                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2868                 .procname       = "gc_min_interval_ms",
2869                 .data           = &ip_rt_gc_min_interval,
2870                 .maxlen         = sizeof(int),
2871                 .mode           = 0644,
2872                 .proc_handler   = &proc_dointvec_ms_jiffies,
2873                 .strategy       = &sysctl_ms_jiffies,
2874         },
2875         {
2876                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2877                 .procname       = "gc_timeout",
2878                 .data           = &ip_rt_gc_timeout,
2879                 .maxlen         = sizeof(int),
2880                 .mode           = 0644,
2881                 .proc_handler   = &proc_dointvec_jiffies,
2882                 .strategy       = &sysctl_jiffies,
2883         },
2884         {
2885                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2886                 .procname       = "gc_interval",
2887                 .data           = &ip_rt_gc_interval,
2888                 .maxlen         = sizeof(int),
2889                 .mode           = 0644,
2890                 .proc_handler   = &proc_dointvec_jiffies,
2891                 .strategy       = &sysctl_jiffies,
2892         },
2893         {
2894                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2895                 .procname       = "redirect_load",
2896                 .data           = &ip_rt_redirect_load,
2897                 .maxlen         = sizeof(int),
2898                 .mode           = 0644,
2899                 .proc_handler   = &proc_dointvec,
2900         },
2901         {
2902                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2903                 .procname       = "redirect_number",
2904                 .data           = &ip_rt_redirect_number,
2905                 .maxlen         = sizeof(int),
2906                 .mode           = 0644,
2907                 .proc_handler   = &proc_dointvec,
2908         },
2909         {
2910                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2911                 .procname       = "redirect_silence",
2912                 .data           = &ip_rt_redirect_silence,
2913                 .maxlen         = sizeof(int),
2914                 .mode           = 0644,
2915                 .proc_handler   = &proc_dointvec,
2916         },
2917         {
2918                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2919                 .procname       = "error_cost",
2920                 .data           = &ip_rt_error_cost,
2921                 .maxlen         = sizeof(int),
2922                 .mode           = 0644,
2923                 .proc_handler   = &proc_dointvec,
2924         },
2925         {
2926                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2927                 .procname       = "error_burst",
2928                 .data           = &ip_rt_error_burst,
2929                 .maxlen         = sizeof(int),
2930                 .mode           = 0644,
2931                 .proc_handler   = &proc_dointvec,
2932         },
2933         {
2934                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2935                 .procname       = "gc_elasticity",
2936                 .data           = &ip_rt_gc_elasticity,
2937                 .maxlen         = sizeof(int),
2938                 .mode           = 0644,
2939                 .proc_handler   = &proc_dointvec,
2940         },
2941         {
2942                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2943                 .procname       = "mtu_expires",
2944                 .data           = &ip_rt_mtu_expires,
2945                 .maxlen         = sizeof(int),
2946                 .mode           = 0644,
2947                 .proc_handler   = &proc_dointvec_jiffies,
2948                 .strategy       = &sysctl_jiffies,
2949         },
2950         {
2951                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2952                 .procname       = "min_pmtu",
2953                 .data           = &ip_rt_min_pmtu,
2954                 .maxlen         = sizeof(int),
2955                 .mode           = 0644,
2956                 .proc_handler   = &proc_dointvec,
2957         },
2958         {
2959                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2960                 .procname       = "min_adv_mss",
2961                 .data           = &ip_rt_min_advmss,
2962                 .maxlen         = sizeof(int),
2963                 .mode           = 0644,
2964                 .proc_handler   = &proc_dointvec,
2965         },
2966         {
2967                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2968                 .procname       = "secret_interval",
2969                 .data           = &ip_rt_secret_interval,
2970                 .maxlen         = sizeof(int),
2971                 .mode           = 0644,
2972                 .proc_handler   = &proc_dointvec_jiffies,
2973                 .strategy       = &sysctl_jiffies,
2974         },
2975         { .ctl_name = 0 }
2976 };
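/*
 * Each entry above appears as a file named after its .procname under
 * /proc/sys/net/ipv4/route/, with permissions taken from .mode.
 * Illustrative usage from user space:
 *
 *      sysctl net.ipv4.route.gc_thresh
 *      sysctl -w net.ipv4.route.min_pmtu=552
 *
 * Entries handled by proc_dointvec_jiffies (gc_min_interval, gc_timeout,
 * gc_interval, mtu_expires, secret_interval) are stored in jiffies but
 * exposed in seconds; gc_min_interval_ms exposes the same variable in
 * milliseconds via proc_dointvec_ms_jiffies.
 */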
2977 #endif
2978
2979 #ifdef CONFIG_NET_CLS_ROUTE
2980 struct ip_rt_acct *ip_rt_acct __read_mostly;
2981 #endif /* CONFIG_NET_CLS_ROUTE */
2982
2983 static __initdata unsigned long rhash_entries;
2984 static int __init set_rhash_entries(char *str)
2985 {
2986         if (!str)
2987                 return 0;
2988         rhash_entries = simple_strtoul(str, &str, 0);
2989         return 1;
2990 }
2991 __setup("rhash_entries=", set_rhash_entries);
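/*
 * The route cache hash size may also be pinned from the kernel command
 * line instead of being derived from memory size in ip_rt_init() below,
 * e.g. (an illustrative value):
 *
 *      rhash_entries=65536
 */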
2992
2993 int __init ip_rt_init(void)
2994 {
2995         int rc = 0;
2996
2997         atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
2998                              (jiffies ^ (jiffies >> 7))));
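	/*
	 * rt_genid is a generation counter for the IPv4 routing cache;
	 * changing it marks existing cache entries stale.  Mixing
	 * num_physpages and jiffies here only makes the initial value
	 * differ from boot to boot.
	 */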
2999
3000 #ifdef CONFIG_NET_CLS_ROUTE
3001         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3002         if (!ip_rt_acct)
3003                 panic("IP: failed to allocate ip_rt_acct\n");
3004 #endif
3005
3006         ipv4_dst_ops.kmem_cachep =
3007                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3008                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3009
3010         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3011
3012         rt_hash_table = (struct rt_hash_bucket *)
3013                 alloc_large_system_hash("IP route cache",
3014                                         sizeof(struct rt_hash_bucket),
3015                                         rhash_entries,
3016                                         (num_physpages >= 128 * 1024) ?
3017                                         15 : 17,
3018                                         0,
3019                                         &rt_hash_log,
3020                                         &rt_hash_mask,
3021                                         0);
3022         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3023         rt_hash_lock_init();
3024
3025         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3026         ip_rt_max_size = (rt_hash_mask + 1) * 16;
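	/*
	 * The GC threshold and hard cap scale with the hash size.  As an
	 * illustration (actual numbers depend on the machine), a table of
	 * 65536 buckets (rt_hash_mask = 65535) yields gc_thresh = 65536
	 * and ip_rt_max_size = 1048576 cached routes.
	 */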
3027
3028         devinet_init();
3029         ip_fib_init();
3030
3031         setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
3032
3033         /* All the timers started at system startup tend
3034            to synchronize. Perturb them a bit.
3035          */
3036         schedule_delayed_work(&expires_work,
3037                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3038
3039         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3040                 ip_rt_secret_interval;
3041         add_timer(&rt_secret_timer);
3042
3043         if (ip_rt_proc_init(&init_net))
3044                 printk(KERN_ERR "Unable to create route proc files\n");
3045 #ifdef CONFIG_XFRM
3046         xfrm_init();
3047         xfrm4_init();
3048 #endif
3049         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3050
3051         return rc;
3052 }
3053
3054 EXPORT_SYMBOL(__ip_select_ident);
3055 EXPORT_SYMBOL(ip_route_input);
3056 EXPORT_SYMBOL(ip_route_output_key);