]> bbs.cooldavid.org Git - net-next-2.6.git/blob - net/ipv4/route.c
[IPV6]: Assorted trivial endianness annotations.
[net-next-2.6.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K 
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *              
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/sched.h>
74 #include <linux/mm.h>
75 #include <linux/bootmem.h>
76 #include <linux/string.h>
77 #include <linux/socket.h>
78 #include <linux/sockios.h>
79 #include <linux/errno.h>
80 #include <linux/in.h>
81 #include <linux/inet.h>
82 #include <linux/netdevice.h>
83 #include <linux/proc_fs.h>
84 #include <linux/init.h>
85 #include <linux/skbuff.h>
86 #include <linux/rtnetlink.h>
87 #include <linux/inetdevice.h>
88 #include <linux/igmp.h>
89 #include <linux/pkt_sched.h>
90 #include <linux/mroute.h>
91 #include <linux/netfilter_ipv4.h>
92 #include <linux/random.h>
93 #include <linux/jhash.h>
94 #include <linux/rcupdate.h>
95 #include <linux/times.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/ip_mp_alg.h>
107 #include <net/netevent.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111
/*
 * Extract the TOS bits selected by IPTOS_RT_MASK, together with the
 * RTO_ONLINK flag, from the flow key pointed to by @oldflp.
 *
 * The argument is parenthesised so that any pointer-valued expression
 * (e.g. "p + 1" or "&fl") can be passed without precedence surprises.
 */
#define RT_FL_TOS(oldflp) \
	((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114
115 #define IP_MAX_MTU      0xFFF0
116
117 #define RT_GC_TIMEOUT (300*HZ)
118
119 static int ip_rt_min_delay              = 2 * HZ;
120 static int ip_rt_max_delay              = 10 * HZ;
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval            = 60 * HZ;
124 static int ip_rt_gc_min_interval        = HZ / 2;
125 static int ip_rt_redirect_number        = 9;
126 static int ip_rt_redirect_load          = HZ / 50;
127 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost             = HZ;
129 static int ip_rt_error_burst            = 5 * HZ;
130 static int ip_rt_gc_elasticity          = 8;
131 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
132 static int ip_rt_min_pmtu               = 512 + 20 + 20;
133 static int ip_rt_min_advmss             = 256;
134 static int ip_rt_secret_interval        = 10 * 60 * HZ;
135 static unsigned long rt_deadline;
136
137 #define RTprint(a...)   printk(KERN_DEBUG a)
138
139 static struct timer_list rt_flush_timer;
140 static struct timer_list rt_periodic_timer;
141 static struct timer_list rt_secret_timer;
142
143 /*
144  *      Interface to generic destination cache.
145  */
146
147 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
148 static void              ipv4_dst_destroy(struct dst_entry *dst);
149 static void              ipv4_dst_ifdown(struct dst_entry *dst,
150                                          struct net_device *dev, int how);
151 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
152 static void              ipv4_link_failure(struct sk_buff *skb);
153 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
154 static int rt_garbage_collect(void);
155
156
157 static struct dst_ops ipv4_dst_ops = {
158         .family =               AF_INET,
159         .protocol =             __constant_htons(ETH_P_IP),
160         .gc =                   rt_garbage_collect,
161         .check =                ipv4_dst_check,
162         .destroy =              ipv4_dst_destroy,
163         .ifdown =               ipv4_dst_ifdown,
164         .negative_advice =      ipv4_negative_advice,
165         .link_failure =         ipv4_link_failure,
166         .update_pmtu =          ip_rt_update_pmtu,
167         .entry_size =           sizeof(struct rtable),
168 };
169
170 #define ECN_OR_COST(class)      TC_PRIO_##class
171
172 __u8 ip_tos2prio[16] = {
173         TC_PRIO_BESTEFFORT,
174         ECN_OR_COST(FILLER),
175         TC_PRIO_BESTEFFORT,
176         ECN_OR_COST(BESTEFFORT),
177         TC_PRIO_BULK,
178         ECN_OR_COST(BULK),
179         TC_PRIO_BULK,
180         ECN_OR_COST(BULK),
181         TC_PRIO_INTERACTIVE,
182         ECN_OR_COST(INTERACTIVE),
183         TC_PRIO_INTERACTIVE,
184         ECN_OR_COST(INTERACTIVE),
185         TC_PRIO_INTERACTIVE_BULK,
186         ECN_OR_COST(INTERACTIVE_BULK),
187         TC_PRIO_INTERACTIVE_BULK,
188         ECN_OR_COST(INTERACTIVE_BULK)
189 };
190
191
192 /*
193  * Route cache.
194  */
195
196 /* The locking scheme is rather straight forward:
197  *
198  * 1) Read-Copy Update protects the buckets of the central route hash.
199  * 2) Only writers remove entries, and they hold the lock
200  *    as they look at rtable reference counts.
201  * 3) Only readers acquire references to rtable entries,
202  *    they do so with atomic increments and with the
203  *    lock held.
204  */
205
/* One hash bucket: head of a singly linked chain of rtable entries. */
struct rt_hash_bucket {
	struct rtable	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;

/*
 * Address of the spinlock guarding bucket @slot.  Several buckets share
 * one lock: the slot index is masked down to the lock-table size.  The
 * expansion is fully parenthesised so it behaves as a single expression.
 */
# define rt_hash_lock_addr(slot) \
	(&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/*
 * Allocate and initialise the lock table; panics if the allocation fails.
 * Wrapped in do { } while (0) so that "rt_hash_lock_init();" is exactly
 * one statement wherever it is used.
 */
# define rt_hash_lock_init()	do { \
		int i; \
		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
			spin_lock_init(&rt_hash_locks[i]); \
	} while (0)
#else
/* No lock table in this configuration: the lock address is NULL. */
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()	do { } while (0)
#endif
245
246 static struct rt_hash_bucket    *rt_hash_table;
247 static unsigned                 rt_hash_mask;
248 static int                      rt_hash_log;
249 static unsigned int             rt_hash_rnd;
250
251 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
252 #define RT_CACHE_STAT_INC(field) \
253         (__raw_get_cpu_var(rt_cache_stat).field++)
254
255 static int rt_intern_hash(unsigned hash, struct rtable *rth,
256                                 struct rtable **res);
257
258 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
259 {
260         return (jhash_2words(daddr, saddr, rt_hash_rnd)
261                 & rt_hash_mask);
262 }
263
264 #define rt_hash(daddr, saddr, idx) \
265         rt_hash_code((__force u32)(__be32)(daddr),\
266                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
267
268 #ifdef CONFIG_PROC_FS
269 struct rt_cache_iter_state {
270         int bucket;
271 };
272
273 static struct rtable *rt_cache_get_first(struct seq_file *seq)
274 {
275         struct rtable *r = NULL;
276         struct rt_cache_iter_state *st = seq->private;
277
278         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
279                 rcu_read_lock_bh();
280                 r = rt_hash_table[st->bucket].chain;
281                 if (r)
282                         break;
283                 rcu_read_unlock_bh();
284         }
285         return r;
286 }
287
288 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
289 {
290         struct rt_cache_iter_state *st = rcu_dereference(seq->private);
291
292         r = r->u.rt_next;
293         while (!r) {
294                 rcu_read_unlock_bh();
295                 if (--st->bucket < 0)
296                         break;
297                 rcu_read_lock_bh();
298                 r = rt_hash_table[st->bucket].chain;
299         }
300         return r;
301 }
302
303 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
304 {
305         struct rtable *r = rt_cache_get_first(seq);
306
307         if (r)
308                 while (pos && (r = rt_cache_get_next(seq, r)))
309                         --pos;
310         return pos ? NULL : r;
311 }
312
313 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
314 {
315         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
316 }
317
318 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
319 {
320         struct rtable *r = NULL;
321
322         if (v == SEQ_START_TOKEN)
323                 r = rt_cache_get_first(seq);
324         else
325                 r = rt_cache_get_next(seq, v);
326         ++*pos;
327         return r;
328 }
329
330 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
331 {
332         if (v && v != SEQ_START_TOKEN)
333                 rcu_read_unlock_bh();
334 }
335
336 static int rt_cache_seq_show(struct seq_file *seq, void *v)
337 {
338         if (v == SEQ_START_TOKEN)
339                 seq_printf(seq, "%-127s\n",
340                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
341                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
342                            "HHUptod\tSpecDst");
343         else {
344                 struct rtable *r = v;
345                 char temp[256];
346
347                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
348                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
349                         r->u.dst.dev ? r->u.dst.dev->name : "*",
350                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
351                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
352                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
353                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
354                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
355                         dst_metric(&r->u.dst, RTAX_WINDOW),
356                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
357                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
358                         r->fl.fl4_tos,
359                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
360                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
361                                        dev_queue_xmit) : 0,
362                         r->rt_spec_dst);
363                 seq_printf(seq, "%-127s\n", temp);
364         }
365         return 0;
366 }
367
368 static struct seq_operations rt_cache_seq_ops = {
369         .start  = rt_cache_seq_start,
370         .next   = rt_cache_seq_next,
371         .stop   = rt_cache_seq_stop,
372         .show   = rt_cache_seq_show,
373 };
374
375 static int rt_cache_seq_open(struct inode *inode, struct file *file)
376 {
377         struct seq_file *seq;
378         int rc = -ENOMEM;
379         struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
380
381         if (!s)
382                 goto out;
383         rc = seq_open(file, &rt_cache_seq_ops);
384         if (rc)
385                 goto out_kfree;
386         seq          = file->private_data;
387         seq->private = s;
388         memset(s, 0, sizeof(*s));
389 out:
390         return rc;
391 out_kfree:
392         kfree(s);
393         goto out;
394 }
395
396 static struct file_operations rt_cache_seq_fops = {
397         .owner   = THIS_MODULE,
398         .open    = rt_cache_seq_open,
399         .read    = seq_read,
400         .llseek  = seq_lseek,
401         .release = seq_release_private,
402 };
403
404
405 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
406 {
407         int cpu;
408
409         if (*pos == 0)
410                 return SEQ_START_TOKEN;
411
412         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
413                 if (!cpu_possible(cpu))
414                         continue;
415                 *pos = cpu+1;
416                 return &per_cpu(rt_cache_stat, cpu);
417         }
418         return NULL;
419 }
420
421 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
422 {
423         int cpu;
424
425         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
426                 if (!cpu_possible(cpu))
427                         continue;
428                 *pos = cpu+1;
429                 return &per_cpu(rt_cache_stat, cpu);
430         }
431         return NULL;
432         
433 }
434
/* seq_file ->stop for the per-CPU statistics file: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
439
440 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
441 {
442         struct rt_cache_stat *st = v;
443
444         if (v == SEQ_START_TOKEN) {
445                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
446                 return 0;
447         }
448         
449         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
450                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
451                    atomic_read(&ipv4_dst_ops.entries),
452                    st->in_hit,
453                    st->in_slow_tot,
454                    st->in_slow_mc,
455                    st->in_no_route,
456                    st->in_brd,
457                    st->in_martian_dst,
458                    st->in_martian_src,
459
460                    st->out_hit,
461                    st->out_slow_tot,
462                    st->out_slow_mc, 
463
464                    st->gc_total,
465                    st->gc_ignored,
466                    st->gc_goal_miss,
467                    st->gc_dst_overflow,
468                    st->in_hlist_search,
469                    st->out_hlist_search
470                 );
471         return 0;
472 }
473
474 static struct seq_operations rt_cpu_seq_ops = {
475         .start  = rt_cpu_seq_start,
476         .next   = rt_cpu_seq_next,
477         .stop   = rt_cpu_seq_stop,
478         .show   = rt_cpu_seq_show,
479 };
480
481
482 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
483 {
484         return seq_open(file, &rt_cpu_seq_ops);
485 }
486
487 static struct file_operations rt_cpu_seq_fops = {
488         .owner   = THIS_MODULE,
489         .open    = rt_cpu_seq_open,
490         .read    = seq_read,
491         .llseek  = seq_lseek,
492         .release = seq_release,
493 };
494
495 #endif /* CONFIG_PROC_FS */
496   
497 static __inline__ void rt_free(struct rtable *rt)
498 {
499         multipath_remove(rt);
500         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
501 }
502
503 static __inline__ void rt_drop(struct rtable *rt)
504 {
505         multipath_remove(rt);
506         ip_rt_put(rt);
507         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
508 }
509
510 static __inline__ int rt_fast_clean(struct rtable *rth)
511 {
512         /* Kill broadcast/multicast entries very aggresively, if they
513            collide in hash table with more useful entries */
514         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
515                 rth->fl.iif && rth->u.rt_next;
516 }
517
518 static __inline__ int rt_valuable(struct rtable *rth)
519 {
520         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
521                 rth->u.dst.expires;
522 }
523
524 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
525 {
526         unsigned long age;
527         int ret = 0;
528
529         if (atomic_read(&rth->u.dst.__refcnt))
530                 goto out;
531
532         ret = 1;
533         if (rth->u.dst.expires &&
534             time_after_eq(jiffies, rth->u.dst.expires))
535                 goto out;
536
537         age = jiffies - rth->u.dst.lastuse;
538         ret = 0;
539         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
540             (age <= tmo2 && rt_valuable(rth)))
541                 goto out;
542         ret = 1;
543 out:    return ret;
544 }
545
546 /* Bits of score are:
547  * 31: very valuable
548  * 30: not quite useless
549  * 29..0: usage counter
550  */
551 static inline u32 rt_score(struct rtable *rt)
552 {
553         u32 score = jiffies - rt->u.dst.lastuse;
554
555         score = ~score & ~(3<<30);
556
557         if (rt_valuable(rt))
558                 score |= (1<<31);
559
560         if (!rt->fl.iif ||
561             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
562                 score |= (1<<30);
563
564         return score;
565 }
566
567 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
568 {
569         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
570                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
571                 (fl1->mark ^ fl2->mark) |
572                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
573                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
574                 (fl1->oif ^ fl2->oif) |
575                 (fl1->iif ^ fl2->iif)) == 0;
576 }
577
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
/*
 * Remove @expentry and every other DST_BALANCED entry with the same flow
 * key from the chain starting at @chain_head, freeing each with rt_free().
 *
 * @removed_count, if non-NULL, receives the number of entries freed
 * (including @expentry).
 *
 * Returns the address of the rt_next link of the first surviving
 * non-balanced entry found at or after @expentry's position, so that the
 * caller can resume its walk there, or NULL if there is none.
 */
static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
						struct rtable *expentry,
						int *removed_count)
{
	struct rtable **link = chain_head;
	struct rtable **resume = NULL;
	struct rtable *cur;
	int seen_expired = 0;

	if (removed_count)
		*removed_count = 0;

	for (cur = *link; cur != NULL; cur = *link) {
		int balanced = (cur->u.dst.flags & DST_BALANCED) != 0;

		if (cur == expentry)
			seen_expired = 1;

		if (balanced && compare_keys(&cur->fl, &expentry->fl)) {
			/* Unlink; @expentry itself is freed after the loop. */
			*link = cur->u.rt_next;
			if (cur != expentry) {
				rt_free(cur);
				if (removed_count)
					++(*removed_count);
			}
			continue;
		}

		if (!balanced && seen_expired && !resume)
			resume = &cur->u.rt_next;

		link = &cur->u.rt_next;
	}

	rt_free(expentry);
	if (removed_count)
		++(*removed_count);

	return resume;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
622
623
624 /* This runs via a timer and thus is always in BH context. */
625 static void rt_check_expire(unsigned long dummy)
626 {
627         static unsigned int rover;
628         unsigned int i = rover, goal;
629         struct rtable *rth, **rthp;
630         unsigned long now = jiffies;
631         u64 mult;
632
633         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
634         if (ip_rt_gc_timeout > 1)
635                 do_div(mult, ip_rt_gc_timeout);
636         goal = (unsigned int)mult;
637         if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
638         for (; goal > 0; goal--) {
639                 unsigned long tmo = ip_rt_gc_timeout;
640
641                 i = (i + 1) & rt_hash_mask;
642                 rthp = &rt_hash_table[i].chain;
643
644                 if (*rthp == 0)
645                         continue;
646                 spin_lock(rt_hash_lock_addr(i));
647                 while ((rth = *rthp) != NULL) {
648                         if (rth->u.dst.expires) {
649                                 /* Entry is expired even if it is in use */
650                                 if (time_before_eq(now, rth->u.dst.expires)) {
651                                         tmo >>= 1;
652                                         rthp = &rth->u.rt_next;
653                                         continue;
654                                 }
655                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
656                                 tmo >>= 1;
657                                 rthp = &rth->u.rt_next;
658                                 continue;
659                         }
660
661                         /* Cleanup aged off entries. */
662 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
663                         /* remove all related balanced entries if necessary */
664                         if (rth->u.dst.flags & DST_BALANCED) {
665                                 rthp = rt_remove_balanced_route(
666                                         &rt_hash_table[i].chain,
667                                         rth, NULL);
668                                 if (!rthp)
669                                         break;
670                         } else {
671                                 *rthp = rth->u.rt_next;
672                                 rt_free(rth);
673                         }
674 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
675                         *rthp = rth->u.rt_next;
676                         rt_free(rth);
677 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
678                 }
679                 spin_unlock(rt_hash_lock_addr(i));
680
681                 /* Fallback loop breaker. */
682                 if (time_after(jiffies, now))
683                         break;
684         }
685         rover = i;
686         mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
687 }
688
689 /* This can run from both BH and non-BH contexts, the latter
690  * in the case of a forced flush event.
691  */
692 static void rt_run_flush(unsigned long dummy)
693 {
694         int i;
695         struct rtable *rth, *next;
696
697         rt_deadline = 0;
698
699         get_random_bytes(&rt_hash_rnd, 4);
700
701         for (i = rt_hash_mask; i >= 0; i--) {
702                 spin_lock_bh(rt_hash_lock_addr(i));
703                 rth = rt_hash_table[i].chain;
704                 if (rth)
705                         rt_hash_table[i].chain = NULL;
706                 spin_unlock_bh(rt_hash_lock_addr(i));
707
708                 for (; rth; rth = next) {
709                         next = rth->u.rt_next;
710                         rt_free(rth);
711                 }
712         }
713 }
714
715 static DEFINE_SPINLOCK(rt_flush_lock);
716
717 void rt_cache_flush(int delay)
718 {
719         unsigned long now = jiffies;
720         int user_mode = !in_softirq();
721
722         if (delay < 0)
723                 delay = ip_rt_min_delay;
724
725         /* flush existing multipath state*/
726         multipath_flush();
727
728         spin_lock_bh(&rt_flush_lock);
729
730         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
731                 long tmo = (long)(rt_deadline - now);
732
733                 /* If flush timer is already running
734                    and flush request is not immediate (delay > 0):
735
736                    if deadline is not achieved, prolongate timer to "delay",
737                    otherwise fire it at deadline time.
738                  */
739
740                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
741                         tmo = 0;
742                 
743                 if (delay > tmo)
744                         delay = tmo;
745         }
746
747         if (delay <= 0) {
748                 spin_unlock_bh(&rt_flush_lock);
749                 rt_run_flush(0);
750                 return;
751         }
752
753         if (rt_deadline == 0)
754                 rt_deadline = now + ip_rt_max_delay;
755
756         mod_timer(&rt_flush_timer, now+delay);
757         spin_unlock_bh(&rt_flush_lock);
758 }
759
760 static void rt_secret_rebuild(unsigned long dummy)
761 {
762         unsigned long now = jiffies;
763
764         rt_cache_flush(0);
765         mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
766 }
767
/*
   Short description of GC goals.

   We want an algorithm that keeps the routing cache at an
   equilibrium point, where the number of aged-off entries
   is approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when networking
   is idle "expire" is large enough to keep enough warm entries,
   and when load increases it is reduced to limit the cache size.
 */
780
781 static int rt_garbage_collect(void)
782 {
783         static unsigned long expire = RT_GC_TIMEOUT;
784         static unsigned long last_gc;
785         static int rover;
786         static int equilibrium;
787         struct rtable *rth, **rthp;
788         unsigned long now = jiffies;
789         int goal;
790
791         /*
792          * Garbage collection is pretty expensive,
793          * do not make it too frequently.
794          */
795
796         RT_CACHE_STAT_INC(gc_total);
797
798         if (now - last_gc < ip_rt_gc_min_interval &&
799             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
800                 RT_CACHE_STAT_INC(gc_ignored);
801                 goto out;
802         }
803
804         /* Calculate number of entries, which we want to expire now. */
805         goal = atomic_read(&ipv4_dst_ops.entries) -
806                 (ip_rt_gc_elasticity << rt_hash_log);
807         if (goal <= 0) {
808                 if (equilibrium < ipv4_dst_ops.gc_thresh)
809                         equilibrium = ipv4_dst_ops.gc_thresh;
810                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
811                 if (goal > 0) {
812                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
813                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
814                 }
815         } else {
816                 /* We are in dangerous area. Try to reduce cache really
817                  * aggressively.
818                  */
819                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
820                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
821         }
822
823         if (now - last_gc >= ip_rt_gc_min_interval)
824                 last_gc = now;
825
826         if (goal <= 0) {
827                 equilibrium += goal;
828                 goto work_done;
829         }
830
831         do {
832                 int i, k;
833
834                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
835                         unsigned long tmo = expire;
836
837                         k = (k + 1) & rt_hash_mask;
838                         rthp = &rt_hash_table[k].chain;
839                         spin_lock_bh(rt_hash_lock_addr(k));
840                         while ((rth = *rthp) != NULL) {
841                                 if (!rt_may_expire(rth, tmo, expire)) {
842                                         tmo >>= 1;
843                                         rthp = &rth->u.rt_next;
844                                         continue;
845                                 }
846 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
847                                 /* remove all related balanced entries
848                                  * if necessary
849                                  */
850                                 if (rth->u.dst.flags & DST_BALANCED) {
851                                         int r;
852
853                                         rthp = rt_remove_balanced_route(
854                                                 &rt_hash_table[k].chain,
855                                                 rth,
856                                                 &r);
857                                         goal -= r;
858                                         if (!rthp)
859                                                 break;
860                                 } else {
861                                         *rthp = rth->u.rt_next;
862                                         rt_free(rth);
863                                         goal--;
864                                 }
865 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
866                                 *rthp = rth->u.rt_next;
867                                 rt_free(rth);
868                                 goal--;
869 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
870                         }
871                         spin_unlock_bh(rt_hash_lock_addr(k));
872                         if (goal <= 0)
873                                 break;
874                 }
875                 rover = k;
876
877                 if (goal <= 0)
878                         goto work_done;
879
880                 /* Goal is not achieved. We stop process if:
881
882                    - if expire reduced to zero. Otherwise, expire is halfed.
883                    - if table is not full.
884                    - if we are called from interrupt.
885                    - jiffies check is just fallback/debug loop breaker.
886                      We will not spin here for long time in any case.
887                  */
888
889                 RT_CACHE_STAT_INC(gc_goal_miss);
890
891                 if (expire == 0)
892                         break;
893
894                 expire >>= 1;
895 #if RT_CACHE_DEBUG >= 2
896                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
897                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
898 #endif
899
900                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
901                         goto out;
902         } while (!in_softirq() && time_before_eq(jiffies, now));
903
904         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
905                 goto out;
906         if (net_ratelimit())
907                 printk(KERN_WARNING "dst cache overflow\n");
908         RT_CACHE_STAT_INC(gc_dst_overflow);
909         return 1;
910
911 work_done:
912         expire += ip_rt_gc_min_interval;
913         if (expire > ip_rt_gc_timeout ||
914             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
915                 expire = ip_rt_gc_timeout;
916 #if RT_CACHE_DEBUG >= 2
917         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
918                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
919 #endif
920 out:    return 0;
921 }
922
923 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
924 {
925         struct rtable   *rth, **rthp;
926         unsigned long   now;
927         struct rtable *cand, **candp;
928         u32             min_score;
929         int             chain_length;
930         int attempts = !in_softirq();
931
932 restart:
933         chain_length = 0;
934         min_score = ~(u32)0;
935         cand = NULL;
936         candp = NULL;
937         now = jiffies;
938
939         rthp = &rt_hash_table[hash].chain;
940
941         spin_lock_bh(rt_hash_lock_addr(hash));
942         while ((rth = *rthp) != NULL) {
943 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
944                 if (!(rth->u.dst.flags & DST_BALANCED) &&
945                     compare_keys(&rth->fl, &rt->fl)) {
946 #else
947                 if (compare_keys(&rth->fl, &rt->fl)) {
948 #endif
949                         /* Put it first */
950                         *rthp = rth->u.rt_next;
951                         /*
952                          * Since lookup is lockfree, the deletion
953                          * must be visible to another weakly ordered CPU before
954                          * the insertion at the start of the hash chain.
955                          */
956                         rcu_assign_pointer(rth->u.rt_next,
957                                            rt_hash_table[hash].chain);
958                         /*
959                          * Since lookup is lockfree, the update writes
960                          * must be ordered for consistency on SMP.
961                          */
962                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
963
964                         rth->u.dst.__use++;
965                         dst_hold(&rth->u.dst);
966                         rth->u.dst.lastuse = now;
967                         spin_unlock_bh(rt_hash_lock_addr(hash));
968
969                         rt_drop(rt);
970                         *rp = rth;
971                         return 0;
972                 }
973
974                 if (!atomic_read(&rth->u.dst.__refcnt)) {
975                         u32 score = rt_score(rth);
976
977                         if (score <= min_score) {
978                                 cand = rth;
979                                 candp = rthp;
980                                 min_score = score;
981                         }
982                 }
983
984                 chain_length++;
985
986                 rthp = &rth->u.rt_next;
987         }
988
989         if (cand) {
990                 /* ip_rt_gc_elasticity used to be average length of chain
991                  * length, when exceeded gc becomes really aggressive.
992                  *
993                  * The second limit is less certain. At the moment it allows
994                  * only 2 entries per bucket. We will see.
995                  */
996                 if (chain_length > ip_rt_gc_elasticity) {
997                         *candp = cand->u.rt_next;
998                         rt_free(cand);
999                 }
1000         }
1001
1002         /* Try to bind route to arp only if it is output
1003            route or unicast forwarding path.
1004          */
1005         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1006                 int err = arp_bind_neighbour(&rt->u.dst);
1007                 if (err) {
1008                         spin_unlock_bh(rt_hash_lock_addr(hash));
1009
1010                         if (err != -ENOBUFS) {
1011                                 rt_drop(rt);
1012                                 return err;
1013                         }
1014
1015                         /* Neighbour tables are full and nothing
1016                            can be released. Try to shrink route cache,
1017                            it is most likely it holds some neighbour records.
1018                          */
1019                         if (attempts-- > 0) {
1020                                 int saved_elasticity = ip_rt_gc_elasticity;
1021                                 int saved_int = ip_rt_gc_min_interval;
1022                                 ip_rt_gc_elasticity     = 1;
1023                                 ip_rt_gc_min_interval   = 0;
1024                                 rt_garbage_collect();
1025                                 ip_rt_gc_min_interval   = saved_int;
1026                                 ip_rt_gc_elasticity     = saved_elasticity;
1027                                 goto restart;
1028                         }
1029
1030                         if (net_ratelimit())
1031                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1032                         rt_drop(rt);
1033                         return -ENOBUFS;
1034                 }
1035         }
1036
1037         rt->u.rt_next = rt_hash_table[hash].chain;
1038 #if RT_CACHE_DEBUG >= 2
1039         if (rt->u.rt_next) {
1040                 struct rtable *trt;
1041                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1042                        NIPQUAD(rt->rt_dst));
1043                 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1044                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1045                 printk("\n");
1046         }
1047 #endif
1048         rt_hash_table[hash].chain = rt;
1049         spin_unlock_bh(rt_hash_lock_addr(hash));
1050         *rp = rt;
1051         return 0;
1052 }
1053
1054 void rt_bind_peer(struct rtable *rt, int create)
1055 {
1056         static DEFINE_SPINLOCK(rt_peer_lock);
1057         struct inet_peer *peer;
1058
1059         peer = inet_getpeer(rt->rt_dst, create);
1060
1061         spin_lock_bh(&rt_peer_lock);
1062         if (rt->peer == NULL) {
1063                 rt->peer = peer;
1064                 peer = NULL;
1065         }
1066         spin_unlock_bh(&rt_peer_lock);
1067         if (peer)
1068                 inet_putpeer(peer);
1069 }
1070
1071 /*
1072  * Peer allocation may fail only in serious out-of-memory conditions.  However
1073  * we still can generate some output.
1074  * Random ID selection looks a bit dangerous because we have no chances to
1075  * select ID being unique in a reasonable period of time.
1076  * But broken packet identifier may be better than no packet at all.
1077  */
1078 static void ip_select_fb_ident(struct iphdr *iph)
1079 {
1080         static DEFINE_SPINLOCK(ip_fb_id_lock);
1081         static u32 ip_fallback_id;
1082         u32 salt;
1083
1084         spin_lock_bh(&ip_fb_id_lock);
1085         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1086         iph->id = htons(salt & 0xFFFF);
1087         ip_fallback_id = salt;
1088         spin_unlock_bh(&ip_fb_id_lock);
1089 }
1090
1091 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1092 {
1093         struct rtable *rt = (struct rtable *) dst;
1094
1095         if (rt) {
1096                 if (rt->peer == NULL)
1097                         rt_bind_peer(rt, 1);
1098
1099                 /* If peer is attached to destination, it is never detached,
1100                    so that we need not to grab a lock to dereference it.
1101                  */
1102                 if (rt->peer) {
1103                         iph->id = htons(inet_getid(rt->peer, more));
1104                         return;
1105                 }
1106         } else
1107                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", 
1108                        __builtin_return_address(0));
1109
1110         ip_select_fb_ident(iph);
1111 }
1112
1113 static void rt_del(unsigned hash, struct rtable *rt)
1114 {
1115         struct rtable **rthp;
1116
1117         spin_lock_bh(rt_hash_lock_addr(hash));
1118         ip_rt_put(rt);
1119         for (rthp = &rt_hash_table[hash].chain; *rthp;
1120              rthp = &(*rthp)->u.rt_next)
1121                 if (*rthp == rt) {
1122                         *rthp = rt->u.rt_next;
1123                         rt_free(rt);
1124                         break;
1125                 }
1126         spin_unlock_bh(rt_hash_lock_addr(hash));
1127 }
1128
1129 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1130                     __be32 saddr, struct net_device *dev)
1131 {
1132         int i, k;
1133         struct in_device *in_dev = in_dev_get(dev);
1134         struct rtable *rth, **rthp;
1135         __be32  skeys[2] = { saddr, 0 };
1136         int  ikeys[2] = { dev->ifindex, 0 };
1137         struct netevent_redirect netevent;
1138
1139         if (!in_dev)
1140                 return;
1141
1142         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1143             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1144                 goto reject_redirect;
1145
1146         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1147                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1148                         goto reject_redirect;
1149                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1150                         goto reject_redirect;
1151         } else {
1152                 if (inet_addr_type(new_gw) != RTN_UNICAST)
1153                         goto reject_redirect;
1154         }
1155
1156         for (i = 0; i < 2; i++) {
1157                 for (k = 0; k < 2; k++) {
1158                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1159
1160                         rthp=&rt_hash_table[hash].chain;
1161
1162                         rcu_read_lock();
1163                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1164                                 struct rtable *rt;
1165
1166                                 if (rth->fl.fl4_dst != daddr ||
1167                                     rth->fl.fl4_src != skeys[i] ||
1168                                     rth->fl.oif != ikeys[k] ||
1169                                     rth->fl.iif != 0) {
1170                                         rthp = &rth->u.rt_next;
1171                                         continue;
1172                                 }
1173
1174                                 if (rth->rt_dst != daddr ||
1175                                     rth->rt_src != saddr ||
1176                                     rth->u.dst.error ||
1177                                     rth->rt_gateway != old_gw ||
1178                                     rth->u.dst.dev != dev)
1179                                         break;
1180
1181                                 dst_hold(&rth->u.dst);
1182                                 rcu_read_unlock();
1183
1184                                 rt = dst_alloc(&ipv4_dst_ops);
1185                                 if (rt == NULL) {
1186                                         ip_rt_put(rth);
1187                                         in_dev_put(in_dev);
1188                                         return;
1189                                 }
1190
1191                                 /* Copy all the information. */
1192                                 *rt = *rth;
1193                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1194                                 rt->u.dst.__use         = 1;
1195                                 atomic_set(&rt->u.dst.__refcnt, 1);
1196                                 rt->u.dst.child         = NULL;
1197                                 if (rt->u.dst.dev)
1198                                         dev_hold(rt->u.dst.dev);
1199                                 if (rt->idev)
1200                                         in_dev_hold(rt->idev);
1201                                 rt->u.dst.obsolete      = 0;
1202                                 rt->u.dst.lastuse       = jiffies;
1203                                 rt->u.dst.path          = &rt->u.dst;
1204                                 rt->u.dst.neighbour     = NULL;
1205                                 rt->u.dst.hh            = NULL;
1206                                 rt->u.dst.xfrm          = NULL;
1207
1208                                 rt->rt_flags            |= RTCF_REDIRECTED;
1209
1210                                 /* Gateway is different ... */
1211                                 rt->rt_gateway          = new_gw;
1212
1213                                 /* Redirect received -> path was valid */
1214                                 dst_confirm(&rth->u.dst);
1215
1216                                 if (rt->peer)
1217                                         atomic_inc(&rt->peer->refcnt);
1218
1219                                 if (arp_bind_neighbour(&rt->u.dst) ||
1220                                     !(rt->u.dst.neighbour->nud_state &
1221                                             NUD_VALID)) {
1222                                         if (rt->u.dst.neighbour)
1223                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1224                                         ip_rt_put(rth);
1225                                         rt_drop(rt);
1226                                         goto do_next;
1227                                 }
1228                                 
1229                                 netevent.old = &rth->u.dst;
1230                                 netevent.new = &rt->u.dst;
1231                                 call_netevent_notifiers(NETEVENT_REDIRECT, 
1232                                                         &netevent);
1233
1234                                 rt_del(hash, rth);
1235                                 if (!rt_intern_hash(hash, rt, &rt))
1236                                         ip_rt_put(rt);
1237                                 goto do_next;
1238                         }
1239                         rcu_read_unlock();
1240                 do_next:
1241                         ;
1242                 }
1243         }
1244         in_dev_put(in_dev);
1245         return;
1246
1247 reject_redirect:
1248 #ifdef CONFIG_IP_ROUTE_VERBOSE
1249         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1250                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1251                         "%u.%u.%u.%u ignored.\n"
1252                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1253                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1254                        NIPQUAD(saddr), NIPQUAD(daddr));
1255 #endif
1256         in_dev_put(in_dev);
1257 }
1258
1259 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1260 {
1261         struct rtable *rt = (struct rtable*)dst;
1262         struct dst_entry *ret = dst;
1263
1264         if (rt) {
1265                 if (dst->obsolete) {
1266                         ip_rt_put(rt);
1267                         ret = NULL;
1268                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1269                            rt->u.dst.expires) {
1270                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1271                                                 rt->fl.oif);
1272 #if RT_CACHE_DEBUG >= 1
1273                         printk(KERN_DEBUG "ip_rt_advice: redirect to "
1274                                           "%u.%u.%u.%u/%02x dropped\n",
1275                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1276 #endif
1277                         rt_del(hash, rt);
1278                         ret = NULL;
1279                 }
1280         }
1281         return ret;
1282 }
1283
1284 /*
1285  * Algorithm:
1286  *      1. The first ip_rt_redirect_number redirects are sent
1287  *         with exponential backoff, then we stop sending them at all,
1288  *         assuming that the host ignores our redirects.
1289  *      2. If we did not see packets requiring redirects
1290  *         during ip_rt_redirect_silence, we assume that the host
1291  *         forgot redirected route and start to send redirects again.
1292  *
1293  * This algorithm is much cheaper and more intelligent than dumb load limiting
1294  * in icmp.c.
1295  *
1296  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1297  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1298  */
1299
1300 void ip_rt_send_redirect(struct sk_buff *skb)
1301 {
1302         struct rtable *rt = (struct rtable*)skb->dst;
1303         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1304
1305         if (!in_dev)
1306                 return;
1307
1308         if (!IN_DEV_TX_REDIRECTS(in_dev))
1309                 goto out;
1310
1311         /* No redirected packets during ip_rt_redirect_silence;
1312          * reset the algorithm.
1313          */
1314         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1315                 rt->u.dst.rate_tokens = 0;
1316
1317         /* Too many ignored redirects; do not send anything
1318          * set u.dst.rate_last to the last seen redirected packet.
1319          */
1320         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1321                 rt->u.dst.rate_last = jiffies;
1322                 goto out;
1323         }
1324
1325         /* Check for load limit; set rate_last to the latest sent
1326          * redirect.
1327          */
1328         if (time_after(jiffies,
1329                        (rt->u.dst.rate_last +
1330                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1331                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1332                 rt->u.dst.rate_last = jiffies;
1333                 ++rt->u.dst.rate_tokens;
1334 #ifdef CONFIG_IP_ROUTE_VERBOSE
1335                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1336                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1337                     net_ratelimit())
1338                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1339                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1340                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1341                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1342 #endif
1343         }
1344 out:
1345         in_dev_put(in_dev);
1346 }
1347
1348 static int ip_error(struct sk_buff *skb)
1349 {
1350         struct rtable *rt = (struct rtable*)skb->dst;
1351         unsigned long now;
1352         int code;
1353
1354         switch (rt->u.dst.error) {
1355                 case EINVAL:
1356                 default:
1357                         goto out;
1358                 case EHOSTUNREACH:
1359                         code = ICMP_HOST_UNREACH;
1360                         break;
1361                 case ENETUNREACH:
1362                         code = ICMP_NET_UNREACH;
1363                         break;
1364                 case EACCES:
1365                         code = ICMP_PKT_FILTERED;
1366                         break;
1367         }
1368
1369         now = jiffies;
1370         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1371         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1372                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1373         rt->u.dst.rate_last = now;
1374         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1375                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1376                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1377         }
1378
1379 out:    kfree_skb(skb);
1380         return 0;
1381
1382
/*
 *	Candidate path MTU values, largest first.  The final two entries
 *	do not come from the RFC; they are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002,
	1492, 576, 296, 216, 128
};
1390
1391 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1392 {
1393         int i;
1394         
1395         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1396                 if (old_mtu > mtu_plateau[i])
1397                         return mtu_plateau[i];
1398         return 68;
1399 }
1400
1401 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1402 {
1403         int i;
1404         unsigned short old_mtu = ntohs(iph->tot_len);
1405         struct rtable *rth;
1406         __be32  skeys[2] = { iph->saddr, 0, };
1407         __be32  daddr = iph->daddr;
1408         unsigned short est_mtu = 0;
1409
1410         if (ipv4_config.no_pmtu_disc)
1411                 return 0;
1412
1413         for (i = 0; i < 2; i++) {
1414                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1415
1416                 rcu_read_lock();
1417                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1418                      rth = rcu_dereference(rth->u.rt_next)) {
1419                         if (rth->fl.fl4_dst == daddr &&
1420                             rth->fl.fl4_src == skeys[i] &&
1421                             rth->rt_dst  == daddr &&
1422                             rth->rt_src  == iph->saddr &&
1423                             rth->fl.iif == 0 &&
1424                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1425                                 unsigned short mtu = new_mtu;
1426
1427                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1428
1429                                         /* BSD 4.2 compatibility hack :-( */
1430                                         if (mtu == 0 &&
1431                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1432                                             old_mtu >= 68 + (iph->ihl << 2))
1433                                                 old_mtu -= iph->ihl << 2;
1434
1435                                         mtu = guess_mtu(old_mtu);
1436                                 }
1437                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1438                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) { 
1439                                                 dst_confirm(&rth->u.dst);
1440                                                 if (mtu < ip_rt_min_pmtu) {
1441                                                         mtu = ip_rt_min_pmtu;
1442                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1443                                                                 (1 << RTAX_MTU);
1444                                                 }
1445                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1446                                                 dst_set_expires(&rth->u.dst,
1447                                                         ip_rt_mtu_expires);
1448                                         }
1449                                         est_mtu = mtu;
1450                                 }
1451                         }
1452                 }
1453                 rcu_read_unlock();
1454         }
1455         return est_mtu ? : new_mtu;
1456 }
1457
1458 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1459 {
1460         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1461             !(dst_metric_locked(dst, RTAX_MTU))) {
1462                 if (mtu < ip_rt_min_pmtu) {
1463                         mtu = ip_rt_min_pmtu;
1464                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1465                 }
1466                 dst->metrics[RTAX_MTU-1] = mtu;
1467                 dst_set_expires(dst, ip_rt_mtu_expires);
1468                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1469         }
1470 }
1471
1472 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1473 {
1474         return NULL;
1475 }
1476
1477 static void ipv4_dst_destroy(struct dst_entry *dst)
1478 {
1479         struct rtable *rt = (struct rtable *) dst;
1480         struct inet_peer *peer = rt->peer;
1481         struct in_device *idev = rt->idev;
1482
1483         if (peer) {
1484                 rt->peer = NULL;
1485                 inet_putpeer(peer);
1486         }
1487
1488         if (idev) {
1489                 rt->idev = NULL;
1490                 in_dev_put(idev);
1491         }
1492 }
1493
1494 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1495                             int how)
1496 {
1497         struct rtable *rt = (struct rtable *) dst;
1498         struct in_device *idev = rt->idev;
1499         if (dev != &loopback_dev && idev && idev->dev == dev) {
1500                 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1501                 if (loopback_idev) {
1502                         rt->idev = loopback_idev;
1503                         in_dev_put(idev);
1504                 }
1505         }
1506 }
1507
1508 static void ipv4_link_failure(struct sk_buff *skb)
1509 {
1510         struct rtable *rt;
1511
1512         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1513
1514         rt = (struct rtable *) skb->dst;
1515         if (rt)
1516                 dst_set_expires(&rt->u.dst, 0);
1517 }
1518
1519 static int ip_rt_bug(struct sk_buff *skb)
1520 {
1521         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1522                 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1523                 skb->dev ? skb->dev->name : "?");
1524         kfree_skb(skb);
1525         return 0;
1526 }
1527
1528 /*
1529    We do not cache source address of outgoing interface,
1530    because it is used only by IP RR, TS and SRR options,
1531    so that it out of fast path.
1532
1533    BTW remember: "addr" is allowed to be not aligned
1534    in IP options!
1535  */
1536
1537 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1538 {
1539         __be32 src;
1540         struct fib_result res;
1541
1542         if (rt->fl.iif == 0)
1543                 src = rt->rt_src;
1544         else if (fib_lookup(&rt->fl, &res) == 0) {
1545                 src = FIB_RES_PREFSRC(res);
1546                 fib_res_put(&res);
1547         } else
1548                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1549                                         RT_SCOPE_UNIVERSE);
1550         memcpy(addr, &src, 4);
1551 }
1552
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid.  The low and the high 16 bits are
 * handled independently, and a half is only filled in while it is still
 * zero, so values set earlier are never overwritten.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->u.dst.tclassid & low_half) == 0)
		rt->u.dst.tclassid |= tag & low_half;
	if ((rt->u.dst.tclassid & high_half) == 0)
		rt->u.dst.tclassid |= tag & high_half;
}
#endif
1562
1563 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1564 {
1565         struct fib_info *fi = res->fi;
1566
1567         if (fi) {
1568                 if (FIB_RES_GW(*res) &&
1569                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1570                         rt->rt_gateway = FIB_RES_GW(*res);
1571                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1572                        sizeof(rt->u.dst.metrics));
1573                 if (fi->fib_mtu == 0) {
1574                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1575                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1576                             rt->rt_gateway != rt->rt_dst &&
1577                             rt->u.dst.dev->mtu > 576)
1578                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1579                 }
1580 #ifdef CONFIG_NET_CLS_ROUTE
1581                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1582 #endif
1583         } else
1584                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1585
1586         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1587                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1588         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1589                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1590         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1591                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1592                                        ip_rt_min_advmss);
1593         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1594                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1595
1596 #ifdef CONFIG_NET_CLS_ROUTE
1597 #ifdef CONFIG_IP_MULTIPLE_TABLES
1598         set_class_tag(rt, fib_rules_tclass(res));
1599 #endif
1600         set_class_tag(rt, itag);
1601 #endif
1602         rt->rt_type = res->type;
1603 }
1604
1605 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1606                                 u8 tos, struct net_device *dev, int our)
1607 {
1608         unsigned hash;
1609         struct rtable *rth;
1610         __be32 spec_dst;
1611         struct in_device *in_dev = in_dev_get(dev);
1612         u32 itag = 0;
1613
1614         /* Primary sanity checks. */
1615
1616         if (in_dev == NULL)
1617                 return -EINVAL;
1618
1619         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1620             skb->protocol != htons(ETH_P_IP))
1621                 goto e_inval;
1622
1623         if (ZERONET(saddr)) {
1624                 if (!LOCAL_MCAST(daddr))
1625                         goto e_inval;
1626                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1627         } else if (fib_validate_source(saddr, 0, tos, 0,
1628                                         dev, &spec_dst, &itag) < 0)
1629                 goto e_inval;
1630
1631         rth = dst_alloc(&ipv4_dst_ops);
1632         if (!rth)
1633                 goto e_nobufs;
1634
1635         rth->u.dst.output= ip_rt_bug;
1636
1637         atomic_set(&rth->u.dst.__refcnt, 1);
1638         rth->u.dst.flags= DST_HOST;
1639         if (in_dev->cnf.no_policy)
1640                 rth->u.dst.flags |= DST_NOPOLICY;
1641         rth->fl.fl4_dst = daddr;
1642         rth->rt_dst     = daddr;
1643         rth->fl.fl4_tos = tos;
1644         rth->fl.mark    = skb->mark;
1645         rth->fl.fl4_src = saddr;
1646         rth->rt_src     = saddr;
1647 #ifdef CONFIG_NET_CLS_ROUTE
1648         rth->u.dst.tclassid = itag;
1649 #endif
1650         rth->rt_iif     =
1651         rth->fl.iif     = dev->ifindex;
1652         rth->u.dst.dev  = &loopback_dev;
1653         dev_hold(rth->u.dst.dev);
1654         rth->idev       = in_dev_get(rth->u.dst.dev);
1655         rth->fl.oif     = 0;
1656         rth->rt_gateway = daddr;
1657         rth->rt_spec_dst= spec_dst;
1658         rth->rt_type    = RTN_MULTICAST;
1659         rth->rt_flags   = RTCF_MULTICAST;
1660         if (our) {
1661                 rth->u.dst.input= ip_local_deliver;
1662                 rth->rt_flags |= RTCF_LOCAL;
1663         }
1664
1665 #ifdef CONFIG_IP_MROUTE
1666         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1667                 rth->u.dst.input = ip_mr_input;
1668 #endif
1669         RT_CACHE_STAT_INC(in_slow_mc);
1670
1671         in_dev_put(in_dev);
1672         hash = rt_hash(daddr, saddr, dev->ifindex);
1673         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1674
1675 e_nobufs:
1676         in_dev_put(in_dev);
1677         return -ENOBUFS;
1678
1679 e_inval:
1680         in_dev_put(in_dev);
1681         return -EINVAL;
1682 }
1683
1684
1685 static void ip_handle_martian_source(struct net_device *dev,
1686                                      struct in_device *in_dev,
1687                                      struct sk_buff *skb,
1688                                      __be32 daddr,
1689                                      __be32 saddr)
1690 {
1691         RT_CACHE_STAT_INC(in_martian_src);
1692 #ifdef CONFIG_IP_ROUTE_VERBOSE
1693         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1694                 /*
1695                  *      RFC1812 recommendation, if source is martian,
1696                  *      the only hint is MAC header.
1697                  */
1698                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1699                         "%u.%u.%u.%u, on dev %s\n",
1700                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1701                 if (dev->hard_header_len && skb->mac.raw) {
1702                         int i;
1703                         unsigned char *p = skb->mac.raw;
1704                         printk(KERN_WARNING "ll header: ");
1705                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1706                                 printk("%02x", *p);
1707                                 if (i < (dev->hard_header_len - 1))
1708                                         printk(":");
1709                         }
1710                         printk("\n");
1711                 }
1712         }
1713 #endif
1714 }
1715
1716 static inline int __mkroute_input(struct sk_buff *skb, 
1717                                   struct fib_result* res, 
1718                                   struct in_device *in_dev, 
1719                                   __be32 daddr, __be32 saddr, u32 tos,
1720                                   struct rtable **result) 
1721 {
1722
1723         struct rtable *rth;
1724         int err;
1725         struct in_device *out_dev;
1726         unsigned flags = 0;
1727         __be32 spec_dst;
1728         u32 itag;
1729
1730         /* get a working reference to the output device */
1731         out_dev = in_dev_get(FIB_RES_DEV(*res));
1732         if (out_dev == NULL) {
1733                 if (net_ratelimit())
1734                         printk(KERN_CRIT "Bug in ip_route_input" \
1735                                "_slow(). Please, report\n");
1736                 return -EINVAL;
1737         }
1738
1739
1740         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), 
1741                                   in_dev->dev, &spec_dst, &itag);
1742         if (err < 0) {
1743                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 
1744                                          saddr);
1745                 
1746                 err = -EINVAL;
1747                 goto cleanup;
1748         }
1749
1750         if (err)
1751                 flags |= RTCF_DIRECTSRC;
1752
1753         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1754             (IN_DEV_SHARED_MEDIA(out_dev) ||
1755              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1756                 flags |= RTCF_DOREDIRECT;
1757
1758         if (skb->protocol != htons(ETH_P_IP)) {
1759                 /* Not IP (i.e. ARP). Do not create route, if it is
1760                  * invalid for proxy arp. DNAT routes are always valid.
1761                  */
1762                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1763                         err = -EINVAL;
1764                         goto cleanup;
1765                 }
1766         }
1767
1768
1769         rth = dst_alloc(&ipv4_dst_ops);
1770         if (!rth) {
1771                 err = -ENOBUFS;
1772                 goto cleanup;
1773         }
1774
1775         atomic_set(&rth->u.dst.__refcnt, 1);
1776         rth->u.dst.flags= DST_HOST;
1777 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1778         if (res->fi->fib_nhs > 1)
1779                 rth->u.dst.flags |= DST_BALANCED;
1780 #endif
1781         if (in_dev->cnf.no_policy)
1782                 rth->u.dst.flags |= DST_NOPOLICY;
1783         if (in_dev->cnf.no_xfrm)
1784                 rth->u.dst.flags |= DST_NOXFRM;
1785         rth->fl.fl4_dst = daddr;
1786         rth->rt_dst     = daddr;
1787         rth->fl.fl4_tos = tos;
1788         rth->fl.mark    = skb->mark;
1789         rth->fl.fl4_src = saddr;
1790         rth->rt_src     = saddr;
1791         rth->rt_gateway = daddr;
1792         rth->rt_iif     =
1793                 rth->fl.iif     = in_dev->dev->ifindex;
1794         rth->u.dst.dev  = (out_dev)->dev;
1795         dev_hold(rth->u.dst.dev);
1796         rth->idev       = in_dev_get(rth->u.dst.dev);
1797         rth->fl.oif     = 0;
1798         rth->rt_spec_dst= spec_dst;
1799
1800         rth->u.dst.input = ip_forward;
1801         rth->u.dst.output = ip_output;
1802
1803         rt_set_nexthop(rth, res, itag);
1804
1805         rth->rt_flags = flags;
1806
1807         *result = rth;
1808         err = 0;
1809  cleanup:
1810         /* release the working reference to the output device */
1811         in_dev_put(out_dev);
1812         return err;
1813 }                                               
1814
1815 static inline int ip_mkroute_input_def(struct sk_buff *skb, 
1816                                        struct fib_result* res, 
1817                                        const struct flowi *fl,
1818                                        struct in_device *in_dev,
1819                                        __be32 daddr, __be32 saddr, u32 tos)
1820 {
1821         struct rtable* rth = NULL;
1822         int err;
1823         unsigned hash;
1824
1825 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1826         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1827                 fib_select_multipath(fl, res);
1828 #endif
1829
1830         /* create a routing cache entry */
1831         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1832         if (err)
1833                 return err;
1834
1835         /* put it into the cache */
1836         hash = rt_hash(daddr, saddr, fl->iif);
1837         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);   
1838 }
1839
1840 static inline int ip_mkroute_input(struct sk_buff *skb, 
1841                                    struct fib_result* res, 
1842                                    const struct flowi *fl,
1843                                    struct in_device *in_dev,
1844                                    __be32 daddr, __be32 saddr, u32 tos)
1845 {
1846 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1847         struct rtable* rth = NULL, *rtres;
1848         unsigned char hop, hopcount;
1849         int err = -EINVAL;
1850         unsigned int hash;
1851
1852         if (res->fi)
1853                 hopcount = res->fi->fib_nhs;
1854         else
1855                 hopcount = 1;
1856
1857         /* distinguish between multipath and singlepath */
1858         if (hopcount < 2)
1859                 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1860                                             saddr, tos);
1861         
1862         /* add all alternatives to the routing cache */
1863         for (hop = 0; hop < hopcount; hop++) {
1864                 res->nh_sel = hop;
1865
1866                 /* put reference to previous result */
1867                 if (hop)
1868                         ip_rt_put(rtres);
1869
1870                 /* create a routing cache entry */
1871                 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1872                                       &rth);
1873                 if (err)
1874                         return err;
1875
1876                 /* put it into the cache */
1877                 hash = rt_hash(daddr, saddr, fl->iif);
1878                 err = rt_intern_hash(hash, rth, &rtres);
1879                 if (err)
1880                         return err;
1881
1882                 /* forward hop information to multipath impl. */
1883                 multipath_set_nhinfo(rth,
1884                                      FIB_RES_NETWORK(*res),
1885                                      FIB_RES_NETMASK(*res),
1886                                      res->prefixlen,
1887                                      &FIB_RES_NH(*res));
1888         }
1889         skb->dst = &rtres->u.dst;
1890         return err;
1891 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1892         return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1893 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
1894 }
1895
1896
1897 /*
1898  *      NOTE. We drop all the packets that has local source
1899  *      addresses, because every properly looped back packet
1900  *      must have correct destination already attached by output routine.
1901  *
1902  *      Such approach solves two big problems:
1903  *      1. Not simplex devices are handled properly.
1904  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1905  */
1906
1907 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1908                                u8 tos, struct net_device *dev)
1909 {
1910         struct fib_result res;
1911         struct in_device *in_dev = in_dev_get(dev);
1912         struct flowi fl = { .nl_u = { .ip4_u =
1913                                       { .daddr = daddr,
1914                                         .saddr = saddr,
1915                                         .tos = tos,
1916                                         .scope = RT_SCOPE_UNIVERSE,
1917                                       } },
1918                             .mark = skb->mark,
1919                             .iif = dev->ifindex };
1920         unsigned        flags = 0;
1921         u32             itag = 0;
1922         struct rtable * rth;
1923         unsigned        hash;
1924         __be32          spec_dst;
1925         int             err = -EINVAL;
1926         int             free_res = 0;
1927
1928         /* IP on this device is disabled. */
1929
1930         if (!in_dev)
1931                 goto out;
1932
1933         /* Check for the most weird martians, which can be not detected
1934            by fib_lookup.
1935          */
1936
1937         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1938                 goto martian_source;
1939
1940         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1941                 goto brd_input;
1942
1943         /* Accept zero addresses only to limited broadcast;
1944          * I even do not know to fix it or not. Waiting for complains :-)
1945          */
1946         if (ZERONET(saddr))
1947                 goto martian_source;
1948
1949         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1950                 goto martian_destination;
1951
1952         /*
1953          *      Now we are ready to route packet.
1954          */
1955         if ((err = fib_lookup(&fl, &res)) != 0) {
1956                 if (!IN_DEV_FORWARD(in_dev))
1957                         goto e_hostunreach;
1958                 goto no_route;
1959         }
1960         free_res = 1;
1961
1962         RT_CACHE_STAT_INC(in_slow_tot);
1963
1964         if (res.type == RTN_BROADCAST)
1965                 goto brd_input;
1966
1967         if (res.type == RTN_LOCAL) {
1968                 int result;
1969                 result = fib_validate_source(saddr, daddr, tos,
1970                                              loopback_dev.ifindex,
1971                                              dev, &spec_dst, &itag);
1972                 if (result < 0)
1973                         goto martian_source;
1974                 if (result)
1975                         flags |= RTCF_DIRECTSRC;
1976                 spec_dst = daddr;
1977                 goto local_input;
1978         }
1979
1980         if (!IN_DEV_FORWARD(in_dev))
1981                 goto e_hostunreach;
1982         if (res.type != RTN_UNICAST)
1983                 goto martian_destination;
1984
1985         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1986         if (err == -ENOBUFS)
1987                 goto e_nobufs;
1988         if (err == -EINVAL)
1989                 goto e_inval;
1990         
1991 done:
1992         in_dev_put(in_dev);
1993         if (free_res)
1994                 fib_res_put(&res);
1995 out:    return err;
1996
1997 brd_input:
1998         if (skb->protocol != htons(ETH_P_IP))
1999                 goto e_inval;
2000
2001         if (ZERONET(saddr))
2002                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2003         else {
2004                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2005                                           &itag);
2006                 if (err < 0)
2007                         goto martian_source;
2008                 if (err)
2009                         flags |= RTCF_DIRECTSRC;
2010         }
2011         flags |= RTCF_BROADCAST;
2012         res.type = RTN_BROADCAST;
2013         RT_CACHE_STAT_INC(in_brd);
2014
2015 local_input:
2016         rth = dst_alloc(&ipv4_dst_ops);
2017         if (!rth)
2018                 goto e_nobufs;
2019
2020         rth->u.dst.output= ip_rt_bug;
2021
2022         atomic_set(&rth->u.dst.__refcnt, 1);
2023         rth->u.dst.flags= DST_HOST;
2024         if (in_dev->cnf.no_policy)
2025                 rth->u.dst.flags |= DST_NOPOLICY;
2026         rth->fl.fl4_dst = daddr;
2027         rth->rt_dst     = daddr;
2028         rth->fl.fl4_tos = tos;
2029         rth->fl.mark    = skb->mark;
2030         rth->fl.fl4_src = saddr;
2031         rth->rt_src     = saddr;
2032 #ifdef CONFIG_NET_CLS_ROUTE
2033         rth->u.dst.tclassid = itag;
2034 #endif
2035         rth->rt_iif     =
2036         rth->fl.iif     = dev->ifindex;
2037         rth->u.dst.dev  = &loopback_dev;
2038         dev_hold(rth->u.dst.dev);
2039         rth->idev       = in_dev_get(rth->u.dst.dev);
2040         rth->rt_gateway = daddr;
2041         rth->rt_spec_dst= spec_dst;
2042         rth->u.dst.input= ip_local_deliver;
2043         rth->rt_flags   = flags|RTCF_LOCAL;
2044         if (res.type == RTN_UNREACHABLE) {
2045                 rth->u.dst.input= ip_error;
2046                 rth->u.dst.error= -err;
2047                 rth->rt_flags   &= ~RTCF_LOCAL;
2048         }
2049         rth->rt_type    = res.type;
2050         hash = rt_hash(daddr, saddr, fl.iif);
2051         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2052         goto done;
2053
2054 no_route:
2055         RT_CACHE_STAT_INC(in_no_route);
2056         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2057         res.type = RTN_UNREACHABLE;
2058         goto local_input;
2059
2060         /*
2061          *      Do not cache martian addresses: they should be logged (RFC1812)
2062          */
2063 martian_destination:
2064         RT_CACHE_STAT_INC(in_martian_dst);
2065 #ifdef CONFIG_IP_ROUTE_VERBOSE
2066         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2067                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2068                         "%u.%u.%u.%u, dev %s\n",
2069                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2070 #endif
2071
2072 e_hostunreach:
2073         err = -EHOSTUNREACH;
2074         goto done;
2075
2076 e_inval:
2077         err = -EINVAL;
2078         goto done;
2079
2080 e_nobufs:
2081         err = -ENOBUFS;
2082         goto done;
2083
2084 martian_source:
2085         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2086         goto e_inval;
2087 }
2088
2089 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2090                    u8 tos, struct net_device *dev)
2091 {
2092         struct rtable * rth;
2093         unsigned        hash;
2094         int iif = dev->ifindex;
2095
2096         tos &= IPTOS_RT_MASK;
2097         hash = rt_hash(daddr, saddr, iif);
2098
2099         rcu_read_lock();
2100         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2101              rth = rcu_dereference(rth->u.rt_next)) {
2102                 if (rth->fl.fl4_dst == daddr &&
2103                     rth->fl.fl4_src == saddr &&
2104                     rth->fl.iif == iif &&
2105                     rth->fl.oif == 0 &&
2106                     rth->fl.mark == skb->mark &&
2107                     rth->fl.fl4_tos == tos) {
2108                         rth->u.dst.lastuse = jiffies;
2109                         dst_hold(&rth->u.dst);
2110                         rth->u.dst.__use++;
2111                         RT_CACHE_STAT_INC(in_hit);
2112                         rcu_read_unlock();
2113                         skb->dst = (struct dst_entry*)rth;
2114                         return 0;
2115                 }
2116                 RT_CACHE_STAT_INC(in_hlist_search);
2117         }
2118         rcu_read_unlock();
2119
2120         /* Multicast recognition logic is moved from route cache to here.
2121            The problem was that too many Ethernet cards have broken/missing
2122            hardware multicast filters :-( As result the host on multicasting
2123            network acquires a lot of useless route cache entries, sort of
2124            SDR messages from all the world. Now we try to get rid of them.
2125            Really, provided software IP multicast filter is organized
2126            reasonably (at least, hashed), it does not result in a slowdown
2127            comparing with route cache reject entries.
2128            Note, that multicast routers are not affected, because
2129            route cache entry is created eventually.
2130          */
2131         if (MULTICAST(daddr)) {
2132                 struct in_device *in_dev;
2133
2134                 rcu_read_lock();
2135                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2136                         int our = ip_check_mc(in_dev, daddr, saddr,
2137                                 skb->nh.iph->protocol);
2138                         if (our
2139 #ifdef CONFIG_IP_MROUTE
2140                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2141 #endif
2142                             ) {
2143                                 rcu_read_unlock();
2144                                 return ip_route_input_mc(skb, daddr, saddr,
2145                                                          tos, dev, our);
2146                         }
2147                 }
2148                 rcu_read_unlock();
2149                 return -EINVAL;
2150         }
2151         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2152 }
2153
2154 static inline int __mkroute_output(struct rtable **result,
2155                                    struct fib_result* res, 
2156                                    const struct flowi *fl,
2157                                    const struct flowi *oldflp, 
2158                                    struct net_device *dev_out, 
2159                                    unsigned flags) 
2160 {
2161         struct rtable *rth;
2162         struct in_device *in_dev;
2163         u32 tos = RT_FL_TOS(oldflp);
2164         int err = 0;
2165
2166         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2167                 return -EINVAL;
2168
2169         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2170                 res->type = RTN_BROADCAST;
2171         else if (MULTICAST(fl->fl4_dst))
2172                 res->type = RTN_MULTICAST;
2173         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2174                 return -EINVAL;
2175
2176         if (dev_out->flags & IFF_LOOPBACK)
2177                 flags |= RTCF_LOCAL;
2178
2179         /* get work reference to inet device */
2180         in_dev = in_dev_get(dev_out);
2181         if (!in_dev)
2182                 return -EINVAL;
2183
2184         if (res->type == RTN_BROADCAST) {
2185                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2186                 if (res->fi) {
2187                         fib_info_put(res->fi);
2188                         res->fi = NULL;
2189                 }
2190         } else if (res->type == RTN_MULTICAST) {
2191                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2192                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, 
2193                                  oldflp->proto))
2194                         flags &= ~RTCF_LOCAL;
2195                 /* If multicast route do not exist use
2196                    default one, but do not gateway in this case.
2197                    Yes, it is hack.
2198                  */
2199                 if (res->fi && res->prefixlen < 4) {
2200                         fib_info_put(res->fi);
2201                         res->fi = NULL;
2202                 }
2203         }
2204
2205
2206         rth = dst_alloc(&ipv4_dst_ops);
2207         if (!rth) {
2208                 err = -ENOBUFS;
2209                 goto cleanup;
2210         }               
2211
2212         atomic_set(&rth->u.dst.__refcnt, 1);
2213         rth->u.dst.flags= DST_HOST;
2214 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2215         if (res->fi) {
2216                 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2217                 if (res->fi->fib_nhs > 1)
2218                         rth->u.dst.flags |= DST_BALANCED;
2219         }
2220 #endif
2221         if (in_dev->cnf.no_xfrm)
2222                 rth->u.dst.flags |= DST_NOXFRM;
2223         if (in_dev->cnf.no_policy)
2224                 rth->u.dst.flags |= DST_NOPOLICY;
2225
2226         rth->fl.fl4_dst = oldflp->fl4_dst;
2227         rth->fl.fl4_tos = tos;
2228         rth->fl.fl4_src = oldflp->fl4_src;
2229         rth->fl.oif     = oldflp->oif;
2230         rth->fl.mark    = oldflp->mark;
2231         rth->rt_dst     = fl->fl4_dst;
2232         rth->rt_src     = fl->fl4_src;
2233         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2234         /* get references to the devices that are to be hold by the routing 
2235            cache entry */
2236         rth->u.dst.dev  = dev_out;
2237         dev_hold(dev_out);
2238         rth->idev       = in_dev_get(dev_out);
2239         rth->rt_gateway = fl->fl4_dst;
2240         rth->rt_spec_dst= fl->fl4_src;
2241
2242         rth->u.dst.output=ip_output;
2243
2244         RT_CACHE_STAT_INC(out_slow_tot);
2245
2246         if (flags & RTCF_LOCAL) {
2247                 rth->u.dst.input = ip_local_deliver;
2248                 rth->rt_spec_dst = fl->fl4_dst;
2249         }
2250         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2251                 rth->rt_spec_dst = fl->fl4_src;
2252                 if (flags & RTCF_LOCAL && 
2253                     !(dev_out->flags & IFF_LOOPBACK)) {
2254                         rth->u.dst.output = ip_mc_output;
2255                         RT_CACHE_STAT_INC(out_slow_mc);
2256                 }
2257 #ifdef CONFIG_IP_MROUTE
2258                 if (res->type == RTN_MULTICAST) {
2259                         if (IN_DEV_MFORWARD(in_dev) &&
2260                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2261                                 rth->u.dst.input = ip_mr_input;
2262                                 rth->u.dst.output = ip_mc_output;
2263                         }
2264                 }
2265 #endif
2266         }
2267
2268         rt_set_nexthop(rth, res, 0);
2269
2270         rth->rt_flags = flags;
2271
2272         *result = rth;
2273  cleanup:
2274         /* release work reference to inet device */
2275         in_dev_put(in_dev);
2276
2277         return err;
2278 }
2279
2280 static inline int ip_mkroute_output_def(struct rtable **rp,
2281                                         struct fib_result* res,
2282                                         const struct flowi *fl,
2283                                         const struct flowi *oldflp,
2284                                         struct net_device *dev_out,
2285                                         unsigned flags)
2286 {
2287         struct rtable *rth = NULL;
2288         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2289         unsigned hash;
2290         if (err == 0) {
2291                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2292                 err = rt_intern_hash(hash, rth, rp);
2293         }
2294         
2295         return err;
2296 }
2297
/*
 * Create the output route cache entry (or entries) for a resolved flow.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED, or when the route has a single
 * next hop, this is ip_mkroute_output_def().  Otherwise one entry per next
 * hop is created and inserted, each is reported to the multipath
 * implementation, and *rp is left pointing at the entry of the last hop.
 *
 * Returns 0 on success or the first error from __mkroute_output() /
 * rt_intern_hash().
 */
static inline int ip_mkroute_output(struct rtable **rp,
				    struct fib_result *res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			/* put reference to previous result */
			if (hop)
				ip_rt_put(*rp);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);
			if (err != 0)
				goto cleanup;

			hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
				       oldflp->oif);
			err = rt_intern_hash(hash, rth, rp);
			/*
			 * Do not touch rth once insertion has failed: the
			 * input-side counterpart bails out here as well, and
			 * rt_intern_hash() (defined elsewhere in this file)
			 * is expected to have disposed of the entry.
			 */
			if (err != 0)
				goto cleanup;

			/* forward hop information to multipath impl. */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
2359
2360 /*
2361  * Major route resolver routine.
2362  */
2363
2364 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2365 {
2366         u32 tos = RT_FL_TOS(oldflp);
2367         struct flowi fl = { .nl_u = { .ip4_u =
2368                                       { .daddr = oldflp->fl4_dst,
2369                                         .saddr = oldflp->fl4_src,
2370                                         .tos = tos & IPTOS_RT_MASK,
2371                                         .scope = ((tos & RTO_ONLINK) ?
2372                                                   RT_SCOPE_LINK :
2373                                                   RT_SCOPE_UNIVERSE),
2374                                       } },
2375                             .mark = oldflp->mark,
2376                             .iif = loopback_dev.ifindex,
2377                             .oif = oldflp->oif };
2378         struct fib_result res;
2379         unsigned flags = 0;
2380         struct net_device *dev_out = NULL;
2381         int free_res = 0;
2382         int err;
2383
2384
2385         res.fi          = NULL;
2386 #ifdef CONFIG_IP_MULTIPLE_TABLES
2387         res.r           = NULL;
2388 #endif
2389
2390         if (oldflp->fl4_src) {
2391                 err = -EINVAL;
2392                 if (MULTICAST(oldflp->fl4_src) ||
2393                     BADCLASS(oldflp->fl4_src) ||
2394                     ZERONET(oldflp->fl4_src))
2395                         goto out;
2396
2397                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2398                 dev_out = ip_dev_find(oldflp->fl4_src);
2399                 if (dev_out == NULL)
2400                         goto out;
2401
2402                 /* I removed check for oif == dev_out->oif here.
2403                    It was wrong for two reasons:
2404                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
2405                       assigned to multiple interfaces.
2406                    2. Moreover, we are allowed to send packets with saddr
2407                       of another iface. --ANK
2408                  */
2409
2410                 if (oldflp->oif == 0
2411                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2412                         /* Special hack: user can direct multicasts
2413                            and limited broadcast via necessary interface
2414                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2415                            This hack is not just for fun, it allows
2416                            vic,vat and friends to work.
2417                            They bind socket to loopback, set ttl to zero
2418                            and expect that it will work.
2419                            From the viewpoint of routing cache they are broken,
2420                            because we are not allowed to build multicast path
2421                            with loopback source addr (look, routing cache
2422                            cannot know, that ttl is zero, so that packet
2423                            will not leave this host and route is valid).
2424                            Luckily, this hack is good workaround.
2425                          */
2426
2427                         fl.oif = dev_out->ifindex;
2428                         goto make_route;
2429                 }
2430                 if (dev_out)
2431                         dev_put(dev_out);
2432                 dev_out = NULL;
2433         }
2434
2435
2436         if (oldflp->oif) {
2437                 dev_out = dev_get_by_index(oldflp->oif);
2438                 err = -ENODEV;
2439                 if (dev_out == NULL)
2440                         goto out;
2441
2442                 /* RACE: Check return value of inet_select_addr instead. */
2443                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2444                         dev_put(dev_out);
2445                         goto out;       /* Wrong error code */
2446                 }
2447
2448                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2449                         if (!fl.fl4_src)
2450                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2451                                                               RT_SCOPE_LINK);
2452                         goto make_route;
2453                 }
2454                 if (!fl.fl4_src) {
2455                         if (MULTICAST(oldflp->fl4_dst))
2456                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2457                                                               fl.fl4_scope);
2458                         else if (!oldflp->fl4_dst)
2459                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2460                                                               RT_SCOPE_HOST);
2461                 }
2462         }
2463
2464         if (!fl.fl4_dst) {
2465                 fl.fl4_dst = fl.fl4_src;
2466                 if (!fl.fl4_dst)
2467                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2468                 if (dev_out)
2469                         dev_put(dev_out);
2470                 dev_out = &loopback_dev;
2471                 dev_hold(dev_out);
2472                 fl.oif = loopback_dev.ifindex;
2473                 res.type = RTN_LOCAL;
2474                 flags |= RTCF_LOCAL;
2475                 goto make_route;
2476         }
2477
2478         if (fib_lookup(&fl, &res)) {
2479                 res.fi = NULL;
2480                 if (oldflp->oif) {
2481                         /* Apparently, routing tables are wrong. Assume,
2482                            that the destination is on link.
2483
2484                            WHY? DW.
2485                            Because we are allowed to send to iface
2486                            even if it has NO routes and NO assigned
2487                            addresses. When oif is specified, routing
2488                            tables are looked up with only one purpose:
2489                            to catch if destination is gatewayed, rather than
2490                            direct. Moreover, if MSG_DONTROUTE is set,
2491                            we send packet, ignoring both routing tables
2492                            and ifaddr state. --ANK
2493
2494
2495                            We could make it even if oif is unknown,
2496                            likely IPv6, but we do not.
2497                          */
2498
2499                         if (fl.fl4_src == 0)
2500                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2501                                                               RT_SCOPE_LINK);
2502                         res.type = RTN_UNICAST;
2503                         goto make_route;
2504                 }
2505                 if (dev_out)
2506                         dev_put(dev_out);
2507                 err = -ENETUNREACH;
2508                 goto out;
2509         }
2510         free_res = 1;
2511
2512         if (res.type == RTN_LOCAL) {
2513                 if (!fl.fl4_src)
2514                         fl.fl4_src = fl.fl4_dst;
2515                 if (dev_out)
2516                         dev_put(dev_out);
2517                 dev_out = &loopback_dev;
2518                 dev_hold(dev_out);
2519                 fl.oif = dev_out->ifindex;
2520                 if (res.fi)
2521                         fib_info_put(res.fi);
2522                 res.fi = NULL;
2523                 flags |= RTCF_LOCAL;
2524                 goto make_route;
2525         }
2526
2527 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2528         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2529                 fib_select_multipath(&fl, &res);
2530         else
2531 #endif
2532         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2533                 fib_select_default(&fl, &res);
2534
2535         if (!fl.fl4_src)
2536                 fl.fl4_src = FIB_RES_PREFSRC(res);
2537
2538         if (dev_out)
2539                 dev_put(dev_out);
2540         dev_out = FIB_RES_DEV(res);
2541         dev_hold(dev_out);
2542         fl.oif = dev_out->ifindex;
2543
2544
2545 make_route:
2546         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2547
2548
2549         if (free_res)
2550                 fib_res_put(&res);
2551         if (dev_out)
2552                 dev_put(dev_out);
2553 out:    return err;
2554 }
2555
2556 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2557 {
2558         unsigned hash;
2559         struct rtable *rth;
2560
2561         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2562
2563         rcu_read_lock_bh();
2564         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2565                 rth = rcu_dereference(rth->u.rt_next)) {
2566                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2567                     rth->fl.fl4_src == flp->fl4_src &&
2568                     rth->fl.iif == 0 &&
2569                     rth->fl.oif == flp->oif &&
2570                     rth->fl.mark == flp->mark &&
2571                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2572                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2573
2574                         /* check for multipath routes and choose one if
2575                          * necessary
2576                          */
2577                         if (multipath_select_route(flp, rth, rp)) {
2578                                 dst_hold(&(*rp)->u.dst);
2579                                 RT_CACHE_STAT_INC(out_hit);
2580                                 rcu_read_unlock_bh();
2581                                 return 0;
2582                         }
2583
2584                         rth->u.dst.lastuse = jiffies;
2585                         dst_hold(&rth->u.dst);
2586                         rth->u.dst.__use++;
2587                         RT_CACHE_STAT_INC(out_hit);
2588                         rcu_read_unlock_bh();
2589                         *rp = rth;
2590                         return 0;
2591                 }
2592                 RT_CACHE_STAT_INC(out_hlist_search);
2593         }
2594         rcu_read_unlock_bh();
2595
2596         return ip_route_output_slow(rp, flp);
2597 }
2598
2599 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2600
2601 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2602 {
2603         int err;
2604
2605         if ((err = __ip_route_output_key(rp, flp)) != 0)
2606                 return err;
2607
2608         if (flp->proto) {
2609                 if (!flp->fl4_src)
2610                         flp->fl4_src = (*rp)->rt_src;
2611                 if (!flp->fl4_dst)
2612                         flp->fl4_dst = (*rp)->rt_dst;
2613                 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2614         }
2615
2616         return 0;
2617 }
2618
2619 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2620
2621 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2622 {
2623         return ip_route_output_flow(rp, flp, NULL, 0);
2624 }
2625
2626 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2627                         int nowait, unsigned int flags)
2628 {
2629         struct rtable *rt = (struct rtable*)skb->dst;
2630         struct rtmsg *r;
2631         struct nlmsghdr *nlh;
2632         struct rta_cacheinfo ci;
2633
2634         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2635         if (nlh == NULL)
2636                 return -ENOBUFS;
2637
2638         r = nlmsg_data(nlh);
2639         r->rtm_family    = AF_INET;
2640         r->rtm_dst_len  = 32;
2641         r->rtm_src_len  = 0;
2642         r->rtm_tos      = rt->fl.fl4_tos;
2643         r->rtm_table    = RT_TABLE_MAIN;
2644         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2645         r->rtm_type     = rt->rt_type;
2646         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2647         r->rtm_protocol = RTPROT_UNSPEC;
2648         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2649         if (rt->rt_flags & RTCF_NOTIFY)
2650                 r->rtm_flags |= RTM_F_NOTIFY;
2651
2652         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2653
2654         if (rt->fl.fl4_src) {
2655                 r->rtm_src_len = 32;
2656                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2657         }
2658         if (rt->u.dst.dev)
2659                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2660 #ifdef CONFIG_NET_CLS_ROUTE
2661         if (rt->u.dst.tclassid)
2662                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2663 #endif
2664 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2665         if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
2666                 NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
2667 #endif
2668         if (rt->fl.iif)
2669                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2670         else if (rt->rt_src != rt->fl.fl4_src)
2671                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2672
2673         if (rt->rt_dst != rt->rt_gateway)
2674                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2675
2676         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2677                 goto nla_put_failure;
2678
2679         ci.rta_lastuse  = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2680         ci.rta_used     = rt->u.dst.__use;
2681         ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
2682         if (rt->u.dst.expires)
2683                 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2684         else
2685                 ci.rta_expires = 0;
2686         ci.rta_error    = rt->u.dst.error;
2687         ci.rta_id       = ci.rta_ts = ci.rta_tsage = 0;
2688         if (rt->peer) {
2689                 ci.rta_id = rt->peer->ip_id_count;
2690                 if (rt->peer->tcp_ts_stamp) {
2691                         ci.rta_ts = rt->peer->tcp_ts;
2692                         ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2693                 }
2694         }
2695
2696         if (rt->fl.iif) {
2697 #ifdef CONFIG_IP_MROUTE
2698                 __be32 dst = rt->rt_dst;
2699
2700                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2701                     ipv4_devconf.mc_forwarding) {
2702                         int err = ipmr_get_route(skb, r, nowait);
2703                         if (err <= 0) {
2704                                 if (!nowait) {
2705                                         if (err == 0)
2706                                                 return 0;
2707                                         goto nla_put_failure;
2708                                 } else {
2709                                         if (err == -EMSGSIZE)
2710                                                 goto nla_put_failure;
2711                                         ci.rta_error = err;
2712                                 }
2713                         }
2714                 } else
2715 #endif
2716                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2717         }
2718
2719         NLA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2720
2721         return nlmsg_end(skb, nlh);
2722
2723 nla_put_failure:
2724         return nlmsg_cancel(skb, nlh);
2725 }
2726
2727 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2728 {
2729         struct rtmsg *rtm;
2730         struct nlattr *tb[RTA_MAX+1];
2731         struct rtable *rt = NULL;
2732         __be32 dst = 0;
2733         __be32 src = 0;
2734         u32 iif;
2735         int err;
2736         struct sk_buff *skb;
2737
2738         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2739         if (err < 0)
2740                 goto errout;
2741
2742         rtm = nlmsg_data(nlh);
2743
2744         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2745         if (skb == NULL) {
2746                 err = -ENOBUFS;
2747                 goto errout;
2748         }
2749
2750         /* Reserve room for dummy headers, this skb can pass
2751            through good chunk of routing engine.
2752          */
2753         skb->mac.raw = skb->nh.raw = skb->data;
2754
2755         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2756         skb->nh.iph->protocol = IPPROTO_ICMP;
2757         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2758
2759         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2760         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2761         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2762
2763         if (iif) {
2764                 struct net_device *dev;
2765
2766                 dev = __dev_get_by_index(iif);
2767                 if (dev == NULL) {
2768                         err = -ENODEV;
2769                         goto errout_free;
2770                 }
2771
2772                 skb->protocol   = htons(ETH_P_IP);
2773                 skb->dev        = dev;
2774                 local_bh_disable();
2775                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2776                 local_bh_enable();
2777
2778                 rt = (struct rtable*) skb->dst;
2779                 if (err == 0 && rt->u.dst.error)
2780                         err = -rt->u.dst.error;
2781         } else {
2782                 struct flowi fl = {
2783                         .nl_u = {
2784                                 .ip4_u = {
2785                                         .daddr = dst,
2786                                         .saddr = src,
2787                                         .tos = rtm->rtm_tos,
2788                                 },
2789                         },
2790                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2791                 };
2792                 err = ip_route_output_key(&rt, &fl);
2793         }
2794
2795         if (err)
2796                 goto errout_free;
2797
2798         skb->dst = &rt->u.dst;
2799         if (rtm->rtm_flags & RTM_F_NOTIFY)
2800                 rt->rt_flags |= RTCF_NOTIFY;
2801
2802         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2803                                 RTM_NEWROUTE, 0, 0);
2804         if (err <= 0)
2805                 goto errout_free;
2806
2807         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2808 errout:
2809         return err;
2810
2811 errout_free:
2812         kfree_skb(skb);
2813         goto errout;
2814 }
2815
2816 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2817 {
2818         struct rtable *rt;
2819         int h, s_h;
2820         int idx, s_idx;
2821
2822         s_h = cb->args[0];
2823         s_idx = idx = cb->args[1];
2824         for (h = 0; h <= rt_hash_mask; h++) {
2825                 if (h < s_h) continue;
2826                 if (h > s_h)
2827                         s_idx = 0;
2828                 rcu_read_lock_bh();
2829                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2830                      rt = rcu_dereference(rt->u.rt_next), idx++) {
2831                         if (idx < s_idx)
2832                                 continue;
2833                         skb->dst = dst_clone(&rt->u.dst);
2834                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2835                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE, 
2836                                          1, NLM_F_MULTI) <= 0) {
2837                                 dst_release(xchg(&skb->dst, NULL));
2838                                 rcu_read_unlock_bh();
2839                                 goto done;
2840                         }
2841                         dst_release(xchg(&skb->dst, NULL));
2842                 }
2843                 rcu_read_unlock_bh();
2844         }
2845
2846 done:
2847         cb->args[0] = h;
2848         cb->args[1] = idx;
2849         return skb->len;
2850 }
2851
/*
 * Multicast event hook: requests a route cache flush with a delay
 * argument of 0.  @in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
2856
2857 #ifdef CONFIG_SYSCTL
2858 static int flush_delay;
2859
2860 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2861                                         struct file *filp, void __user *buffer,
2862                                         size_t *lenp, loff_t *ppos)
2863 {
2864         if (write) {
2865                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2866                 rt_cache_flush(flush_delay);
2867                 return 0;
2868         } 
2869
2870         return -EINVAL;
2871 }
2872
2873 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2874                                                 int __user *name,
2875                                                 int nlen,
2876                                                 void __user *oldval,
2877                                                 size_t __user *oldlenp,
2878                                                 void __user *newval,
2879                                                 size_t newlen,
2880                                                 void **context)
2881 {
2882         int delay;
2883         if (newlen != sizeof(int))
2884                 return -EINVAL;
2885         if (get_user(delay, (int __user *)newval))
2886                 return -EFAULT; 
2887         rt_cache_flush(delay); 
2888         return 0;
2889 }
2890
2891 ctl_table ipv4_route_table[] = {
2892         {
2893                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2894                 .procname       = "flush",
2895                 .data           = &flush_delay,
2896                 .maxlen         = sizeof(int),
2897                 .mode           = 0200,
2898                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2899                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2900         },
2901         {
2902                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2903                 .procname       = "min_delay",
2904                 .data           = &ip_rt_min_delay,
2905                 .maxlen         = sizeof(int),
2906                 .mode           = 0644,
2907                 .proc_handler   = &proc_dointvec_jiffies,
2908                 .strategy       = &sysctl_jiffies,
2909         },
2910         {
2911                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2912                 .procname       = "max_delay",
2913                 .data           = &ip_rt_max_delay,
2914                 .maxlen         = sizeof(int),
2915                 .mode           = 0644,
2916                 .proc_handler   = &proc_dointvec_jiffies,
2917                 .strategy       = &sysctl_jiffies,
2918         },
2919         {
2920                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2921                 .procname       = "gc_thresh",
2922                 .data           = &ipv4_dst_ops.gc_thresh,
2923                 .maxlen         = sizeof(int),
2924                 .mode           = 0644,
2925                 .proc_handler   = &proc_dointvec,
2926         },
2927         {
2928                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2929                 .procname       = "max_size",
2930                 .data           = &ip_rt_max_size,
2931                 .maxlen         = sizeof(int),
2932                 .mode           = 0644,
2933                 .proc_handler   = &proc_dointvec,
2934         },
2935         {
2936                 /*  Deprecated. Use gc_min_interval_ms */
2937  
2938                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2939                 .procname       = "gc_min_interval",
2940                 .data           = &ip_rt_gc_min_interval,
2941                 .maxlen         = sizeof(int),
2942                 .mode           = 0644,
2943                 .proc_handler   = &proc_dointvec_jiffies,
2944                 .strategy       = &sysctl_jiffies,
2945         },
2946         {
2947                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2948                 .procname       = "gc_min_interval_ms",
2949                 .data           = &ip_rt_gc_min_interval,
2950                 .maxlen         = sizeof(int),
2951                 .mode           = 0644,
2952                 .proc_handler   = &proc_dointvec_ms_jiffies,
2953                 .strategy       = &sysctl_ms_jiffies,
2954         },
2955         {
2956                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2957                 .procname       = "gc_timeout",
2958                 .data           = &ip_rt_gc_timeout,
2959                 .maxlen         = sizeof(int),
2960                 .mode           = 0644,
2961                 .proc_handler   = &proc_dointvec_jiffies,
2962                 .strategy       = &sysctl_jiffies,
2963         },
2964         {
2965                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2966                 .procname       = "gc_interval",
2967                 .data           = &ip_rt_gc_interval,
2968                 .maxlen         = sizeof(int),
2969                 .mode           = 0644,
2970                 .proc_handler   = &proc_dointvec_jiffies,
2971                 .strategy       = &sysctl_jiffies,
2972         },
2973         {
2974                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2975                 .procname       = "redirect_load",
2976                 .data           = &ip_rt_redirect_load,
2977                 .maxlen         = sizeof(int),
2978                 .mode           = 0644,
2979                 .proc_handler   = &proc_dointvec,
2980         },
2981         {
2982                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2983                 .procname       = "redirect_number",
2984                 .data           = &ip_rt_redirect_number,
2985                 .maxlen         = sizeof(int),
2986                 .mode           = 0644,
2987                 .proc_handler   = &proc_dointvec,
2988         },
2989         {
2990                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2991                 .procname       = "redirect_silence",
2992                 .data           = &ip_rt_redirect_silence,
2993                 .maxlen         = sizeof(int),
2994                 .mode           = 0644,
2995                 .proc_handler   = &proc_dointvec,
2996         },
2997         {
2998                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2999                 .procname       = "error_cost",
3000                 .data           = &ip_rt_error_cost,
3001                 .maxlen         = sizeof(int),
3002                 .mode           = 0644,
3003                 .proc_handler   = &proc_dointvec,
3004         },
3005         {
3006                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
3007                 .procname       = "error_burst",
3008                 .data           = &ip_rt_error_burst,
3009                 .maxlen         = sizeof(int),
3010                 .mode           = 0644,
3011                 .proc_handler   = &proc_dointvec,
3012         },
3013         {
3014                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
3015                 .procname       = "gc_elasticity",
3016                 .data           = &ip_rt_gc_elasticity,
3017                 .maxlen         = sizeof(int),
3018                 .mode           = 0644,
3019                 .proc_handler   = &proc_dointvec,
3020         },
3021         {
3022                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
3023                 .procname       = "mtu_expires",
3024                 .data           = &ip_rt_mtu_expires,
3025                 .maxlen         = sizeof(int),
3026                 .mode           = 0644,
3027                 .proc_handler   = &proc_dointvec_jiffies,
3028                 .strategy       = &sysctl_jiffies,
3029         },
3030         {
3031                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
3032                 .procname       = "min_pmtu",
3033                 .data           = &ip_rt_min_pmtu,
3034                 .maxlen         = sizeof(int),
3035                 .mode           = 0644,
3036                 .proc_handler   = &proc_dointvec,
3037         },
3038         {
3039                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
3040                 .procname       = "min_adv_mss",
3041                 .data           = &ip_rt_min_advmss,
3042                 .maxlen         = sizeof(int),
3043                 .mode           = 0644,
3044                 .proc_handler   = &proc_dointvec,
3045         },
3046         {
3047                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3048                 .procname       = "secret_interval",
3049                 .data           = &ip_rt_secret_interval,
3050                 .maxlen         = sizeof(int),
3051                 .mode           = 0644,
3052                 .proc_handler   = &proc_dointvec_jiffies,
3053                 .strategy       = &sysctl_jiffies,
3054         },
3055         { .ctl_name = 0 }
3056 };
3057 #endif
3058
3059 #ifdef CONFIG_NET_CLS_ROUTE
/* Base of the route accounting area allocated in ip_rt_init(). */
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/*
 * IP route accounting ptr for this logical cpu number: each cpu owns a
 * run of 256 struct ip_rt_acct entries.  The argument is parenthesized so
 * that an expression such as IP_RT_ACCT_CPU(a + b) is scaled as a whole.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3066
3067 #ifdef CONFIG_PROC_FS
3068 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3069                            int length, int *eof, void *data)
3070 {
3071         unsigned int i;
3072
3073         if ((offset & 3) || (length & 3))
3074                 return -EIO;
3075
3076         if (offset >= sizeof(struct ip_rt_acct) * 256) {
3077                 *eof = 1;
3078                 return 0;
3079         }
3080
3081         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3082                 length = sizeof(struct ip_rt_acct) * 256 - offset;
3083                 *eof = 1;
3084         }
3085
3086         offset /= sizeof(u32);
3087
3088         if (length > 0) {
3089                 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3090                 u32 *dst = (u32 *) buffer;
3091
3092                 /* Copy first cpu. */
3093                 *start = buffer;
3094                 memcpy(dst, src, length);
3095
3096                 /* Add the other cpus in, one int at a time */
3097                 for_each_possible_cpu(i) {
3098                         unsigned int j;
3099
3100                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3101
3102                         for (j = 0; j < length/4; j++)
3103                                 dst[j] += src[j];
3104                 }
3105         }
3106         return length;
3107 }
3108 #endif /* CONFIG_PROC_FS */
3109 #endif /* CONFIG_NET_CLS_ROUTE */
3110
3111 static __initdata unsigned long rhash_entries;
3112 static int __init set_rhash_entries(char *str)
3113 {
3114         if (!str)
3115                 return 0;
3116         rhash_entries = simple_strtoul(str, &str, 0);
3117         return 1;
3118 }
3119 __setup("rhash_entries=", set_rhash_entries);
3120
3121 int __init ip_rt_init(void)
3122 {
3123         int rc = 0;
3124
3125         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3126                              (jiffies ^ (jiffies >> 7)));
3127
3128 #ifdef CONFIG_NET_CLS_ROUTE
3129         {
3130         int order;
3131         for (order = 0;
3132              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3133                 /* NOTHING */;
3134         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3135         if (!ip_rt_acct)
3136                 panic("IP: failed to allocate ip_rt_acct\n");
3137         memset(ip_rt_acct, 0, PAGE_SIZE << order);
3138         }
3139 #endif
3140
3141         ipv4_dst_ops.kmem_cachep =
3142                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3143                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
3144
3145         rt_hash_table = (struct rt_hash_bucket *)
3146                 alloc_large_system_hash("IP route cache",
3147                                         sizeof(struct rt_hash_bucket),
3148                                         rhash_entries,
3149                                         (num_physpages >= 128 * 1024) ?
3150                                         15 : 17,
3151                                         0,
3152                                         &rt_hash_log,
3153                                         &rt_hash_mask,
3154                                         0);
3155         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3156         rt_hash_lock_init();
3157
3158         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3159         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3160
3161         devinet_init();
3162         ip_fib_init();
3163
3164         init_timer(&rt_flush_timer);
3165         rt_flush_timer.function = rt_run_flush;
3166         init_timer(&rt_periodic_timer);
3167         rt_periodic_timer.function = rt_check_expire;
3168         init_timer(&rt_secret_timer);
3169         rt_secret_timer.function = rt_secret_rebuild;
3170
3171         /* All the timers, started at system startup tend
3172            to synchronize. Perturb it a bit.
3173          */
3174         rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3175                                         ip_rt_gc_interval;
3176         add_timer(&rt_periodic_timer);
3177
3178         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3179                 ip_rt_secret_interval;
3180         add_timer(&rt_secret_timer);
3181
3182 #ifdef CONFIG_PROC_FS
3183         {
3184         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3185         if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3186             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, 
3187                                              proc_net_stat))) {
3188                 return -ENOMEM;
3189         }
3190         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3191         }
3192 #ifdef CONFIG_NET_CLS_ROUTE
3193         create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3194 #endif
3195 #endif
3196 #ifdef CONFIG_XFRM
3197         xfrm_init();
3198         xfrm4_init();
3199 #endif
3200         return rc;
3201 }
3202
3203 EXPORT_SYMBOL(__ip_select_ident);
3204 EXPORT_SYMBOL(ip_route_input);
3205 EXPORT_SYMBOL(ip_route_output_key);