/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
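/*
 * A worked example of RT_FL_TOS (a sketch; it assumes the usual
 * definitions IPTOS_RT_MASK == 0x1C and RTO_ONLINK == 0x01 from
 * include/net/route.h): for fl4_tos == 0x11 (IPTOS_LOWDELAY with the
 * low bit doubling as RTO_ONLINK), the macro yields 0x11 -- the
 * routing-relevant TOS bits plus the on-link flag, with the ECN and
 * precedence bits masked away.
 */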

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_min_delay              = 2 * HZ;
static int ip_rt_max_delay              = 10 * HZ;
static int ip_rt_max_size;
static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
static int ip_rt_gc_interval            = 60 * HZ;
static int ip_rt_gc_min_interval        = HZ / 2;
static int ip_rt_redirect_number        = 9;
static int ip_rt_redirect_load          = HZ / 50;
static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost             = HZ;
static int ip_rt_error_burst            = 5 * HZ;
static int ip_rt_gc_elasticity          = 8;
static int ip_rt_mtu_expires            = 10 * 60 * HZ;
static int ip_rt_min_pmtu               = 512 + 20 + 20;
static int ip_rt_min_advmss             = 256;
static int ip_rt_secret_interval        = 10 * 60 * HZ;
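/*
 * For orientation (illustrative arithmetic, not code): the time-valued
 * tunables above are in jiffies.  With HZ == 1000, ip_rt_gc_timeout is
 * 300000 jiffies (5 min), ip_rt_redirect_silence is
 * (1000/50) << 10 == 20480 jiffies (~20.5 s), and ip_rt_min_pmtu is
 * 552 bytes: 512 bytes of payload plus 20-byte IP and 20-byte TCP
 * headers.
 */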
static unsigned long rt_deadline;

#define RTprint(a...)   printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void              ipv4_dst_destroy(struct dst_entry *dst);
static void              ipv4_dst_ifdown(struct dst_entry *dst,
                                         struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             __constant_htons(ETH_P_IP),
        .gc =                   rt_garbage_collect,
        .check =                ipv4_dst_check,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .entry_size =           sizeof(struct rtable),
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(FILLER),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
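/*
 * Usage sketch: the rt_tos2priority() helper in include/net/route.h
 * indexes this table with IPTOS_TOS(tos) >> 1, i.e. the four TOS bits.
 * For example (illustrative values), tos == IPTOS_LOWDELAY (0x10)
 * gives index 8 and maps to TC_PRIO_INTERACTIVE.
 */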


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
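/*
 * An illustrative reader under the scheme above (a sketch of the
 * pattern the lookup paths later in this file follow; "key" stands in
 * for whatever flowi the caller is matching):
 *
 *      rcu_read_lock_bh();
 *      for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *           rth = rcu_dereference(rth->u.dst.rt_next)) {
 *              if (compare_keys(&rth->fl, &key)) {
 *                      dst_hold(&rth->u.dst);  (atomic ref, no bucket lock)
 *                      break;
 *              }
 *      }
 *      rcu_read_unlock_bh();
 */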

struct rt_hash_bucket {
        struct rtable   *chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
        defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ        256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ       4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ       2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ       1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ       512
# else
#  define RT_HASH_LOCK_SZ       256
# endif
#endif

static spinlock_t       *rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
# define rt_hash_lock_init()    { \
                int i; \
                rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
                if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
                for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
                        spin_lock_init(&rt_hash_locks[i]); \
                }
#else
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif
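/*
 * Lock striping in one line (illustrative numbers): rt_hash_lock_addr()
 * folds the bucket index with (slot) & (RT_HASH_LOCK_SZ - 1), so with
 * RT_HASH_LOCK_SZ == 256 the buckets 5, 261 and 517 all share lock 5.
 * Distinct buckets may therefore contend on one spinlock, but readers
 * never take it, so only concurrent writers are affected.
 */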

static struct rt_hash_bucket    *rt_hash_table;
static unsigned                 rt_hash_mask;
static int                      rt_hash_log;
static unsigned int             rt_hash_rnd;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
        (__raw_get_cpu_var(rt_cache_stat).field++)

static int rt_intern_hash(unsigned hash, struct rtable *rth,
                                struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr)
{
        return (jhash_2words(daddr, saddr, rt_hash_rnd)
                & rt_hash_mask);
}

#define rt_hash(daddr, saddr, idx) \
        rt_hash_code((__force u32)(__be32)(daddr),\
                     (__force u32)(__be32)(saddr) ^ ((idx) << 5))
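/*
 * Hashing sketch: the bucket for a flow is
 * jhash_2words(daddr, saddr ^ (iface_index << 5), rt_hash_rnd) masked
 * down to the table size.  rt_hash_rnd is reseeded periodically by the
 * secret timer, so an attacker cannot precompute colliding flows; the
 * shift simply folds the interface index into the source-address word.
 */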

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
        int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
        struct rtable *r = NULL;
        struct rt_cache_iter_state *st = seq->private;

        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
                rcu_read_lock_bh();
                r = rt_hash_table[st->bucket].chain;
                if (r)
                        break;
                rcu_read_unlock_bh();
        }
        return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
        struct rt_cache_iter_state *st = rcu_dereference(seq->private);

        r = r->u.dst.rt_next;
        while (!r) {
                rcu_read_unlock_bh();
                if (--st->bucket < 0)
                        break;
                rcu_read_lock_bh();
                r = rt_hash_table[st->bucket].chain;
        }
        return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
        struct rtable *r = rt_cache_get_first(seq);

        if (r)
                while (pos && (r = rt_cache_get_next(seq, r)))
                        --pos;
        return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct rtable *r = NULL;

        if (v == SEQ_START_TOKEN)
                r = rt_cache_get_first(seq);
        else
                r = rt_cache_get_next(seq, v);
        ++*pos;
        return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
        if (v && v != SEQ_START_TOKEN)
                rcu_read_unlock_bh();
}
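/*
 * Note on the iterator above: rt_cache_get_first() returns with
 * rcu_read_lock_bh() still held whenever it found an entry, the lock is
 * handed from bucket to bucket inside rt_cache_get_next(), and
 * rt_cache_seq_stop() finally drops it.  The read side of
 * /proc/net/rt_cache therefore walks the whole table under BH-disabled
 * RCU, never under the per-bucket spinlocks.
 */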

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        else {
                struct rtable *r = v;
                char temp[256];

                sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
                              "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
                        r->u.dst.dev ? r->u.dst.dev->name : "*",
                        (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
                        r->rt_flags, atomic_read(&r->u.dst.__refcnt),
                        r->u.dst.__use, 0, (unsigned long)r->rt_src,
                        (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
                             (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
                        dst_metric(&r->u.dst, RTAX_WINDOW),
                        (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
                              dst_metric(&r->u.dst, RTAX_RTTVAR)),
                        r->fl.fl4_tos,
                        r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
                        r->u.dst.hh ? (r->u.dst.hh->hh_output ==
                                       dev_queue_xmit) : 0,
                        r->rt_spec_dst);
                seq_printf(seq, "%-127s\n", temp);
        }
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int rc = -ENOMEM;
        struct rt_cache_iter_state *s;

        s = kzalloc(sizeof(*s), GFP_KERNEL);
        if (!s)
                goto out;
        rc = seq_open(file, &rt_cache_seq_ops);
        if (rc)
                goto out_kfree;
        seq          = file->private_data;
        seq->private = s;
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   atomic_read(&ipv4_dst_ops.entries),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#endif /* CONFIG_PROC_FS */

static __inline__ void rt_free(struct rtable *rt)
{
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
        ip_rt_put(rt);
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
        /* Kill broadcast/multicast entries very aggressively, if they
           collide in hash table with more useful entries */
        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
                rth->fl.iif && rth->u.dst.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
                rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
        unsigned long age;
        int ret = 0;

        if (atomic_read(&rth->u.dst.__refcnt))
                goto out;

        ret = 1;
        if (rth->u.dst.expires &&
            time_after_eq(jiffies, rth->u.dst.expires))
                goto out;

        age = jiffies - rth->u.dst.lastuse;
        ret = 0;
        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
            (age <= tmo2 && rt_valuable(rth)))
                goto out;
        ret = 1;
out:    return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
        u32 score = jiffies - rt->u.dst.lastuse;

        score = ~score & ~(3<<30);

        if (rt_valuable(rt))
                score |= (1<<31);

        if (!rt->fl.iif ||
            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
                score |= (1<<30);

        return score;
}
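/*
 * Example (illustrative numbers): a just-used entry has age 0, so its
 * low 30 bits start at 0x3FFFFFFF and decay as the entry idles.  An
 * output route (fl.iif == 0) adds bit 30; a redirected or expiring
 * entry adds bit 31.  rt_intern_hash() evicts the entry with the
 * *minimum* score, i.e. the oldest, least interesting one.
 */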

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
        return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
                (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
                (fl1->mark ^ fl2->mark) |
                (*(u16 *)&fl1->nl_u.ip4_u.tos ^
                 *(u16 *)&fl2->nl_u.ip4_u.tos) |
                (fl1->oif ^ fl2->oif) |
                (fl1->iif ^ fl2->iif)) == 0;
}
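/*
 * compare_keys() relies on the identity (a ^ b) == 0 iff a == b: the
 * XORs of all key fields are ORed together, and the flow matches only
 * if the combined result is zero.  The 16-bit load at &tos is meant to
 * cover both the fl4_tos byte and the adjacent fl4_scope byte of
 * struct flowi in one operation (this reading assumes the current
 * flowi layout, where tos and scope are neighbouring u8 fields).
 */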

/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
        static unsigned int rover;
        unsigned int i = rover, goal;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
        u64 mult;

        mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
        if (ip_rt_gc_timeout > 1)
                do_div(mult, ip_rt_gc_timeout);
        goal = (unsigned int)mult;
        if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
        for (; goal > 0; goal--) {
                unsigned long tmo = ip_rt_gc_timeout;

                i = (i + 1) & rt_hash_mask;
                rthp = &rt_hash_table[i].chain;

                if (*rthp == NULL)
                        continue;
                spin_lock(rt_hash_lock_addr(i));
                while ((rth = *rthp) != NULL) {
                        if (rth->u.dst.expires) {
                                /* Entry is expired even if it is in use */
                                if (time_before_eq(now, rth->u.dst.expires)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.dst.rt_next;
                                        continue;
                                }
                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                                tmo >>= 1;
                                rthp = &rth->u.dst.rt_next;
                                continue;
                        }

                        /* Cleanup aged off entries. */
                        *rthp = rth->u.dst.rt_next;
                        rt_free(rth);
                }
                spin_unlock(rt_hash_lock_addr(i));

                /* Fallback loop breaker. */
                if (time_after(jiffies, now))
                        break;
        }
        rover = i;
        mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
}
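/*
 * The goal arithmetic, worked through (assumed example sizes): with
 * 2^rt_hash_log buckets, goal = (ip_rt_gc_interval << rt_hash_log) /
 * ip_rt_gc_timeout buckets are scanned per timer run, so one full
 * sweep of the table takes about ip_rt_gc_timeout.  E.g. for
 * rt_hash_log == 17, gc_interval == 60*HZ and gc_timeout == 300*HZ,
 * each run covers 131072 * 60 / 300 ~= 26214 buckets.
 */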

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
        int i;
        struct rtable *rth, *next;

        rt_deadline = 0;

        get_random_bytes(&rt_hash_rnd, 4);

        for (i = rt_hash_mask; i >= 0; i--) {
                spin_lock_bh(rt_hash_lock_addr(i));
                rth = rt_hash_table[i].chain;
                if (rth)
                        rt_hash_table[i].chain = NULL;
                spin_unlock_bh(rt_hash_lock_addr(i));

                for (; rth; rth = next) {
                        next = rth->u.dst.rt_next;
                        rt_free(rth);
                }
        }
}

static DEFINE_SPINLOCK(rt_flush_lock);

void rt_cache_flush(int delay)
{
        unsigned long now = jiffies;
        int user_mode = !in_softirq();

        if (delay < 0)
                delay = ip_rt_min_delay;

        spin_lock_bh(&rt_flush_lock);

        if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
                long tmo = (long)(rt_deadline - now);

                /* If the flush timer is already running
                   and the flush request is not immediate (delay > 0):

                   if the deadline has not been reached yet, prolong
                   the timer by "delay"; otherwise fire it at the
                   deadline.
                 */

                if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
                        tmo = 0;

                if (delay > tmo)
                        delay = tmo;
        }

        if (delay <= 0) {
                spin_unlock_bh(&rt_flush_lock);
                rt_run_flush(0);
                return;
        }

        if (rt_deadline == 0)
                rt_deadline = now + ip_rt_max_delay;

        mod_timer(&rt_flush_timer, now+delay);
        spin_unlock_bh(&rt_flush_lock);
}

static void rt_secret_rebuild(unsigned long dummy)
{
        unsigned long now = jiffies;

        rt_cache_flush(0);
        mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want an algorithm which keeps the routing cache at some
   equilibrium point, where the number of aged-off entries stays
   approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle,
   "expire" is large enough to keep enough warm entries, and when load
   increases it shrinks to limit the cache size.
 */
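/*
 * A feel for the adaptation (illustrative numbers only): "expire"
 * starts at RT_GC_TIMEOUT (300*HZ).  Each round that misses its goal
 * halves it, so three consecutive misses bring it down to 37.5*HZ;
 * each completed round adds ip_rt_gc_min_interval (HZ/2) back, capped
 * at ip_rt_gc_timeout.  Idle systems thus drift back toward gentle
 * expiration, while busy ones ratchet toward aggressive pruning.
 */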

static int rt_garbage_collect(void)
{
        static unsigned long expire = RT_GC_TIMEOUT;
        static unsigned long last_gc;
        static int rover;
        static int equilibrium;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
        int goal;

        /*
         * Garbage collection is pretty expensive,
         * do not run it too frequently.
         */

        RT_CACHE_STAT_INC(gc_total);

        if (now - last_gc < ip_rt_gc_min_interval &&
            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
                RT_CACHE_STAT_INC(gc_ignored);
                goto out;
        }

        /* Calculate the number of entries which we want to expire now. */
        goal = atomic_read(&ipv4_dst_ops.entries) -
                (ip_rt_gc_elasticity << rt_hash_log);
        if (goal <= 0) {
                if (equilibrium < ipv4_dst_ops.gc_thresh)
                        equilibrium = ipv4_dst_ops.gc_thresh;
                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                if (goal > 0) {
                        equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                }
        } else {
                /* We are in a dangerous area. Try to reduce the cache
                 * really aggressively.
                 */
                goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
        }

        if (now - last_gc >= ip_rt_gc_min_interval)
                last_gc = now;

        if (goal <= 0) {
                equilibrium += goal;
                goto work_done;
        }

        do {
                int i, k;

                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
                        unsigned long tmo = expire;

                        k = (k + 1) & rt_hash_mask;
                        rthp = &rt_hash_table[k].chain;
                        spin_lock_bh(rt_hash_lock_addr(k));
                        while ((rth = *rthp) != NULL) {
                                if (!rt_may_expire(rth, tmo, expire)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.dst.rt_next;
                                        continue;
                                }
                                *rthp = rth->u.dst.rt_next;
                                rt_free(rth);
                                goal--;
                        }
                        spin_unlock_bh(rt_hash_lock_addr(k));
                        if (goal <= 0)
                                break;
                }
                rover = k;

                if (goal <= 0)
                        goto work_done;

                /* The goal was not achieved. We stop the process if:

                   - expire has been reduced to zero (otherwise it is halved),
                   - the table is not full,
                   - we are called from interrupt context.

                   The jiffies check is just a fallback/debug loop breaker;
                   we will not spin here for a long time in any case.
                 */

                RT_CACHE_STAT_INC(gc_goal_miss);

                if (expire == 0)
                        break;

                expire >>= 1;
#if RT_CACHE_DEBUG >= 2
                printk(KERN_DEBUG "expire>> %lu %d %d %d\n", expire,
                                atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                        goto out;
        } while (!in_softirq() && time_before_eq(jiffies, now));

        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                goto out;
        if (net_ratelimit())
                printk(KERN_WARNING "dst cache overflow\n");
        RT_CACHE_STAT_INC(gc_dst_overflow);
        return 1;

work_done:
        expire += ip_rt_gc_min_interval;
        if (expire > ip_rt_gc_timeout ||
            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
                expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
        printk(KERN_DEBUG "expire++ %lu %d %d %d\n", expire,
                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:    return 0;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
        struct rtable   *rth, **rthp;
        unsigned long   now;
        struct rtable *cand, **candp;
        u32             min_score;
        int             chain_length;
        int attempts = !in_softirq();

restart:
        chain_length = 0;
        min_score = ~(u32)0;
        cand = NULL;
        candp = NULL;
        now = jiffies;

        rthp = &rt_hash_table[hash].chain;

        spin_lock_bh(rt_hash_lock_addr(hash));
        while ((rth = *rthp) != NULL) {
                if (compare_keys(&rth->fl, &rt->fl)) {
                        /* Put it first */
                        *rthp = rth->u.dst.rt_next;
                        /*
                         * Since lookup is lockfree, the deletion
                         * must be visible to another weakly ordered CPU before
                         * the insertion at the start of the hash chain.
                         */
                        rcu_assign_pointer(rth->u.dst.rt_next,
                                           rt_hash_table[hash].chain);
                        /*
                         * Since lookup is lockfree, the update writes
                         * must be ordered for consistency on SMP.
                         */
                        rcu_assign_pointer(rt_hash_table[hash].chain, rth);

                        rth->u.dst.__use++;
                        dst_hold(&rth->u.dst);
                        rth->u.dst.lastuse = now;
                        spin_unlock_bh(rt_hash_lock_addr(hash));

                        rt_drop(rt);
                        *rp = rth;
                        return 0;
                }

                if (!atomic_read(&rth->u.dst.__refcnt)) {
                        u32 score = rt_score(rth);

                        if (score <= min_score) {
                                cand = rth;
                                candp = rthp;
                                min_score = score;
                        }
                }

                chain_length++;

                rthp = &rth->u.dst.rt_next;
        }

        if (cand) {
                /* ip_rt_gc_elasticity used to be the average chain length;
                 * when it is exceeded, gc becomes really aggressive.
                 *
                 * The second limit is less certain. At the moment it allows
                 * only 2 entries per bucket. We will see.
                 */
                if (chain_length > ip_rt_gc_elasticity) {
                        *candp = cand->u.dst.rt_next;
                        rt_free(cand);
                }
        }

        /* Try to bind the route to an ARP neighbour only if it is an
           output route or on the unicast forwarding path.
         */
        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
                int err = arp_bind_neighbour(&rt->u.dst);
                if (err) {
                        spin_unlock_bh(rt_hash_lock_addr(hash));

                        if (err != -ENOBUFS) {
                                rt_drop(rt);
                                return err;
                        }

                        /* The neighbour tables are full and nothing
                           can be released. Try to shrink the route cache;
                           most likely it holds some neighbour records.
                         */
                        if (attempts-- > 0) {
                                int saved_elasticity = ip_rt_gc_elasticity;
                                int saved_int = ip_rt_gc_min_interval;
                                ip_rt_gc_elasticity     = 1;
                                ip_rt_gc_min_interval   = 0;
                                rt_garbage_collect();
                                ip_rt_gc_min_interval   = saved_int;
                                ip_rt_gc_elasticity     = saved_elasticity;
                                goto restart;
                        }

                        if (net_ratelimit())
                                printk(KERN_WARNING "Neighbour table overflow.\n");
                        rt_drop(rt);
                        return -ENOBUFS;
                }
        }

        rt->u.dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
        if (rt->u.dst.rt_next) {
                struct rtable *trt;
                printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
                       NIPQUAD(rt->rt_dst));
                for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
                        printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
                printk("\n");
        }
#endif
        rt_hash_table[hash].chain = rt;
        spin_unlock_bh(rt_hash_lock_addr(hash));
        *rp = rt;
        return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
        static DEFINE_SPINLOCK(rt_peer_lock);
        struct inet_peer *peer;

        peer = inet_getpeer(rt->rt_dst, create);

        spin_lock_bh(&rt_peer_lock);
        if (rt->peer == NULL) {
                rt->peer = peer;
                peer = NULL;
        }
        spin_unlock_bh(&rt_peer_lock);
        if (peer)
                inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.
 * However, we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance
 * of selecting an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static DEFINE_SPINLOCK(ip_fb_id_lock);
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct rtable *rt = (struct rtable *) dst;

        if (rt) {
                if (rt->peer == NULL)
                        rt_bind_peer(rt, 1);

                /* If a peer is attached to the destination, it is never
                   detached, so we need not grab a lock to dereference it.
                 */
                if (rt->peer) {
                        iph->id = htons(inet_getid(rt->peer, more));
                        return;
                }
        } else
                printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
                       __builtin_return_address(0));

        ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
        struct rtable **rthp;

        spin_lock_bh(rt_hash_lock_addr(hash));
        ip_rt_put(rt);
        for (rthp = &rt_hash_table[hash].chain; *rthp;
             rthp = &(*rthp)->u.dst.rt_next)
                if (*rthp == rt) {
                        *rthp = rt->u.dst.rt_next;
                        rt_free(rt);
                        break;
                }
        spin_unlock_bh(rt_hash_lock_addr(hash));
}

void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
                    __be32 saddr, struct net_device *dev)
{
        int i, k;
        struct in_device *in_dev = in_dev_get(dev);
        struct rtable *rth, **rthp;
        __be32  skeys[2] = { saddr, 0 };
        int  ikeys[2] = { dev->ifindex, 0 };
        struct netevent_redirect netevent;

        if (!in_dev)
                return;

        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
            || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        for (i = 0; i < 2; i++) {
                for (k = 0; k < 2; k++) {
                        unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);

                        rthp = &rt_hash_table[hash].chain;

                        rcu_read_lock();
                        while ((rth = rcu_dereference(*rthp)) != NULL) {
                                struct rtable *rt;

                                if (rth->fl.fl4_dst != daddr ||
                                    rth->fl.fl4_src != skeys[i] ||
                                    rth->fl.oif != ikeys[k] ||
                                    rth->fl.iif != 0) {
                                        rthp = &rth->u.dst.rt_next;
                                        continue;
                                }

                                if (rth->rt_dst != daddr ||
                                    rth->rt_src != saddr ||
                                    rth->u.dst.error ||
                                    rth->rt_gateway != old_gw ||
                                    rth->u.dst.dev != dev)
                                        break;

                                dst_hold(&rth->u.dst);
                                rcu_read_unlock();

                                rt = dst_alloc(&ipv4_dst_ops);
                                if (rt == NULL) {
                                        ip_rt_put(rth);
                                        in_dev_put(in_dev);
                                        return;
                                }

                                /* Copy all the information. */
                                *rt = *rth;
                                INIT_RCU_HEAD(&rt->u.dst.rcu_head);
                                rt->u.dst.__use         = 1;
                                atomic_set(&rt->u.dst.__refcnt, 1);
                                rt->u.dst.child         = NULL;
                                if (rt->u.dst.dev)
                                        dev_hold(rt->u.dst.dev);
                                if (rt->idev)
                                        in_dev_hold(rt->idev);
                                rt->u.dst.obsolete      = 0;
                                rt->u.dst.lastuse       = jiffies;
                                rt->u.dst.path          = &rt->u.dst;
                                rt->u.dst.neighbour     = NULL;
                                rt->u.dst.hh            = NULL;
                                rt->u.dst.xfrm          = NULL;

                                rt->rt_flags            |= RTCF_REDIRECTED;

                                /* Gateway is different ... */
                                rt->rt_gateway          = new_gw;

                                /* Redirect received -> path was valid */
                                dst_confirm(&rth->u.dst);

                                if (rt->peer)
                                        atomic_inc(&rt->peer->refcnt);

                                if (arp_bind_neighbour(&rt->u.dst) ||
                                    !(rt->u.dst.neighbour->nud_state &
                                            NUD_VALID)) {
                                        if (rt->u.dst.neighbour)
                                                neigh_event_send(rt->u.dst.neighbour, NULL);
                                        ip_rt_put(rth);
                                        rt_drop(rt);
                                        goto do_next;
                                }

                                netevent.old = &rth->u.dst;
                                netevent.new = &rt->u.dst;
                                call_netevent_notifiers(NETEVENT_REDIRECT,
                                                        &netevent);

                                rt_del(hash, rth);
                                if (!rt_intern_hash(hash, rt, &rt))
                                        ip_rt_put(rt);
                                goto do_next;
                        }
                        rcu_read_unlock();
                do_next:
                        ;
                }
        }
        in_dev_put(in_dev);
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
                        "%u.%u.%u.%u ignored.\n"
                        "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
                       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
                       NIPQUAD(saddr), NIPQUAD(daddr));
#endif
        in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->u.dst.expires) {
                        unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
                                                rt->fl.oif);
#if RT_CACHE_DEBUG >= 1
                        printk(KERN_DEBUG "ip_rt_advice: redirect to "
                                          "%u.%u.%u.%u/%02x dropped\n",
                                NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
                        rt_del(hash, rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
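/*
 * The backoff schedule, worked through with the defaults above (HZ ==
 * 1000 assumed for the millisecond figures): redirect k is sent no
 * sooner than rate_last + (ip_rt_redirect_load << rate_tokens), i.e.
 * 20 ms, 40 ms, 80 ms, ... up to (HZ/50) << 8 ~= 5.1 s before the 9th.
 * Once ip_rt_redirect_number tokens accumulate, nothing is sent until
 * ip_rt_redirect_silence (~20.5 s) of quiet resets the counter.
 */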

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable *)skb->dst;
        struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

        if (!in_dev)
                return;

        if (!IN_DEV_TX_REDIRECTS(in_dev))
                goto out;

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
                rt->u.dst.rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set u.dst.rate_last to the last seen redirected packet.
         */
        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
                rt->u.dst.rate_last = jiffies;
                goto out;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (rt->u.dst.rate_tokens == 0 ||
            time_after(jiffies,
                       (rt->u.dst.rate_last +
                        (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
                rt->u.dst.rate_last = jiffies;
                ++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (IN_DEV_LOG_MARTIANS(in_dev) &&
                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
                    net_ratelimit())
                        printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
                                "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
                                NIPQUAD(rt->rt_src), rt->rt_iif,
                                NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
        }
out:
        in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = (struct rtable *)skb->dst;
        unsigned long now;
        int code;

        switch (rt->u.dst.error) {
                case EINVAL:
                default:
                        goto out;
                case EHOSTUNREACH:
                        code = ICMP_HOST_UNREACH;
                        break;
                case ENETUNREACH:
                        code = ICMP_NET_UNREACH;
                        break;
                case EACCES:
                        code = ICMP_PKT_FILTERED;
                        break;
        }

        now = jiffies;
        rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
        if (rt->u.dst.rate_tokens > ip_rt_error_burst)
                rt->u.dst.rate_tokens = ip_rt_error_burst;
        rt->u.dst.rate_last = now;
        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
                rt->u.dst.rate_tokens -= ip_rt_error_cost;
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
        }

out:    kfree_skb(skb);
        return 0;
}

/*
 *      The last two values are not from the RFC but
 *      are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
                if (old_mtu > mtu_plateau[i])
                        return mtu_plateau[i];
        return 68;
}
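/*
 * Worked example of the plateau search: for old_mtu == 1500 the first
 * plateau strictly below it is 1492, so that is returned; for
 * old_mtu == 296 we get 216.  Anything at or below the smallest
 * plateau (128) falls through to 68, the minimum MTU an IPv4 router
 * must support.
 */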

unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
        int i;
        unsigned short old_mtu = ntohs(iph->tot_len);
        struct rtable *rth;
        __be32  skeys[2] = { iph->saddr, 0, };
        __be32  daddr = iph->daddr;
        unsigned short est_mtu = 0;

        if (ipv4_config.no_pmtu_disc)
                return 0;

        for (i = 0; i < 2; i++) {
                unsigned hash = rt_hash(daddr, skeys[i], 0);

                rcu_read_lock();
                for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
                     rth = rcu_dereference(rth->u.dst.rt_next)) {
                        if (rth->fl.fl4_dst == daddr &&
                            rth->fl.fl4_src == skeys[i] &&
                            rth->rt_dst  == daddr &&
                            rth->rt_src  == iph->saddr &&
                            rth->fl.iif == 0 &&
                            !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
                                unsigned short mtu = new_mtu;

                                if (new_mtu < 68 || new_mtu >= old_mtu) {

                                        /* BSD 4.2 compatibility hack :-( */
                                        if (mtu == 0 &&
                                            old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
                                            old_mtu >= 68 + (iph->ihl << 2))
                                                old_mtu -= iph->ihl << 2;

                                        mtu = guess_mtu(old_mtu);
                                }
                                if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
                                        if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
                                                dst_confirm(&rth->u.dst);
                                                if (mtu < ip_rt_min_pmtu) {
                                                        mtu = ip_rt_min_pmtu;
                                                        rth->u.dst.metrics[RTAX_LOCK-1] |=
                                                                (1 << RTAX_MTU);
                                                }
                                                rth->u.dst.metrics[RTAX_MTU-1] = mtu;
                                                dst_set_expires(&rth->u.dst,
                                                        ip_rt_mtu_expires);
                                        }
                                        est_mtu = mtu;
                                }
                        }
                }
                rcu_read_unlock();
        }
        return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
        if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
            !(dst_metric_locked(dst, RTAX_MTU))) {
                if (mtu < ip_rt_min_pmtu) {
                        mtu = ip_rt_min_pmtu;
                        dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
                }
                dst->metrics[RTAX_MTU-1] = mtu;
                dst_set_expires(dst, ip_rt_mtu_expires);
                call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
        }
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;
        struct inet_peer *peer = rt->peer;
        struct in_device *idev = rt->idev;

        if (peer) {
                rt->peer = NULL;
                inet_putpeer(peer);
        }

        if (idev) {
                rt->idev = NULL;
                in_dev_put(idev);
        }
}

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
        struct rtable *rt = (struct rtable *) dst;
        struct in_device *idev = rt->idev;
        if (dev != &loopback_dev && idev && idev->dev == dev) {
                struct in_device *loopback_idev = in_dev_get(&loopback_dev);
                if (loopback_idev) {
                        rt->idev = loopback_idev;
                        in_dev_put(idev);
                }
        }
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = (struct rtable *) skb->dst;
        if (rt)
                dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
        printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
                NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
                skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        return 0;
}
1436
1437 /*
1438    We do not cache the source address of the outgoing interface,
1439    because it is used only by the IP RR, TS and SRR options,
1440    so it is out of the fast path.
1441
1442    Note: "addr" may be unaligned when it points into
1443    the IP options!
1444  */
1445
1446 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1447 {
1448         __be32 src;
1449         struct fib_result res;
1450
1451         if (rt->fl.iif == 0)
1452                 src = rt->rt_src;
1453         else if (fib_lookup(&rt->fl, &res) == 0) {
1454                 src = FIB_RES_PREFSRC(res);
1455                 fib_res_put(&res);
1456         } else
1457                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1458                                         RT_SCOPE_UNIVERSE);
1459         memcpy(addr, &src, 4);
1460 }
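
/* Editor's sketch (assumed caller; the real one lives in ip_options.c): the
 * IP options code hands in a pointer straight into the packet's option area,
 * which is why the memcpy() above must not assume 32-bit alignment:
 *
 *	ip_rt_get_source(&optptr[optptr[2] - 1], rt);
 */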
1461
1462 #ifdef CONFIG_NET_CLS_ROUTE
1463 static void set_class_tag(struct rtable *rt, u32 tag)
1464 {
1465         if (!(rt->u.dst.tclassid & 0xFFFF))
1466                 rt->u.dst.tclassid |= tag & 0xFFFF;
1467         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1468                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1469 }
1470 #endif
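
/* Editor's note (worked example, not in the original): as the editor
 * understands the realm convention, tclassid packs two 16-bit routing
 * realms: destination realm in bits 0-15, source realm in bits 16-31.
 * set_class_tag() only fills halves that are still zero, so a more
 * specific tag set earlier wins:
 *
 *	rt->u.dst.tclassid = 0x00030000;   (src realm 3, dst realm unset)
 *	set_class_tag(rt, 0x00050002);     (src realm 5, dst realm 2)
 *	tclassid is now 0x00030002: the dst half was filled in, the
 *	already-set src half was kept.
 */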
1471
1472 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1473 {
1474         struct fib_info *fi = res->fi;
1475
1476         if (fi) {
1477                 if (FIB_RES_GW(*res) &&
1478                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1479                         rt->rt_gateway = FIB_RES_GW(*res);
1480                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1481                        sizeof(rt->u.dst.metrics));
1482                 if (fi->fib_mtu == 0) {
1483                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1484                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1485                             rt->rt_gateway != rt->rt_dst &&
1486                             rt->u.dst.dev->mtu > 576)
1487                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1488                 }
1489 #ifdef CONFIG_NET_CLS_ROUTE
1490                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1491 #endif
1492         } else
1493                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1494
1495         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1496                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1497         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1498                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1499         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1500                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1501                                        ip_rt_min_advmss);
1502         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1503                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1504
1505 #ifdef CONFIG_NET_CLS_ROUTE
1506 #ifdef CONFIG_IP_MULTIPLE_TABLES
1507         set_class_tag(rt, fib_rules_tclass(res));
1508 #endif
1509         set_class_tag(rt, itag);
1510 #endif
1511         rt->rt_type = res->type;
1512 }
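
/* Editor's note: a worked example of the metric defaults above, assuming an
 * Ethernet device and a route with no explicit metrics: RTAX_MTU inherits
 * 1500 from the device, RTAX_HOPLIMIT inherits sysctl_ip_default_ttl, and
 * RTAX_ADVMSS becomes max(1500 - 40, ip_rt_min_advmss) = 1460, i.e. the MTU
 * minus 40 bytes of IPv4 + TCP headers.
 */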
1513
1514 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1515                                 u8 tos, struct net_device *dev, int our)
1516 {
1517         unsigned hash;
1518         struct rtable *rth;
1519         __be32 spec_dst;
1520         struct in_device *in_dev = in_dev_get(dev);
1521         u32 itag = 0;
1522
1523         /* Primary sanity checks. */
1524
1525         if (in_dev == NULL)
1526                 return -EINVAL;
1527
1528         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1529             skb->protocol != htons(ETH_P_IP))
1530                 goto e_inval;
1531
1532         if (ZERONET(saddr)) {
1533                 if (!LOCAL_MCAST(daddr))
1534                         goto e_inval;
1535                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1536         } else if (fib_validate_source(saddr, 0, tos, 0,
1537                                         dev, &spec_dst, &itag) < 0)
1538                 goto e_inval;
1539
1540         rth = dst_alloc(&ipv4_dst_ops);
1541         if (!rth)
1542                 goto e_nobufs;
1543
1544         rth->u.dst.output= ip_rt_bug;
1545
1546         atomic_set(&rth->u.dst.__refcnt, 1);
1547         rth->u.dst.flags= DST_HOST;
1548         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1549                 rth->u.dst.flags |= DST_NOPOLICY;
1550         rth->fl.fl4_dst = daddr;
1551         rth->rt_dst     = daddr;
1552         rth->fl.fl4_tos = tos;
1553         rth->fl.mark    = skb->mark;
1554         rth->fl.fl4_src = saddr;
1555         rth->rt_src     = saddr;
1556 #ifdef CONFIG_NET_CLS_ROUTE
1557         rth->u.dst.tclassid = itag;
1558 #endif
1559         rth->rt_iif     =
1560         rth->fl.iif     = dev->ifindex;
1561         rth->u.dst.dev  = &loopback_dev;
1562         dev_hold(rth->u.dst.dev);
1563         rth->idev       = in_dev_get(rth->u.dst.dev);
1564         rth->fl.oif     = 0;
1565         rth->rt_gateway = daddr;
1566         rth->rt_spec_dst= spec_dst;
1567         rth->rt_type    = RTN_MULTICAST;
1568         rth->rt_flags   = RTCF_MULTICAST;
1569         if (our) {
1570                 rth->u.dst.input= ip_local_deliver;
1571                 rth->rt_flags |= RTCF_LOCAL;
1572         }
1573
1574 #ifdef CONFIG_IP_MROUTE
1575         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1576                 rth->u.dst.input = ip_mr_input;
1577 #endif
1578         RT_CACHE_STAT_INC(in_slow_mc);
1579
1580         in_dev_put(in_dev);
1581         hash = rt_hash(daddr, saddr, dev->ifindex);
1582         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1583
1584 e_nobufs:
1585         in_dev_put(in_dev);
1586         return -ENOBUFS;
1587
1588 e_inval:
1589         in_dev_put(in_dev);
1590         return -EINVAL;
1591 }
1592
1593
1594 static void ip_handle_martian_source(struct net_device *dev,
1595                                      struct in_device *in_dev,
1596                                      struct sk_buff *skb,
1597                                      __be32 daddr,
1598                                      __be32 saddr)
1599 {
1600         RT_CACHE_STAT_INC(in_martian_src);
1601 #ifdef CONFIG_IP_ROUTE_VERBOSE
1602         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1603                 /*
1604                  *      RFC 1812 recommendation: if the source is martian,
1605                  *      the only hint we can log is the MAC header.
1606                  */
1607                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1608                         "%u.%u.%u.%u, on dev %s\n",
1609                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1610                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1611                         int i;
1612                         const unsigned char *p = skb_mac_header(skb);
1613                         printk(KERN_WARNING "ll header: ");
1614                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1615                                 printk("%02x", *p);
1616                                 if (i < (dev->hard_header_len - 1))
1617                                         printk(":");
1618                         }
1619                         printk("\n");
1620                 }
1621         }
1622 #endif
1623 }
1624
1625 static inline int __mkroute_input(struct sk_buff *skb,
1626                                   struct fib_result* res,
1627                                   struct in_device *in_dev,
1628                                   __be32 daddr, __be32 saddr, u32 tos,
1629                                   struct rtable **result)
1630 {
1631
1632         struct rtable *rth;
1633         int err;
1634         struct in_device *out_dev;
1635         unsigned flags = 0;
1636         __be32 spec_dst;
1637         u32 itag;
1638
1639         /* get a working reference to the output device */
1640         out_dev = in_dev_get(FIB_RES_DEV(*res));
1641         if (out_dev == NULL) {
1642                 if (net_ratelimit())
1643                         printk(KERN_CRIT "Bug in ip_route_input"
1644                                "_slow(). Please report\n");
1645                 return -EINVAL;
1646         }
1647
1648
1649         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1650                                   in_dev->dev, &spec_dst, &itag);
1651         if (err < 0) {
1652                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1653                                          saddr);
1654
1655                 err = -EINVAL;
1656                 goto cleanup;
1657         }
1658
1659         if (err)
1660                 flags |= RTCF_DIRECTSRC;
1661
1662         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1663             (IN_DEV_SHARED_MEDIA(out_dev) ||
1664              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1665                 flags |= RTCF_DOREDIRECT;
1666
1667         if (skb->protocol != htons(ETH_P_IP)) {
1668                 /* Not IP (i.e. ARP). Do not create a route if it is
1669                  * invalid for proxy ARP. DNAT routes are always valid.
1670                  */
1671                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1672                         err = -EINVAL;
1673                         goto cleanup;
1674                 }
1675         }
1676
1677
1678         rth = dst_alloc(&ipv4_dst_ops);
1679         if (!rth) {
1680                 err = -ENOBUFS;
1681                 goto cleanup;
1682         }
1683
1684         atomic_set(&rth->u.dst.__refcnt, 1);
1685         rth->u.dst.flags= DST_HOST;
1686         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1687                 rth->u.dst.flags |= DST_NOPOLICY;
1688         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1689                 rth->u.dst.flags |= DST_NOXFRM;
1690         rth->fl.fl4_dst = daddr;
1691         rth->rt_dst     = daddr;
1692         rth->fl.fl4_tos = tos;
1693         rth->fl.mark    = skb->mark;
1694         rth->fl.fl4_src = saddr;
1695         rth->rt_src     = saddr;
1696         rth->rt_gateway = daddr;
1697         rth->rt_iif     =
1698                 rth->fl.iif     = in_dev->dev->ifindex;
1699         rth->u.dst.dev  = (out_dev)->dev;
1700         dev_hold(rth->u.dst.dev);
1701         rth->idev       = in_dev_get(rth->u.dst.dev);
1702         rth->fl.oif     = 0;
1703         rth->rt_spec_dst= spec_dst;
1704
1705         rth->u.dst.input = ip_forward;
1706         rth->u.dst.output = ip_output;
1707
1708         rt_set_nexthop(rth, res, itag);
1709
1710         rth->rt_flags = flags;
1711
1712         *result = rth;
1713         err = 0;
1714  cleanup:
1715         /* release the working reference to the output device */
1716         in_dev_put(out_dev);
1717         return err;
1718 }
1719
1720 static inline int ip_mkroute_input(struct sk_buff *skb,
1721                                    struct fib_result* res,
1722                                    const struct flowi *fl,
1723                                    struct in_device *in_dev,
1724                                    __be32 daddr, __be32 saddr, u32 tos)
1725 {
1726         struct rtable* rth = NULL;
1727         int err;
1728         unsigned hash;
1729
1730 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1731         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1732                 fib_select_multipath(fl, res);
1733 #endif
1734
1735         /* create a routing cache entry */
1736         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1737         if (err)
1738                 return err;
1739
1740         /* put it into the cache */
1741         hash = rt_hash(daddr, saddr, fl->iif);
1742         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1743 }
1744
1745 /*
1746  *      NOTE. We drop all packets that have a local source
1747  *      address, because every properly looped-back packet
1748  *      must already have the correct destination attached by the output routine.
1749  *
1750  *      This approach solves two big problems:
1751  *      1. Non-simplex devices are handled properly.
1752  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1753  */
1754
1755 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1756                                u8 tos, struct net_device *dev)
1757 {
1758         struct fib_result res;
1759         struct in_device *in_dev = in_dev_get(dev);
1760         struct flowi fl = { .nl_u = { .ip4_u =
1761                                       { .daddr = daddr,
1762                                         .saddr = saddr,
1763                                         .tos = tos,
1764                                         .scope = RT_SCOPE_UNIVERSE,
1765                                       } },
1766                             .mark = skb->mark,
1767                             .iif = dev->ifindex };
1768         unsigned        flags = 0;
1769         u32             itag = 0;
1770         struct rtable * rth;
1771         unsigned        hash;
1772         __be32          spec_dst;
1773         int             err = -EINVAL;
1774         int             free_res = 0;
1775
1776         /* IP on this device is disabled. */
1777
1778         if (!in_dev)
1779                 goto out;
1780
1781         /* Check for the weirdest martians, which cannot be detected
1782            by fib_lookup.
1783          */
1784
1785         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1786                 goto martian_source;
1787
1788         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1789                 goto brd_input;
1790
1791         /* Accept zero addresses only for limited broadcast;
1792          * it is unclear whether this needs fixing. Waiting for complaints :-)
1793          */
1794         if (ZERONET(saddr))
1795                 goto martian_source;
1796
1797         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1798                 goto martian_destination;
1799
1800         /*
1801          *      Now we are ready to route packet.
1802          */
1803         if ((err = fib_lookup(&fl, &res)) != 0) {
1804                 if (!IN_DEV_FORWARD(in_dev))
1805                         goto e_hostunreach;
1806                 goto no_route;
1807         }
1808         free_res = 1;
1809
1810         RT_CACHE_STAT_INC(in_slow_tot);
1811
1812         if (res.type == RTN_BROADCAST)
1813                 goto brd_input;
1814
1815         if (res.type == RTN_LOCAL) {
1816                 int result;
1817                 result = fib_validate_source(saddr, daddr, tos,
1818                                              loopback_dev.ifindex,
1819                                              dev, &spec_dst, &itag);
1820                 if (result < 0)
1821                         goto martian_source;
1822                 if (result)
1823                         flags |= RTCF_DIRECTSRC;
1824                 spec_dst = daddr;
1825                 goto local_input;
1826         }
1827
1828         if (!IN_DEV_FORWARD(in_dev))
1829                 goto e_hostunreach;
1830         if (res.type != RTN_UNICAST)
1831                 goto martian_destination;
1832
1833         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1834         if (err == -ENOBUFS)
1835                 goto e_nobufs;
1836         if (err == -EINVAL)
1837                 goto e_inval;
1838
1839 done:
1840         in_dev_put(in_dev);
1841         if (free_res)
1842                 fib_res_put(&res);
1843 out:    return err;
1844
1845 brd_input:
1846         if (skb->protocol != htons(ETH_P_IP))
1847                 goto e_inval;
1848
1849         if (ZERONET(saddr))
1850                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1851         else {
1852                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1853                                           &itag);
1854                 if (err < 0)
1855                         goto martian_source;
1856                 if (err)
1857                         flags |= RTCF_DIRECTSRC;
1858         }
1859         flags |= RTCF_BROADCAST;
1860         res.type = RTN_BROADCAST;
1861         RT_CACHE_STAT_INC(in_brd);
1862
1863 local_input:
1864         rth = dst_alloc(&ipv4_dst_ops);
1865         if (!rth)
1866                 goto e_nobufs;
1867
1868         rth->u.dst.output= ip_rt_bug;
1869
1870         atomic_set(&rth->u.dst.__refcnt, 1);
1871         rth->u.dst.flags= DST_HOST;
1872         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1873                 rth->u.dst.flags |= DST_NOPOLICY;
1874         rth->fl.fl4_dst = daddr;
1875         rth->rt_dst     = daddr;
1876         rth->fl.fl4_tos = tos;
1877         rth->fl.mark    = skb->mark;
1878         rth->fl.fl4_src = saddr;
1879         rth->rt_src     = saddr;
1880 #ifdef CONFIG_NET_CLS_ROUTE
1881         rth->u.dst.tclassid = itag;
1882 #endif
1883         rth->rt_iif     =
1884         rth->fl.iif     = dev->ifindex;
1885         rth->u.dst.dev  = &loopback_dev;
1886         dev_hold(rth->u.dst.dev);
1887         rth->idev       = in_dev_get(rth->u.dst.dev);
1888         rth->rt_gateway = daddr;
1889         rth->rt_spec_dst= spec_dst;
1890         rth->u.dst.input= ip_local_deliver;
1891         rth->rt_flags   = flags|RTCF_LOCAL;
1892         if (res.type == RTN_UNREACHABLE) {
1893                 rth->u.dst.input= ip_error;
1894                 rth->u.dst.error= -err;
1895                 rth->rt_flags   &= ~RTCF_LOCAL;
1896         }
1897         rth->rt_type    = res.type;
1898         hash = rt_hash(daddr, saddr, fl.iif);
1899         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1900         goto done;
1901
1902 no_route:
1903         RT_CACHE_STAT_INC(in_no_route);
1904         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1905         res.type = RTN_UNREACHABLE;
1906         goto local_input;
1907
1908         /*
1909          *      Do not cache martian addresses: they should be logged (RFC1812)
1910          */
1911 martian_destination:
1912         RT_CACHE_STAT_INC(in_martian_dst);
1913 #ifdef CONFIG_IP_ROUTE_VERBOSE
1914         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1915                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1916                         "%u.%u.%u.%u, dev %s\n",
1917                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1918 #endif
1919
1920 e_hostunreach:
1921         err = -EHOSTUNREACH;
1922         goto done;
1923
1924 e_inval:
1925         err = -EINVAL;
1926         goto done;
1927
1928 e_nobufs:
1929         err = -ENOBUFS;
1930         goto done;
1931
1932 martian_source:
1933         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1934         goto e_inval;
1935 }
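
/* Editor's note: the "100% guarantee" claimed above hinges on
 * fib_validate_source(), which looks up the packet's *source* address as if
 * routing a reply to it.  An assumed illustration: a packet arrives on eth0
 * claiming one of our local addresses as its source; the reverse lookup does
 * not yield a unicast route, so the packet is logged via
 * ip_handle_martian_source() and dropped instead of being routed.
 */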
1936
1937 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1938                    u8 tos, struct net_device *dev)
1939 {
1940         struct rtable * rth;
1941         unsigned        hash;
1942         int iif = dev->ifindex;
1943
1944         tos &= IPTOS_RT_MASK;
1945         hash = rt_hash(daddr, saddr, iif);
1946
1947         rcu_read_lock();
1948         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1949              rth = rcu_dereference(rth->u.dst.rt_next)) {
1950                 if (rth->fl.fl4_dst == daddr &&
1951                     rth->fl.fl4_src == saddr &&
1952                     rth->fl.iif == iif &&
1953                     rth->fl.oif == 0 &&
1954                     rth->fl.mark == skb->mark &&
1955                     rth->fl.fl4_tos == tos) {
1956                         rth->u.dst.lastuse = jiffies;
1957                         dst_hold(&rth->u.dst);
1958                         rth->u.dst.__use++;
1959                         RT_CACHE_STAT_INC(in_hit);
1960                         rcu_read_unlock();
1961                         skb->dst = (struct dst_entry*)rth;
1962                         return 0;
1963                 }
1964                 RT_CACHE_STAT_INC(in_hlist_search);
1965         }
1966         rcu_read_unlock();
1967
1968         /* Multicast recognition logic was moved from the route cache to here.
1969            The problem was that too many Ethernet cards have broken/missing
1970            hardware multicast filters :-( As a result, a host on a multicast
1971            network acquires a lot of useless route cache entries, e.g. for
1972            SDR messages from all over the world. Now we try to get rid of them.
1973            Really, provided the software IP multicast filter is organized
1974            reasonably (at least, hashed), it is no slower than
1975            route cache reject entries.
1976            Note that multicast routers are not affected, because a
1977            route cache entry is created eventually.
1978          */
1979         if (MULTICAST(daddr)) {
1980                 struct in_device *in_dev;
1981
1982                 rcu_read_lock();
1983                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
1984                         int our = ip_check_mc(in_dev, daddr, saddr,
1985                                 ip_hdr(skb)->protocol);
1986                         if (our
1987 #ifdef CONFIG_IP_MROUTE
1988                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1989 #endif
1990                             ) {
1991                                 rcu_read_unlock();
1992                                 return ip_route_input_mc(skb, daddr, saddr,
1993                                                          tos, dev, our);
1994                         }
1995                 }
1996                 rcu_read_unlock();
1997                 return -EINVAL;
1998         }
1999         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2000 }
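
/* Editor's sketch (the real caller is ip_rcv_finish() in ip_input.c, quoted
 * from memory): every input packet is resolved through the function above,
 * and the resulting skb->dst decides local delivery vs. forwarding:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *
 *	if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev))
 *		goto drop;
 *	return dst_input(skb);
 */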
2001
2002 static inline int __mkroute_output(struct rtable **result,
2003                                    struct fib_result* res,
2004                                    const struct flowi *fl,
2005                                    const struct flowi *oldflp,
2006                                    struct net_device *dev_out,
2007                                    unsigned flags)
2008 {
2009         struct rtable *rth;
2010         struct in_device *in_dev;
2011         u32 tos = RT_FL_TOS(oldflp);
2012         int err = 0;
2013
2014         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2015                 return -EINVAL;
2016
2017         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2018                 res->type = RTN_BROADCAST;
2019         else if (MULTICAST(fl->fl4_dst))
2020                 res->type = RTN_MULTICAST;
2021         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2022                 return -EINVAL;
2023
2024         if (dev_out->flags & IFF_LOOPBACK)
2025                 flags |= RTCF_LOCAL;
2026
2027         /* get a working reference to the inet device */
2028         in_dev = in_dev_get(dev_out);
2029         if (!in_dev)
2030                 return -EINVAL;
2031
2032         if (res->type == RTN_BROADCAST) {
2033                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2034                 if (res->fi) {
2035                         fib_info_put(res->fi);
2036                         res->fi = NULL;
2037                 }
2038         } else if (res->type == RTN_MULTICAST) {
2039                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2040                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2041                                  oldflp->proto))
2042                         flags &= ~RTCF_LOCAL;
2043                 /* If a multicast route does not exist, use
2044                    the default one, but do not gateway in this case.
2045                    Yes, it is a hack.
2046                  */
2047                 if (res->fi && res->prefixlen < 4) {
2048                         fib_info_put(res->fi);
2049                         res->fi = NULL;
2050                 }
2051         }
2052
2053
2054         rth = dst_alloc(&ipv4_dst_ops);
2055         if (!rth) {
2056                 err = -ENOBUFS;
2057                 goto cleanup;
2058         }
2059
2060         atomic_set(&rth->u.dst.__refcnt, 1);
2061         rth->u.dst.flags= DST_HOST;
2062         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2063                 rth->u.dst.flags |= DST_NOXFRM;
2064         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2065                 rth->u.dst.flags |= DST_NOPOLICY;
2066
2067         rth->fl.fl4_dst = oldflp->fl4_dst;
2068         rth->fl.fl4_tos = tos;
2069         rth->fl.fl4_src = oldflp->fl4_src;
2070         rth->fl.oif     = oldflp->oif;
2071         rth->fl.mark    = oldflp->mark;
2072         rth->rt_dst     = fl->fl4_dst;
2073         rth->rt_src     = fl->fl4_src;
2074         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2075         /* get references to the devices that are to be held by the routing
2076            cache entry */
2077         rth->u.dst.dev  = dev_out;
2078         dev_hold(dev_out);
2079         rth->idev       = in_dev_get(dev_out);
2080         rth->rt_gateway = fl->fl4_dst;
2081         rth->rt_spec_dst= fl->fl4_src;
2082
2083         rth->u.dst.output=ip_output;
2084
2085         RT_CACHE_STAT_INC(out_slow_tot);
2086
2087         if (flags & RTCF_LOCAL) {
2088                 rth->u.dst.input = ip_local_deliver;
2089                 rth->rt_spec_dst = fl->fl4_dst;
2090         }
2091         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2092                 rth->rt_spec_dst = fl->fl4_src;
2093                 if (flags & RTCF_LOCAL &&
2094                     !(dev_out->flags & IFF_LOOPBACK)) {
2095                         rth->u.dst.output = ip_mc_output;
2096                         RT_CACHE_STAT_INC(out_slow_mc);
2097                 }
2098 #ifdef CONFIG_IP_MROUTE
2099                 if (res->type == RTN_MULTICAST) {
2100                         if (IN_DEV_MFORWARD(in_dev) &&
2101                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2102                                 rth->u.dst.input = ip_mr_input;
2103                                 rth->u.dst.output = ip_mc_output;
2104                         }
2105                 }
2106 #endif
2107         }
2108
2109         rt_set_nexthop(rth, res, 0);
2110
2111         rth->rt_flags = flags;
2112
2113         *result = rth;
2114  cleanup:
2115         /* release the working reference to the inet device */
2116         in_dev_put(in_dev);
2117
2118         return err;
2119 }
2120
2121 static inline int ip_mkroute_output(struct rtable **rp,
2122                                     struct fib_result* res,
2123                                     const struct flowi *fl,
2124                                     const struct flowi *oldflp,
2125                                     struct net_device *dev_out,
2126                                     unsigned flags)
2127 {
2128         struct rtable *rth = NULL;
2129         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2130         unsigned hash;
2131         if (err == 0) {
2132                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2133                 err = rt_intern_hash(hash, rth, rp);
2134         }
2135
2136         return err;
2137 }
2138
2139 /*
2140  * Major route resolver routine.
2141  */
2142
2143 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2144 {
2145         u32 tos = RT_FL_TOS(oldflp);
2146         struct flowi fl = { .nl_u = { .ip4_u =
2147                                       { .daddr = oldflp->fl4_dst,
2148                                         .saddr = oldflp->fl4_src,
2149                                         .tos = tos & IPTOS_RT_MASK,
2150                                         .scope = ((tos & RTO_ONLINK) ?
2151                                                   RT_SCOPE_LINK :
2152                                                   RT_SCOPE_UNIVERSE),
2153                                       } },
2154                             .mark = oldflp->mark,
2155                             .iif = loopback_dev.ifindex,
2156                             .oif = oldflp->oif };
2157         struct fib_result res;
2158         unsigned flags = 0;
2159         struct net_device *dev_out = NULL;
2160         int free_res = 0;
2161         int err;
2162
2163
2164         res.fi          = NULL;
2165 #ifdef CONFIG_IP_MULTIPLE_TABLES
2166         res.r           = NULL;
2167 #endif
2168
2169         if (oldflp->fl4_src) {
2170                 err = -EINVAL;
2171                 if (MULTICAST(oldflp->fl4_src) ||
2172                     BADCLASS(oldflp->fl4_src) ||
2173                     ZERONET(oldflp->fl4_src))
2174                         goto out;
2175
2176                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2177                 dev_out = ip_dev_find(oldflp->fl4_src);
2178                 if (dev_out == NULL)
2179                         goto out;
2180
2181                 /* I removed the check for oif == dev_out->oif here.
2182                    It was wrong for two reasons:
2183                    1. ip_dev_find(saddr) can return the wrong iface if saddr
2184                       is assigned to multiple interfaces.
2185                    2. Moreover, we are allowed to send packets with the saddr
2186                       of another iface. --ANK
2187                  */
2188
2189                 if (oldflp->oif == 0
2190                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2191                         /* Special hack: the user can direct multicasts
2192                            and limited broadcasts via the desired interface
2193                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2194                            This hack is not just for fun, it allows
2195                            vic, vat and friends to work.
2196                            They bind the socket to loopback, set the ttl to
2197                            zero and expect that it will work.
2198                            From the viewpoint of the routing cache they are
2199                            broken: we are not allowed to build a multicast
2200                            path with a loopback source addr (the routing cache
2201                            cannot know that the ttl is zero, so the packet
2202                            will never leave this host and the route is valid).
2203                            Luckily, this hack is a good workaround.
2204                          */
2205
2206                         fl.oif = dev_out->ifindex;
2207                         goto make_route;
2208                 }
2209                 if (dev_out)
2210                         dev_put(dev_out);
2211                 dev_out = NULL;
2212         }
2213
2214
2215         if (oldflp->oif) {
2216                 dev_out = dev_get_by_index(oldflp->oif);
2217                 err = -ENODEV;
2218                 if (dev_out == NULL)
2219                         goto out;
2220
2221                 /* RACE: Check return value of inet_select_addr instead. */
2222                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2223                         dev_put(dev_out);
2224                         goto out;       /* Wrong error code */
2225                 }
2226
2227                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2228                         if (!fl.fl4_src)
2229                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2230                                                               RT_SCOPE_LINK);
2231                         goto make_route;
2232                 }
2233                 if (!fl.fl4_src) {
2234                         if (MULTICAST(oldflp->fl4_dst))
2235                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2236                                                               fl.fl4_scope);
2237                         else if (!oldflp->fl4_dst)
2238                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2239                                                               RT_SCOPE_HOST);
2240                 }
2241         }
2242
2243         if (!fl.fl4_dst) {
2244                 fl.fl4_dst = fl.fl4_src;
2245                 if (!fl.fl4_dst)
2246                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2247                 if (dev_out)
2248                         dev_put(dev_out);
2249                 dev_out = &loopback_dev;
2250                 dev_hold(dev_out);
2251                 fl.oif = loopback_dev.ifindex;
2252                 res.type = RTN_LOCAL;
2253                 flags |= RTCF_LOCAL;
2254                 goto make_route;
2255         }
2256
2257         if (fib_lookup(&fl, &res)) {
2258                 res.fi = NULL;
2259                 if (oldflp->oif) {
2260                         /* Apparently, the routing tables are wrong. Assume
2261                            that the destination is on-link.
2262
2263                            WHY? DW.
2264                            Because we are allowed to send to an iface
2265                            even if it has NO routes and NO assigned
2266                            addresses. When oif is specified, the routing
2267                            tables are looked up with only one purpose:
2268                            to catch whether the destination is gatewayed
2269                            rather than direct. Moreover, if MSG_DONTROUTE
2270                            is set, we send the packet, ignoring both the
2271                            routing tables and the ifaddr state. --ANK
2272
2273
2274                            We could do this even when oif is unknown
2275                            (IPv6 likely does), but we do not.
2276                          */
2277
2278                         if (fl.fl4_src == 0)
2279                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2280                                                               RT_SCOPE_LINK);
2281                         res.type = RTN_UNICAST;
2282                         goto make_route;
2283                 }
2284                 if (dev_out)
2285                         dev_put(dev_out);
2286                 err = -ENETUNREACH;
2287                 goto out;
2288         }
2289         free_res = 1;
2290
2291         if (res.type == RTN_LOCAL) {
2292                 if (!fl.fl4_src)
2293                         fl.fl4_src = fl.fl4_dst;
2294                 if (dev_out)
2295                         dev_put(dev_out);
2296                 dev_out = &loopback_dev;
2297                 dev_hold(dev_out);
2298                 fl.oif = dev_out->ifindex;
2299                 if (res.fi)
2300                         fib_info_put(res.fi);
2301                 res.fi = NULL;
2302                 flags |= RTCF_LOCAL;
2303                 goto make_route;
2304         }
2305
2306 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2307         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2308                 fib_select_multipath(&fl, &res);
2309         else
2310 #endif
2311         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2312                 fib_select_default(&fl, &res);
2313
2314         if (!fl.fl4_src)
2315                 fl.fl4_src = FIB_RES_PREFSRC(res);
2316
2317         if (dev_out)
2318                 dev_put(dev_out);
2319         dev_out = FIB_RES_DEV(res);
2320         dev_hold(dev_out);
2321         fl.oif = dev_out->ifindex;
2322
2323
2324 make_route:
2325         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2326
2327
2328         if (free_res)
2329                 fib_res_put(&res);
2330         if (dev_out)
2331                 dev_put(dev_out);
2332 out:    return err;
2333 }
2334
2335 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2336 {
2337         unsigned hash;
2338         struct rtable *rth;
2339
2340         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2341
2342         rcu_read_lock_bh();
2343         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2344                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2345                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2346                     rth->fl.fl4_src == flp->fl4_src &&
2347                     rth->fl.iif == 0 &&
2348                     rth->fl.oif == flp->oif &&
2349                     rth->fl.mark == flp->mark &&
2350                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2351                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2352                         rth->u.dst.lastuse = jiffies;
2353                         dst_hold(&rth->u.dst);
2354                         rth->u.dst.__use++;
2355                         RT_CACHE_STAT_INC(out_hit);
2356                         rcu_read_unlock_bh();
2357                         *rp = rth;
2358                         return 0;
2359                 }
2360                 RT_CACHE_STAT_INC(out_hlist_search);
2361         }
2362         rcu_read_unlock_bh();
2363
2364         return ip_route_output_slow(rp, flp);
2365 }
2366
2367 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2368
2369 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2370 {
2371 }
2372
2373 static struct dst_ops ipv4_dst_blackhole_ops = {
2374         .family                 =       AF_INET,
2375         .protocol               =       __constant_htons(ETH_P_IP),
2376         .destroy                =       ipv4_dst_destroy,
2377         .check                  =       ipv4_dst_check,
2378         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2379         .entry_size             =       sizeof(struct rtable),
2380 };
2381
2382
2383 static int ipv4_blackhole_output(struct sk_buff *skb)
2384 {
2385         kfree_skb(skb);
2386         return 0;
2387 }
2388
2389 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2390 {
2391         struct rtable *ort = *rp;
2392         struct rtable *rt = (struct rtable *)
2393                 dst_alloc(&ipv4_dst_blackhole_ops);
2394
2395         if (rt) {
2396                 struct dst_entry *new = &rt->u.dst;
2397
2398                 atomic_set(&new->__refcnt, 1);
2399                 new->__use = 1;
2400                 new->input = ipv4_blackhole_output;
2401                 new->output = ipv4_blackhole_output;
2402                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2403
2404                 new->dev = ort->u.dst.dev;
2405                 if (new->dev)
2406                         dev_hold(new->dev);
2407
2408                 rt->fl = ort->fl;
2409
2410                 rt->idev = ort->idev;
2411                 if (rt->idev)
2412                         in_dev_hold(rt->idev);
2413                 rt->rt_flags = ort->rt_flags;
2414                 rt->rt_type = ort->rt_type;
2415                 rt->rt_dst = ort->rt_dst;
2416                 rt->rt_src = ort->rt_src;
2417                 rt->rt_iif = ort->rt_iif;
2418                 rt->rt_gateway = ort->rt_gateway;
2419                 rt->rt_spec_dst = ort->rt_spec_dst;
2420                 rt->peer = ort->peer;
2421                 if (rt->peer)
2422                         atomic_inc(&rt->peer->refcnt);
2423
2424                 dst_free(new);
2425         }
2426
2427         dst_release(&(*rp)->u.dst);
2428         *rp = rt;
2429         return (rt ? 0 : -ENOMEM);
2430 }
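
/* Editor's note: the blackhole route clones an existing route but points
 * both input and output at ipv4_blackhole_output(), which silently frees
 * every packet.  It is substituted below when __xfrm_lookup() returns
 * -EREMOTE, i.e. (as far as the editor can tell) while IPsec state for the
 * flow is not yet resolved, so packets are dropped rather than erroring out.
 */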
2431
2432 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2433 {
2434         int err;
2435
2436         if ((err = __ip_route_output_key(rp, flp)) != 0)
2437                 return err;
2438
2439         if (flp->proto) {
2440                 if (!flp->fl4_src)
2441                         flp->fl4_src = (*rp)->rt_src;
2442                 if (!flp->fl4_dst)
2443                         flp->fl4_dst = (*rp)->rt_dst;
2444                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2445                 if (err == -EREMOTE)
2446                         err = ipv4_dst_blackhole(rp, flp, sk);
2447
2448                 return err;
2449         }
2450
2451         return 0;
2452 }
2453
2454 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2455
2456 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2457 {
2458         return ip_route_output_flow(rp, flp, NULL, 0);
2459 }
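
/* Editor's sketch (not in the original): typical in-kernel usage when only
 * the destination is known ("dip" is a hypothetical __be32 variable):
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dip } } };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(&rt, &fl))
 *		return -EHOSTUNREACH;
 *	(... use rt->rt_src, rt->u.dst.dev ...)
 *	ip_rt_put(rt);
 *
 * ip_rt_put() drops the reference the lookup took on the cache entry.
 */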
2460
2461 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2462                         int nowait, unsigned int flags)
2463 {
2464         struct rtable *rt = (struct rtable*)skb->dst;
2465         struct rtmsg *r;
2466         struct nlmsghdr *nlh;
2467         long expires;
2468         u32 id = 0, ts = 0, tsage = 0, error;
2469
2470         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2471         if (nlh == NULL)
2472                 return -EMSGSIZE;
2473
2474         r = nlmsg_data(nlh);
2475         r->rtm_family    = AF_INET;
2476         r->rtm_dst_len  = 32;
2477         r->rtm_src_len  = 0;
2478         r->rtm_tos      = rt->fl.fl4_tos;
2479         r->rtm_table    = RT_TABLE_MAIN;
2480         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2481         r->rtm_type     = rt->rt_type;
2482         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2483         r->rtm_protocol = RTPROT_UNSPEC;
2484         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2485         if (rt->rt_flags & RTCF_NOTIFY)
2486                 r->rtm_flags |= RTM_F_NOTIFY;
2487
2488         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2489
2490         if (rt->fl.fl4_src) {
2491                 r->rtm_src_len = 32;
2492                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2493         }
2494         if (rt->u.dst.dev)
2495                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2496 #ifdef CONFIG_NET_CLS_ROUTE
2497         if (rt->u.dst.tclassid)
2498                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2499 #endif
2500         if (rt->fl.iif)
2501                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2502         else if (rt->rt_src != rt->fl.fl4_src)
2503                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2504
2505         if (rt->rt_dst != rt->rt_gateway)
2506                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2507
2508         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2509                 goto nla_put_failure;
2510
2511         error = rt->u.dst.error;
2512         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2513         if (rt->peer) {
2514                 id = rt->peer->ip_id_count;
2515                 if (rt->peer->tcp_ts_stamp) {
2516                         ts = rt->peer->tcp_ts;
2517                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2518                 }
2519         }
2520
2521         if (rt->fl.iif) {
2522 #ifdef CONFIG_IP_MROUTE
2523                 __be32 dst = rt->rt_dst;
2524
2525                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2526                     IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2527                         int err = ipmr_get_route(skb, r, nowait);
2528                         if (err <= 0) {
2529                                 if (!nowait) {
2530                                         if (err == 0)
2531                                                 return 0;
2532                                         goto nla_put_failure;
2533                                 } else {
2534                                         if (err == -EMSGSIZE)
2535                                                 goto nla_put_failure;
2536                                         error = err;
2537                                 }
2538                         }
2539                 } else
2540 #endif
2541                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2542         }
2543
2544         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2545                                expires, error) < 0)
2546                 goto nla_put_failure;
2547
2548         return nlmsg_end(skb, nlh);
2549
2550 nla_put_failure:
2551         nlmsg_cancel(skb, nlh);
2552         return -EMSGSIZE;
2553 }
2554
2555 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2556 {
2557         struct rtmsg *rtm;
2558         struct nlattr *tb[RTA_MAX+1];
2559         struct rtable *rt = NULL;
2560         __be32 dst = 0;
2561         __be32 src = 0;
2562         u32 iif;
2563         int err;
2564         struct sk_buff *skb;
2565
2566         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2567         if (err < 0)
2568                 goto errout;
2569
2570         rtm = nlmsg_data(nlh);
2571
2572         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2573         if (skb == NULL) {
2574                 err = -ENOBUFS;
2575                 goto errout;
2576         }
2577
2578         /* Reserve room for dummy headers; this skb can pass
2579            through a good chunk of the routing engine.
2580          */
2581         skb_reset_mac_header(skb);
2582         skb_reset_network_header(skb);
2583
2584         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2585         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2586         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2587
2588         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2589         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2590         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2591
2592         if (iif) {
2593                 struct net_device *dev;
2594
2595                 dev = __dev_get_by_index(iif);
2596                 if (dev == NULL) {
2597                         err = -ENODEV;
2598                         goto errout_free;
2599                 }
2600
2601                 skb->protocol   = htons(ETH_P_IP);
2602                 skb->dev        = dev;
2603                 local_bh_disable();
2604                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2605                 local_bh_enable();
2606
2607                 rt = (struct rtable*) skb->dst;
2608                 if (err == 0 && rt->u.dst.error)
2609                         err = -rt->u.dst.error;
2610         } else {
2611                 struct flowi fl = {
2612                         .nl_u = {
2613                                 .ip4_u = {
2614                                         .daddr = dst,
2615                                         .saddr = src,
2616                                         .tos = rtm->rtm_tos,
2617                                 },
2618                         },
2619                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2620                 };
2621                 err = ip_route_output_key(&rt, &fl);
2622         }
2623
2624         if (err)
2625                 goto errout_free;
2626
2627         skb->dst = &rt->u.dst;
2628         if (rtm->rtm_flags & RTM_F_NOTIFY)
2629                 rt->rt_flags |= RTCF_NOTIFY;
2630
2631         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2632                                 RTM_NEWROUTE, 0, 0);
2633         if (err <= 0)
2634                 goto errout_free;
2635
2636         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2637 errout:
2638         return err;
2639
2640 errout_free:
2641         kfree_skb(skb);
2642         goto errout;
2643 }
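
/* Editor's note: this handler is what userspace "ip route get <addr>"
 * exercises.  A minimal request (userspace side, sketched) is an
 * RTM_GETROUTE netlink message whose payload is a struct rtmsg followed by
 * an RTA_DST attribute; the reply sent via rtnl_unicast() above is built by
 * rt_fill_info().
 */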
2644
2645 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2646 {
2647         struct rtable *rt;
2648         int h, s_h;
2649         int idx, s_idx;
2650
2651         s_h = cb->args[0];
2652         s_idx = idx = cb->args[1];
2653         for (h = 0; h <= rt_hash_mask; h++) {
2654                 if (h < s_h) continue;
2655                 if (h > s_h)
2656                         s_idx = 0;
2657                 rcu_read_lock_bh();
2658                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2659                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2660                         if (idx < s_idx)
2661                                 continue;
2662                         skb->dst = dst_clone(&rt->u.dst);
2663                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2664                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2665                                          1, NLM_F_MULTI) <= 0) {
2666                                 dst_release(xchg(&skb->dst, NULL));
2667                                 rcu_read_unlock_bh();
2668                                 goto done;
2669                         }
2670                         dst_release(xchg(&skb->dst, NULL));
2671                 }
2672                 rcu_read_unlock_bh();
2673         }
2674
2675 done:
2676         cb->args[0] = h;
2677         cb->args[1] = idx;
2678         return skb->len;
2679 }
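
/* Editor's note: cb->args[0] and cb->args[1] persist the hash bucket and
 * chain index between dump callbacks, so a multi-part dump resumes where
 * the previous skb filled up (hence the NLM_F_MULTI flag above).
 */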
2680
2681 void ip_rt_multicast_event(struct in_device *in_dev)
2682 {
2683         rt_cache_flush(0);
2684 }
2685
2686 #ifdef CONFIG_SYSCTL
2687 static int flush_delay;
2688
2689 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2690                                         struct file *filp, void __user *buffer,
2691                                         size_t *lenp, loff_t *ppos)
2692 {
2693         if (write) {
2694                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2695                 rt_cache_flush(flush_delay);
2696                 return 0;
2697         }
2698
2699         return -EINVAL;
2700 }
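
/* Editor's note: usage example for the handler above; writing any integer
 * flushes the routing cache, with the written value handed to
 * rt_cache_flush() as the delay:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * Reads of this file are rejected with -EINVAL.
 */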
2701
2702 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2703                                                 int __user *name,
2704                                                 int nlen,
2705                                                 void __user *oldval,
2706                                                 size_t __user *oldlenp,
2707                                                 void __user *newval,
2708                                                 size_t newlen)
2709 {
2710         int delay;
2711         if (newlen != sizeof(int))
2712                 return -EINVAL;
2713         if (get_user(delay, (int __user *)newval))
2714                 return -EFAULT;
2715         rt_cache_flush(delay);
2716         return 0;
2717 }
2718
2719 ctl_table ipv4_route_table[] = {
2720         {
2721                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2722                 .procname       = "flush",
2723                 .data           = &flush_delay,
2724                 .maxlen         = sizeof(int),
2725                 .mode           = 0200,
2726                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2727                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2728         },
2729         {
2730                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2731                 .procname       = "min_delay",
2732                 .data           = &ip_rt_min_delay,
2733                 .maxlen         = sizeof(int),
2734                 .mode           = 0644,
2735                 .proc_handler   = &proc_dointvec_jiffies,
2736                 .strategy       = &sysctl_jiffies,
2737         },
2738         {
2739                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2740                 .procname       = "max_delay",
2741                 .data           = &ip_rt_max_delay,
2742                 .maxlen         = sizeof(int),
2743                 .mode           = 0644,
2744                 .proc_handler   = &proc_dointvec_jiffies,
2745                 .strategy       = &sysctl_jiffies,
2746         },
2747         {
2748                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2749                 .procname       = "gc_thresh",
2750                 .data           = &ipv4_dst_ops.gc_thresh,
2751                 .maxlen         = sizeof(int),
2752                 .mode           = 0644,
2753                 .proc_handler   = &proc_dointvec,
2754         },
2755         {
2756                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2757                 .procname       = "max_size",
2758                 .data           = &ip_rt_max_size,
2759                 .maxlen         = sizeof(int),
2760                 .mode           = 0644,
2761                 .proc_handler   = &proc_dointvec,
2762         },
2763         {
2764                 /*  Deprecated. Use gc_min_interval_ms */
2765
2766                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2767                 .procname       = "gc_min_interval",
2768                 .data           = &ip_rt_gc_min_interval,
2769                 .maxlen         = sizeof(int),
2770                 .mode           = 0644,
2771                 .proc_handler   = &proc_dointvec_jiffies,
2772                 .strategy       = &sysctl_jiffies,
2773         },
2774         {
2775                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2776                 .procname       = "gc_min_interval_ms",
2777                 .data           = &ip_rt_gc_min_interval,
2778                 .maxlen         = sizeof(int),
2779                 .mode           = 0644,
		.proc_handler	= &proc_dointvec_ms_jiffies,
		.strategy	= &sysctl_ms_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
		.procname	= "secret_interval",
		.data		= &ip_rt_secret_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{ .ctl_name = 0 }
};
#endif
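
/*
 * The table above is wired into the generic sysctl machinery and shows
 * up under /proc/sys/net/ipv4/route/.  Entries using the jiffies
 * handlers convert units at the boundary: proc_dointvec_jiffies takes
 * seconds from userspace and stores jiffies, proc_dointvec_ms_jiffies
 * takes milliseconds.  A rough tuning sketch from userspace (the
 * values are illustrative only):
 *
 *	# echo 300 > /proc/sys/net/ipv4/route/gc_timeout	(300 s)
 *	# sysctl -w net.ipv4.route.min_pmtu=552
 */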

#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/* IP route accounting ptr for this logical cpu number.  Note the
 * macro argument is parenthesized so that expression arguments do
 * not bind wrongly against the multiplication. */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
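
/*
 * ip_rt_acct is a single flat allocation carved into NR_CPUS slices of
 * 256 counters each, one counter per routing realm.  A sketch of how a
 * consumer indexes its slice (modelled on the accounting done in
 * ip_rcv_finish(); the exact tclassid packing shown is an assumption
 * of this illustration):
 *
 *	struct ip_rt_acct *st = IP_RT_ACCT_CPU(smp_processor_id());
 *	u32 idx = skb->dst->tclassid;
 *	st[idx & 0xFF].o_packets++;		   /+ low byte: output realm +/
 *	st[(idx >> 16) & 0xFF].i_packets++;	   /+ high half: input realm +/
 */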

#ifdef CONFIG_PROC_FS
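/*
 * Legacy read_proc interface: the caller supplies a byte offset and
 * length into a virtual file whose contents are the 256-realm
 * accounting table summed over all CPUs.  We copy at most @length
 * bytes into @buffer, point *start at the data, set *eof on the final
 * chunk and return the number of bytes produced.  Offsets and lengths
 * must be u32-aligned so that the summation can work word by word.
 */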
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	unsigned int i;

	if ((offset & 3) || (length & 3))
		return -EIO;

	if (offset >= sizeof(struct ip_rt_acct) * 256) {
		*eof = 1;
		return 0;
	}

	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
		length = sizeof(struct ip_rt_acct) * 256 - offset;
		*eof = 1;
	}

	offset /= sizeof(u32);

	if (length > 0) {
		u32 *dst = (u32 *) buffer;

		*start = buffer;
		memset(dst, 0, length);

		/* Start from zero and sum every possible cpu in, one
		 * word at a time; copying cpu 0 first and then adding
		 * in all possible cpus would count cpu 0 twice.
		 */
		for_each_possible_cpu(i) {
			unsigned int j;
			u32 *src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;

			for (j = 0; j < length/4; j++)
				dst[j] += src[j];
		}
	}
	return length;
}
#endif /* CONFIG_PROC_FS */
#endif /* CONFIG_NET_CLS_ROUTE */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
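
/*
 * Booting with e.g. "rhash_entries=65536" on the kernel command line
 * pins the route cache hash at that many buckets.  When the option is
 * absent, rhash_entries stays zero and alloc_large_system_hash() in
 * ip_rt_init() below sizes the table from available memory instead.
 */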

int __init ip_rt_init(void)
{
	int rc = 0;

	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
			     (jiffies ^ (jiffies >> 7)));

#ifdef CONFIG_NET_CLS_ROUTE
	{
	int order;
	for (order = 0;
	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
		/* NOTHING */;
	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
	memset(ip_rt_acct, 0, PAGE_SIZE << order);
	}
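	/*
	 * Worked example for the order search above, assuming 4 KiB
	 * pages, NR_CPUS == 32 and a 16-byte struct ip_rt_acct (four
	 * u32 counters): the table needs 256 * 16 * 32 = 128 KiB, so
	 * the loop stops at order 5, i.e. 32 contiguous pages.
	 */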
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(num_physpages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					0);
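	/*
	 * The 15-vs-17 scale argument asks alloc_large_system_hash()
	 * for roughly one bucket per 2^15 bytes of low memory on
	 * machines with at least 128 * 1024 pages (512 MiB, assuming
	 * 4 KiB pages) and one per 2^17 bytes on smaller ones.  The
	 * size actually chosen comes back through rt_hash_log and
	 * rt_hash_mask, which the gc_thresh and ip_rt_max_size
	 * computations below key off.
	 */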
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	init_timer(&rt_flush_timer);
	rt_flush_timer.function = rt_run_flush;
	init_timer(&rt_periodic_timer);
	rt_periodic_timer.function = rt_check_expire;
	init_timer(&rt_secret_timer);
	rt_secret_timer.function = rt_secret_rebuild;

	/* All the timers started at system startup tend to
	   synchronize.  Perturb them a bit: each first expiry below
	   lands a random fraction of an interval past one full one.
	 */
	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
					ip_rt_gc_interval;
	add_timer(&rt_periodic_timer);

	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
		ip_rt_secret_interval;
	add_timer(&rt_secret_timer);

#ifdef CONFIG_PROC_FS
	{
	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
	if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
					     init_net.proc_net_stat))) {
		return -ENOMEM;
	}
	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
	}
#ifdef CONFIG_NET_CLS_ROUTE
	create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
#endif
#endif
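	/*
	 * The entries registered above surface as /proc/net/rt_cache
	 * (a dump of the route cache), /proc/net/stat/rt_cache
	 * (per-cpu cache statistics) and, under CONFIG_NET_CLS_ROUTE,
	 * the binary /proc/net/rt_acct realm counters that userspace
	 * tools such as iproute2's rtacct are expected to decode.
	 */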
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

	return rc;
}

EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);