[PATCH] IPV4 : Move ip route cache flush (secret_rebuild) from softirq to workqueue
[net-next-2.6.git] net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112
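/*
 * RT_FL_TOS() keeps only the TOS bits relevant to routing plus the
 * RTO_ONLINK flag, which is carried alongside the TOS in fl4_tos.
 */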
113 #define RT_FL_TOS(oldflp) \
114     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116 #define IP_MAX_MTU      0xFFF0
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
120 static int ip_rt_min_delay              = 2 * HZ;
121 static int ip_rt_max_delay              = 10 * HZ;
122 static int ip_rt_max_size;
123 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
124 static int ip_rt_gc_interval            = 60 * HZ;
125 static int ip_rt_gc_min_interval        = HZ / 2;
126 static int ip_rt_redirect_number        = 9;
127 static int ip_rt_redirect_load          = HZ / 50;
128 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
129 static int ip_rt_error_cost             = HZ;
130 static int ip_rt_error_burst            = 5 * HZ;
131 static int ip_rt_gc_elasticity          = 8;
132 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
133 static int ip_rt_min_pmtu               = 512 + 20 + 20;
134 static int ip_rt_min_advmss             = 256;
135 static int ip_rt_secret_interval        = 10 * 60 * HZ;
136 static int ip_rt_flush_expected;
137 static unsigned long rt_deadline;
138
139 #define RTprint(a...)   printk(KERN_DEBUG a)
140
141 static struct timer_list rt_flush_timer;
142 static void rt_worker_func(struct work_struct *work);
143 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
144 static struct timer_list rt_secret_timer;
145
146 /*
147  *      Interface to generic destination cache.
148  */
149
150 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
151 static void              ipv4_dst_destroy(struct dst_entry *dst);
152 static void              ipv4_dst_ifdown(struct dst_entry *dst,
153                                          struct net_device *dev, int how);
154 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
155 static void              ipv4_link_failure(struct sk_buff *skb);
156 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
157 static int rt_garbage_collect(void);
158
159
160 static struct dst_ops ipv4_dst_ops = {
161         .family =               AF_INET,
162         .protocol =             __constant_htons(ETH_P_IP),
163         .gc =                   rt_garbage_collect,
164         .check =                ipv4_dst_check,
165         .destroy =              ipv4_dst_destroy,
166         .ifdown =               ipv4_dst_ifdown,
167         .negative_advice =      ipv4_negative_advice,
168         .link_failure =         ipv4_link_failure,
169         .update_pmtu =          ip_rt_update_pmtu,
170         .local_out =            ip_local_out,
171         .entry_size =           sizeof(struct rtable),
172 };
173
174 #define ECN_OR_COST(class)      TC_PRIO_##class
175
176 const __u8 ip_tos2prio[16] = {
177         TC_PRIO_BESTEFFORT,
178         ECN_OR_COST(FILLER),
179         TC_PRIO_BESTEFFORT,
180         ECN_OR_COST(BESTEFFORT),
181         TC_PRIO_BULK,
182         ECN_OR_COST(BULK),
183         TC_PRIO_BULK,
184         ECN_OR_COST(BULK),
185         TC_PRIO_INTERACTIVE,
186         ECN_OR_COST(INTERACTIVE),
187         TC_PRIO_INTERACTIVE,
188         ECN_OR_COST(INTERACTIVE),
189         TC_PRIO_INTERACTIVE_BULK,
190         ECN_OR_COST(INTERACTIVE_BULK),
191         TC_PRIO_INTERACTIVE_BULK,
192         ECN_OR_COST(INTERACTIVE_BULK)
193 };
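/*
 * Each pair of entries covers one TOS value with and without the ECN bit;
 * ECN_OR_COST() above simply maps back to the same traffic-control class.
 * (The table is indexed by the TOS field, see rt_tos2priority() in
 * include/net/route.h.)
 */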
194
195
196 /*
197  * Route cache.
198  */
199
200 /* The locking scheme is rather straightforward:
201  *
202  * 1) Read-Copy Update protects the buckets of the central route hash.
203  * 2) Only writers remove entries, and they hold the lock
204  *    as they look at rtable reference counts.
205  * 3) Only readers acquire references to rtable entries,
206  *    they do so with atomic increments and with the
207  *    lock held.
208  */
209
210 struct rt_hash_bucket {
211         struct rtable   *chain;
212 };
213 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
214         defined(CONFIG_PROVE_LOCKING)
215 /*
216  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
217  * The size of this table is a power of two and depends on the number of CPUs.
218  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
219  */
220 #ifdef CONFIG_LOCKDEP
221 # define RT_HASH_LOCK_SZ        256
222 #else
223 # if NR_CPUS >= 32
224 #  define RT_HASH_LOCK_SZ       4096
225 # elif NR_CPUS >= 16
226 #  define RT_HASH_LOCK_SZ       2048
227 # elif NR_CPUS >= 8
228 #  define RT_HASH_LOCK_SZ       1024
229 # elif NR_CPUS >= 4
230 #  define RT_HASH_LOCK_SZ       512
231 # else
232 #  define RT_HASH_LOCK_SZ       256
233 # endif
234 #endif
235
236 static spinlock_t       *rt_hash_locks;
237 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
238 # define rt_hash_lock_init()    { \
239                 int i; \
240                 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
241                 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
242                 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
243                         spin_lock_init(&rt_hash_locks[i]); \
244                 }
245 #else
246 # define rt_hash_lock_addr(slot) NULL
247 # define rt_hash_lock_init()
248 #endif
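/*
 * Note: rt_hash_lock_addr() masks the bucket index with RT_HASH_LOCK_SZ - 1,
 * so on large hash tables several buckets share one spinlock.
 */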
249
250 static struct rt_hash_bucket    *rt_hash_table;
251 static unsigned                 rt_hash_mask;
252 static unsigned int             rt_hash_log;
253 static unsigned int             rt_hash_rnd;
254
255 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
256 #define RT_CACHE_STAT_INC(field) \
257         (__raw_get_cpu_var(rt_cache_stat).field++)
258
259 static int rt_intern_hash(unsigned hash, struct rtable *rth,
260                                 struct rtable **res);
261
262 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
263 {
264         return (jhash_2words(daddr, saddr, rt_hash_rnd)
265                 & rt_hash_mask);
266 }
267
268 #define rt_hash(daddr, saddr, idx) \
269         rt_hash_code((__force u32)(__be32)(daddr),\
270                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
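/*
 * The cache hash mixes destination, source and the (shifted) interface
 * index with the per-boot secret rt_hash_rnd via jhash_2words(), then
 * masks the result with rt_hash_mask to pick a bucket.
 */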
271
272 #ifdef CONFIG_PROC_FS
273 struct rt_cache_iter_state {
274         int bucket;
275 };
276
277 static struct rtable *rt_cache_get_first(struct seq_file *seq)
278 {
279         struct rtable *r = NULL;
280         struct rt_cache_iter_state *st = seq->private;
281
282         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
283                 rcu_read_lock_bh();
284                 r = rt_hash_table[st->bucket].chain;
285                 if (r)
286                         break;
287                 rcu_read_unlock_bh();
288         }
289         return rcu_dereference(r);
290 }
291
292 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
293 {
294         struct rt_cache_iter_state *st = seq->private;
295
296         r = r->u.dst.rt_next;
297         while (!r) {
298                 rcu_read_unlock_bh();
299                 if (--st->bucket < 0)
300                         break;
301                 rcu_read_lock_bh();
302                 r = rt_hash_table[st->bucket].chain;
303         }
304         return rcu_dereference(r);
305 }
306
307 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
308 {
309         struct rtable *r = rt_cache_get_first(seq);
310
311         if (r)
312                 while (pos && (r = rt_cache_get_next(seq, r)))
313                         --pos;
314         return pos ? NULL : r;
315 }
316
317 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
318 {
319         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
320 }
321
322 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
323 {
324         struct rtable *r = NULL;
325
326         if (v == SEQ_START_TOKEN)
327                 r = rt_cache_get_first(seq);
328         else
329                 r = rt_cache_get_next(seq, v);
330         ++*pos;
331         return r;
332 }
333
334 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
335 {
336         if (v && v != SEQ_START_TOKEN)
337                 rcu_read_unlock_bh();
338 }
339
340 static int rt_cache_seq_show(struct seq_file *seq, void *v)
341 {
342         if (v == SEQ_START_TOKEN)
343                 seq_printf(seq, "%-127s\n",
344                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
345                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
346                            "HHUptod\tSpecDst");
347         else {
348                 struct rtable *r = v;
349                 char temp[256];
350
351                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
352                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
353                         r->u.dst.dev ? r->u.dst.dev->name : "*",
354                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
355                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
356                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
357                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
358                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
359                         dst_metric(&r->u.dst, RTAX_WINDOW),
360                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
361                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
362                         r->fl.fl4_tos,
363                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
364                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
365                                        dev_queue_xmit) : 0,
366                         r->rt_spec_dst);
367                 seq_printf(seq, "%-127s\n", temp);
368         }
369         return 0;
370 }
371
372 static const struct seq_operations rt_cache_seq_ops = {
373         .start  = rt_cache_seq_start,
374         .next   = rt_cache_seq_next,
375         .stop   = rt_cache_seq_stop,
376         .show   = rt_cache_seq_show,
377 };
378
379 static int rt_cache_seq_open(struct inode *inode, struct file *file)
380 {
381         return seq_open_private(file, &rt_cache_seq_ops,
382                         sizeof(struct rt_cache_iter_state));
383 }
384
385 static const struct file_operations rt_cache_seq_fops = {
386         .owner   = THIS_MODULE,
387         .open    = rt_cache_seq_open,
388         .read    = seq_read,
389         .llseek  = seq_lseek,
390         .release = seq_release_private,
391 };
392
393
394 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
395 {
396         int cpu;
397
398         if (*pos == 0)
399                 return SEQ_START_TOKEN;
400
401         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
402                 if (!cpu_possible(cpu))
403                         continue;
404                 *pos = cpu+1;
405                 return &per_cpu(rt_cache_stat, cpu);
406         }
407         return NULL;
408 }
409
410 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
411 {
412         int cpu;
413
414         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
415                 if (!cpu_possible(cpu))
416                         continue;
417                 *pos = cpu+1;
418                 return &per_cpu(rt_cache_stat, cpu);
419         }
420         return NULL;
421
422 }
423
424 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
425 {
426
427 }
428
429 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
430 {
431         struct rt_cache_stat *st = v;
432
433         if (v == SEQ_START_TOKEN) {
434                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
435                 return 0;
436         }
437
438         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
439                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
440                    atomic_read(&ipv4_dst_ops.entries),
441                    st->in_hit,
442                    st->in_slow_tot,
443                    st->in_slow_mc,
444                    st->in_no_route,
445                    st->in_brd,
446                    st->in_martian_dst,
447                    st->in_martian_src,
448
449                    st->out_hit,
450                    st->out_slow_tot,
451                    st->out_slow_mc,
452
453                    st->gc_total,
454                    st->gc_ignored,
455                    st->gc_goal_miss,
456                    st->gc_dst_overflow,
457                    st->in_hlist_search,
458                    st->out_hlist_search
459                 );
460         return 0;
461 }
462
463 static const struct seq_operations rt_cpu_seq_ops = {
464         .start  = rt_cpu_seq_start,
465         .next   = rt_cpu_seq_next,
466         .stop   = rt_cpu_seq_stop,
467         .show   = rt_cpu_seq_show,
468 };
469
470
471 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
472 {
473         return seq_open(file, &rt_cpu_seq_ops);
474 }
475
476 static const struct file_operations rt_cpu_seq_fops = {
477         .owner   = THIS_MODULE,
478         .open    = rt_cpu_seq_open,
479         .read    = seq_read,
480         .llseek  = seq_lseek,
481         .release = seq_release,
482 };
483
484 #endif /* CONFIG_PROC_FS */
485
486 static __inline__ void rt_free(struct rtable *rt)
487 {
488         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
489 }
490
491 static __inline__ void rt_drop(struct rtable *rt)
492 {
493         ip_rt_put(rt);
494         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
495 }
496
497 static __inline__ int rt_fast_clean(struct rtable *rth)
498 {
499         /* Kill broadcast/multicast entries very aggressively if they
500            collide in the hash table with more useful entries. */
501         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
502                 rth->fl.iif && rth->u.dst.rt_next;
503 }
504
505 static __inline__ int rt_valuable(struct rtable *rth)
506 {
507         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
508                 rth->u.dst.expires;
509 }
510
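/*
 * rt_may_expire() decides whether an entry may be aged out: entries still
 * referenced never expire, and entries whose hard expiry has passed always
 * may.  Otherwise an entry is kept while it is younger than tmo1 and not a
 * fast-clean candidate (a colliding broadcast/multicast entry), or while it
 * is younger than tmo2 and "valuable" (redirected, notify, or with expiry).
 */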
511 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
512 {
513         unsigned long age;
514         int ret = 0;
515
516         if (atomic_read(&rth->u.dst.__refcnt))
517                 goto out;
518
519         ret = 1;
520         if (rth->u.dst.expires &&
521             time_after_eq(jiffies, rth->u.dst.expires))
522                 goto out;
523
524         age = jiffies - rth->u.dst.lastuse;
525         ret = 0;
526         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
527             (age <= tmo2 && rt_valuable(rth)))
528                 goto out;
529         ret = 1;
530 out:    return ret;
531 }
532
533 /* Bits of score are:
534  * 31: very valuable
535  * 30: not quite useless
536  * 29..0: usage counter
537  */
538 static inline u32 rt_score(struct rtable *rt)
539 {
540         u32 score = jiffies - rt->u.dst.lastuse;
541
542         score = ~score & ~(3<<30);
543
544         if (rt_valuable(rt))
545                 score |= (1<<31);
546
547         if (!rt->fl.iif ||
548             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
549                 score |= (1<<30);
550
551         return score;
552 }
553
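/*
 * compare_keys() matches two flow keys without branches: it XORs each pair
 * of fields and ORs the results, so the keys are equal iff the final value
 * is zero.  The 16-bit load covers fl4_tos plus the byte packed next to it
 * in struct flowi.
 */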
554 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
555 {
556         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
557                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
558                 (fl1->mark ^ fl2->mark) |
559                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
560                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
561                 (fl1->oif ^ fl2->oif) |
562                 (fl1->iif ^ fl2->iif)) == 0;
563 }
564
565 /*
566  * Perform a full scan of the hash table and free all entries.
567  * Can be called from softirq or process context.
568  * In the latter case, we reschedule if necessary.
569  */
570 static void rt_do_flush(int process_context)
571 {
572         unsigned int i;
573         struct rtable *rth, *next;
574
575         for (i = 0; i <= rt_hash_mask; i++) {
576                 if (process_context && need_resched())
577                         cond_resched();
578                 rth = rt_hash_table[i].chain;
579                 if (!rth)
580                         continue;
581
582                 spin_lock_bh(rt_hash_lock_addr(i));
583                 rth = rt_hash_table[i].chain;
584                 rt_hash_table[i].chain = NULL;
585                 spin_unlock_bh(rt_hash_lock_addr(i));
586
587                 for (; rth; rth = next) {
588                         next = rth->u.dst.rt_next;
589                         rt_free(rth);
590                 }
591         }
592 }
593
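/*
 * rt_check_expire() scans only a slice of the hash table per invocation:
 * the number of buckets examined ("goal") is ip_rt_gc_interval scaled by
 * the table size and divided by ip_rt_gc_timeout.  Within a chain the
 * allowed age is halved for every entry that is kept, and entries that
 * rt_may_expire() approves are unlinked and freed.
 */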
594 static void rt_check_expire(void)
595 {
596         static unsigned int rover;
597         unsigned int i = rover, goal;
598         struct rtable *rth, **rthp;
599         u64 mult;
600
601         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
602         if (ip_rt_gc_timeout > 1)
603                 do_div(mult, ip_rt_gc_timeout);
604         goal = (unsigned int)mult;
605         if (goal > rt_hash_mask)
606                 goal = rt_hash_mask + 1;
607         for (; goal > 0; goal--) {
608                 unsigned long tmo = ip_rt_gc_timeout;
609
610                 i = (i + 1) & rt_hash_mask;
611                 rthp = &rt_hash_table[i].chain;
612
613                 if (need_resched())
614                         cond_resched();
615
616                 if (*rthp == NULL)
617                         continue;
618                 spin_lock_bh(rt_hash_lock_addr(i));
619                 while ((rth = *rthp) != NULL) {
620                         if (rth->u.dst.expires) {
621                                 /* Entry is expired even if it is in use */
622                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
623                                         tmo >>= 1;
624                                         rthp = &rth->u.dst.rt_next;
625                                         continue;
626                                 }
627                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
628                                 tmo >>= 1;
629                                 rthp = &rth->u.dst.rt_next;
630                                 continue;
631                         }
632
633                         /* Cleanup aged off entries. */
634                         *rthp = rth->u.dst.rt_next;
635                         rt_free(rth);
636                 }
637                 spin_unlock_bh(rt_hash_lock_addr(i));
638         }
639         rover = i;
640 }
641
642 /*
643  * rt_worker_func() is run in process context.
644  * If a full flush was requested, it is performed here.
645  * Otherwise, we call rt_check_expire() to scan part of the hash table.
646  */
647 static void rt_worker_func(struct work_struct *work)
648 {
649         if (ip_rt_flush_expected) {
650                 ip_rt_flush_expected = 0;
651                 rt_do_flush(1);
652         } else
653                 rt_check_expire();
654         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
655 }
656
657 /* This can run from both BH and non-BH contexts, the latter
658  * in the case of a forced flush event.
659  */
660 static void rt_run_flush(unsigned long process_context)
661 {
662         rt_deadline = 0;
663
664         get_random_bytes(&rt_hash_rnd, 4);
665
666         rt_do_flush(process_context);
667 }
668
669 static DEFINE_SPINLOCK(rt_flush_lock);
670
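/*
 * rt_cache_flush() semantics: a negative delay means "use ip_rt_min_delay",
 * a zero (or already expired) delay flushes right away, and a positive delay
 * arms rt_flush_timer, bounded by rt_deadline so the flush happens no later
 * than ip_rt_max_delay after the first pending request.
 */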
671 void rt_cache_flush(int delay)
672 {
673         unsigned long now = jiffies;
674         int user_mode = !in_softirq();
675
676         if (delay < 0)
677                 delay = ip_rt_min_delay;
678
679         spin_lock_bh(&rt_flush_lock);
680
681         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
682                 long tmo = (long)(rt_deadline - now);
683
684                 /* If flush timer is already running
685                    and flush request is not immediate (delay > 0):
686
687                    if the deadline has not been reached, prolong the timer to "delay",
688                    otherwise fire it at the deadline time.
689                  */
690
691                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
692                         tmo = 0;
693
694                 if (delay > tmo)
695                         delay = tmo;
696         }
697
698         if (delay <= 0) {
699                 spin_unlock_bh(&rt_flush_lock);
700                 rt_run_flush(user_mode);
701                 return;
702         }
703
704         if (rt_deadline == 0)
705                 rt_deadline = now + ip_rt_max_delay;
706
707         mod_timer(&rt_flush_timer, now+delay);
708         spin_unlock_bh(&rt_flush_lock);
709 }
710
711 /*
712  * We change rt_hash_rnd and ask the next rt_worker_func() invocation
713  * to perform a flush in process context.
714  */
715 static void rt_secret_rebuild(unsigned long dummy)
716 {
717         get_random_bytes(&rt_hash_rnd, 4);
718         ip_rt_flush_expected = 1;
719         cancel_delayed_work(&expires_work);
720         schedule_delayed_work(&expires_work, HZ/10);
721         mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
722 }
723
724 /*
725    Short description of GC goals.
726
727    We want an algorithm that keeps the routing cache at some
728    equilibrium point, where the number of aged-off entries is
729    approximately equal to the number of newly generated ones.
730
731    The current expiration strength is the variable "expire".
732    We try to adjust it dynamically, so that when the network is
733    idle "expire" stays large enough to keep warm entries around,
734    and when load increases it shrinks to limit the cache size.
735  */
736
737 static int rt_garbage_collect(void)
738 {
739         static unsigned long expire = RT_GC_TIMEOUT;
740         static unsigned long last_gc;
741         static int rover;
742         static int equilibrium;
743         struct rtable *rth, **rthp;
744         unsigned long now = jiffies;
745         int goal;
746
747         /*
748          * Garbage collection is pretty expensive,
749          * do not run it too frequently.
750          */
751
752         RT_CACHE_STAT_INC(gc_total);
753
754         if (now - last_gc < ip_rt_gc_min_interval &&
755             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
756                 RT_CACHE_STAT_INC(gc_ignored);
757                 goto out;
758         }
759
760         /* Calculate the number of entries we want to expire now. */
761         goal = atomic_read(&ipv4_dst_ops.entries) -
762                 (ip_rt_gc_elasticity << rt_hash_log);
763         if (goal <= 0) {
764                 if (equilibrium < ipv4_dst_ops.gc_thresh)
765                         equilibrium = ipv4_dst_ops.gc_thresh;
766                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
767                 if (goal > 0) {
768                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
769                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
770                 }
771         } else {
772                 /* We are in a dangerous area. Try to reduce the cache really
773                  * aggressively.
774                  */
775                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
776                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
777         }
778
779         if (now - last_gc >= ip_rt_gc_min_interval)
780                 last_gc = now;
781
782         if (goal <= 0) {
783                 equilibrium += goal;
784                 goto work_done;
785         }
786
787         do {
788                 int i, k;
789
790                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
791                         unsigned long tmo = expire;
792
793                         k = (k + 1) & rt_hash_mask;
794                         rthp = &rt_hash_table[k].chain;
795                         spin_lock_bh(rt_hash_lock_addr(k));
796                         while ((rth = *rthp) != NULL) {
797                                 if (!rt_may_expire(rth, tmo, expire)) {
798                                         tmo >>= 1;
799                                         rthp = &rth->u.dst.rt_next;
800                                         continue;
801                                 }
802                                 *rthp = rth->u.dst.rt_next;
803                                 rt_free(rth);
804                                 goal--;
805                         }
806                         spin_unlock_bh(rt_hash_lock_addr(k));
807                         if (goal <= 0)
808                                 break;
809                 }
810                 rover = k;
811
812                 if (goal <= 0)
813                         goto work_done;
814
815                 /* Goal is not achieved. We stop the process if:
816
817                    - expire was reduced to zero (otherwise expire is halved),
818                    - the table is not full,
819                    - we are called from interrupt context,
820                    - the jiffies check trips; it is only a fallback/debug loop
821                      breaker, we will not spin here for a long time in any case.
822                  */
823
824                 RT_CACHE_STAT_INC(gc_goal_miss);
825
826                 if (expire == 0)
827                         break;
828
829                 expire >>= 1;
830 #if RT_CACHE_DEBUG >= 2
831                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
832                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
833 #endif
834
835                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
836                         goto out;
837         } while (!in_softirq() && time_before_eq(jiffies, now));
838
839         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
840                 goto out;
841         if (net_ratelimit())
842                 printk(KERN_WARNING "dst cache overflow\n");
843         RT_CACHE_STAT_INC(gc_dst_overflow);
844         return 1;
845
846 work_done:
847         expire += ip_rt_gc_min_interval;
848         if (expire > ip_rt_gc_timeout ||
849             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
850                 expire = ip_rt_gc_timeout;
851 #if RT_CACHE_DEBUG >= 2
852         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
853                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
854 #endif
855 out:    return 0;
856 }
857
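/*
 * rt_intern_hash() inserts @rt into cache chain @hash under the per-chain
 * lock.  If an entry with the same flow key already exists, it is moved to
 * the head of the chain and returned instead (the new route is dropped).
 * When the chain grows past ip_rt_gc_elasticity, the lowest-scoring
 * unreferenced entry is evicted.  Output and unicast-forwarding routes are
 * bound to a neighbour first; on -ENOBUFS, when not in softirq, the cache
 * is shrunk aggressively and the insert retried once.
 */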
858 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
859 {
860         struct rtable   *rth, **rthp;
861         unsigned long   now;
862         struct rtable *cand, **candp;
863         u32             min_score;
864         int             chain_length;
865         int attempts = !in_softirq();
866
867 restart:
868         chain_length = 0;
869         min_score = ~(u32)0;
870         cand = NULL;
871         candp = NULL;
872         now = jiffies;
873
874         rthp = &rt_hash_table[hash].chain;
875
876         spin_lock_bh(rt_hash_lock_addr(hash));
877         while ((rth = *rthp) != NULL) {
878                 if (compare_keys(&rth->fl, &rt->fl)) {
879                         /* Put it first */
880                         *rthp = rth->u.dst.rt_next;
881                         /*
882                          * Since lookup is lockfree, the deletion
883                          * must be visible to another weakly ordered CPU before
884                          * the insertion at the start of the hash chain.
885                          */
886                         rcu_assign_pointer(rth->u.dst.rt_next,
887                                            rt_hash_table[hash].chain);
888                         /*
889                          * Since lookup is lockfree, the update writes
890                          * must be ordered for consistency on SMP.
891                          */
892                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
893
894                         dst_use(&rth->u.dst, now);
895                         spin_unlock_bh(rt_hash_lock_addr(hash));
896
897                         rt_drop(rt);
898                         *rp = rth;
899                         return 0;
900                 }
901
902                 if (!atomic_read(&rth->u.dst.__refcnt)) {
903                         u32 score = rt_score(rth);
904
905                         if (score <= min_score) {
906                                 cand = rth;
907                                 candp = rthp;
908                                 min_score = score;
909                         }
910                 }
911
912                 chain_length++;
913
914                 rthp = &rth->u.dst.rt_next;
915         }
916
917         if (cand) {
918                 /* ip_rt_gc_elasticity used to be the average chain length;
919                  * when exceeded, gc becomes really aggressive.
920                  *
921                  * The second limit is less certain. At the moment it allows
922                  * only 2 entries per bucket. We will see.
923                  */
924                 if (chain_length > ip_rt_gc_elasticity) {
925                         *candp = cand->u.dst.rt_next;
926                         rt_free(cand);
927                 }
928         }
929
930         /* Try to bind the route to an ARP entry only if it is an output
931            route or a unicast forwarding path.
932          */
933         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
934                 int err = arp_bind_neighbour(&rt->u.dst);
935                 if (err) {
936                         spin_unlock_bh(rt_hash_lock_addr(hash));
937
938                         if (err != -ENOBUFS) {
939                                 rt_drop(rt);
940                                 return err;
941                         }
942
943                         /* Neighbour tables are full and nothing
944                            can be released. Try to shrink the route cache;
945                            it most likely holds some neighbour records.
946                          */
947                         if (attempts-- > 0) {
948                                 int saved_elasticity = ip_rt_gc_elasticity;
949                                 int saved_int = ip_rt_gc_min_interval;
950                                 ip_rt_gc_elasticity     = 1;
951                                 ip_rt_gc_min_interval   = 0;
952                                 rt_garbage_collect();
953                                 ip_rt_gc_min_interval   = saved_int;
954                                 ip_rt_gc_elasticity     = saved_elasticity;
955                                 goto restart;
956                         }
957
958                         if (net_ratelimit())
959                                 printk(KERN_WARNING "Neighbour table overflow.\n");
960                         rt_drop(rt);
961                         return -ENOBUFS;
962                 }
963         }
964
965         rt->u.dst.rt_next = rt_hash_table[hash].chain;
966 #if RT_CACHE_DEBUG >= 2
967         if (rt->u.dst.rt_next) {
968                 struct rtable *trt;
969                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
970                        NIPQUAD(rt->rt_dst));
971                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
972                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
973                 printk("\n");
974         }
975 #endif
976         rt_hash_table[hash].chain = rt;
977         spin_unlock_bh(rt_hash_lock_addr(hash));
978         *rp = rt;
979         return 0;
980 }
981
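/*
 * rt_bind_peer() attaches the inet_peer entry for rt_dst to the route.
 * Concurrent callers are resolved under rt_peer_lock: only the first one
 * installs its peer; the loser drops the extra reference via inet_putpeer().
 */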
982 void rt_bind_peer(struct rtable *rt, int create)
983 {
984         static DEFINE_SPINLOCK(rt_peer_lock);
985         struct inet_peer *peer;
986
987         peer = inet_getpeer(rt->rt_dst, create);
988
989         spin_lock_bh(&rt_peer_lock);
990         if (rt->peer == NULL) {
991                 rt->peer = peer;
992                 peer = NULL;
993         }
994         spin_unlock_bh(&rt_peer_lock);
995         if (peer)
996                 inet_putpeer(peer);
997 }
998
999 /*
1000  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1001  * we can still generate some output.
1002  * Random ID selection looks a bit dangerous because we have no chance of
1003  * selecting an ID that is unique within a reasonable period of time.
1004  * But a broken packet identifier may be better than no packet at all.
1005  */
1006 static void ip_select_fb_ident(struct iphdr *iph)
1007 {
1008         static DEFINE_SPINLOCK(ip_fb_id_lock);
1009         static u32 ip_fallback_id;
1010         u32 salt;
1011
1012         spin_lock_bh(&ip_fb_id_lock);
1013         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1014         iph->id = htons(salt & 0xFFFF);
1015         ip_fallback_id = salt;
1016         spin_unlock_bh(&ip_fb_id_lock);
1017 }
1018
1019 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1020 {
1021         struct rtable *rt = (struct rtable *) dst;
1022
1023         if (rt) {
1024                 if (rt->peer == NULL)
1025                         rt_bind_peer(rt, 1);
1026
1027                 /* If peer is attached to destination, it is never detached,
1028                    so we need not grab a lock to dereference it.
1029                  */
1030                 if (rt->peer) {
1031                         iph->id = htons(inet_getid(rt->peer, more));
1032                         return;
1033                 }
1034         } else
1035                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1036                        __builtin_return_address(0));
1037
1038         ip_select_fb_ident(iph);
1039 }
1040
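/*
 * rt_del() unlinks @rt from chain @hash under the chain lock, dropping both
 * the caller's reference (ip_rt_put) and, if the entry was still linked,
 * the cache's own reference via rt_free().
 */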
1041 static void rt_del(unsigned hash, struct rtable *rt)
1042 {
1043         struct rtable **rthp;
1044
1045         spin_lock_bh(rt_hash_lock_addr(hash));
1046         ip_rt_put(rt);
1047         for (rthp = &rt_hash_table[hash].chain; *rthp;
1048              rthp = &(*rthp)->u.dst.rt_next)
1049                 if (*rthp == rt) {
1050                         *rthp = rt->u.dst.rt_next;
1051                         rt_free(rt);
1052                         break;
1053                 }
1054         spin_unlock_bh(rt_hash_lock_addr(hash));
1055 }
1056
1057 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1058                     __be32 saddr, struct net_device *dev)
1059 {
1060         int i, k;
1061         struct in_device *in_dev = in_dev_get(dev);
1062         struct rtable *rth, **rthp;
1063         __be32  skeys[2] = { saddr, 0 };
1064         int  ikeys[2] = { dev->ifindex, 0 };
1065         struct netevent_redirect netevent;
1066
1067         if (!in_dev)
1068                 return;
1069
1070         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1071             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1072                 goto reject_redirect;
1073
1074         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1075                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1076                         goto reject_redirect;
1077                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1078                         goto reject_redirect;
1079         } else {
1080                 if (inet_addr_type(new_gw) != RTN_UNICAST)
1081                         goto reject_redirect;
1082         }
1083
1084         for (i = 0; i < 2; i++) {
1085                 for (k = 0; k < 2; k++) {
1086                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1087
1088                         rthp=&rt_hash_table[hash].chain;
1089
1090                         rcu_read_lock();
1091                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1092                                 struct rtable *rt;
1093
1094                                 if (rth->fl.fl4_dst != daddr ||
1095                                     rth->fl.fl4_src != skeys[i] ||
1096                                     rth->fl.oif != ikeys[k] ||
1097                                     rth->fl.iif != 0) {
1098                                         rthp = &rth->u.dst.rt_next;
1099                                         continue;
1100                                 }
1101
1102                                 if (rth->rt_dst != daddr ||
1103                                     rth->rt_src != saddr ||
1104                                     rth->u.dst.error ||
1105                                     rth->rt_gateway != old_gw ||
1106                                     rth->u.dst.dev != dev)
1107                                         break;
1108
1109                                 dst_hold(&rth->u.dst);
1110                                 rcu_read_unlock();
1111
1112                                 rt = dst_alloc(&ipv4_dst_ops);
1113                                 if (rt == NULL) {
1114                                         ip_rt_put(rth);
1115                                         in_dev_put(in_dev);
1116                                         return;
1117                                 }
1118
1119                                 /* Copy all the information. */
1120                                 *rt = *rth;
1121                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1122                                 rt->u.dst.__use         = 1;
1123                                 atomic_set(&rt->u.dst.__refcnt, 1);
1124                                 rt->u.dst.child         = NULL;
1125                                 if (rt->u.dst.dev)
1126                                         dev_hold(rt->u.dst.dev);
1127                                 if (rt->idev)
1128                                         in_dev_hold(rt->idev);
1129                                 rt->u.dst.obsolete      = 0;
1130                                 rt->u.dst.lastuse       = jiffies;
1131                                 rt->u.dst.path          = &rt->u.dst;
1132                                 rt->u.dst.neighbour     = NULL;
1133                                 rt->u.dst.hh            = NULL;
1134                                 rt->u.dst.xfrm          = NULL;
1135
1136                                 rt->rt_flags            |= RTCF_REDIRECTED;
1137
1138                                 /* Gateway is different ... */
1139                                 rt->rt_gateway          = new_gw;
1140
1141                                 /* Redirect received -> path was valid */
1142                                 dst_confirm(&rth->u.dst);
1143
1144                                 if (rt->peer)
1145                                         atomic_inc(&rt->peer->refcnt);
1146
1147                                 if (arp_bind_neighbour(&rt->u.dst) ||
1148                                     !(rt->u.dst.neighbour->nud_state &
1149                                             NUD_VALID)) {
1150                                         if (rt->u.dst.neighbour)
1151                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1152                                         ip_rt_put(rth);
1153                                         rt_drop(rt);
1154                                         goto do_next;
1155                                 }
1156
1157                                 netevent.old = &rth->u.dst;
1158                                 netevent.new = &rt->u.dst;
1159                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1160                                                         &netevent);
1161
1162                                 rt_del(hash, rth);
1163                                 if (!rt_intern_hash(hash, rt, &rt))
1164                                         ip_rt_put(rt);
1165                                 goto do_next;
1166                         }
1167                         rcu_read_unlock();
1168                 do_next:
1169                         ;
1170                 }
1171         }
1172         in_dev_put(in_dev);
1173         return;
1174
1175 reject_redirect:
1176 #ifdef CONFIG_IP_ROUTE_VERBOSE
1177         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1178                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1179                         "%u.%u.%u.%u ignored.\n"
1180                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1181                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1182                        NIPQUAD(saddr), NIPQUAD(daddr));
1183 #endif
1184         in_dev_put(in_dev);
1185 }
1186
1187 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1188 {
1189         struct rtable *rt = (struct rtable*)dst;
1190         struct dst_entry *ret = dst;
1191
1192         if (rt) {
1193                 if (dst->obsolete) {
1194                         ip_rt_put(rt);
1195                         ret = NULL;
1196                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1197                            rt->u.dst.expires) {
1198                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1199                                                 rt->fl.oif);
1200 #if RT_CACHE_DEBUG >= 1
1201                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1202                                           "%u.%u.%u.%u/%02x dropped\n",
1203                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1204 #endif
1205                         rt_del(hash, rt);
1206                         ret = NULL;
1207                 }
1208         }
1209         return ret;
1210 }
1211
1212 /*
1213  * Algorithm:
1214  *      1. The first ip_rt_redirect_number redirects are sent
1215  *         with exponential backoff, then we stop sending them at all,
1216  *         assuming that the host ignores our redirects.
1217  *      2. If we did not see packets requiring redirects
1218  *         during ip_rt_redirect_silence, we assume that the host
1219  *         has forgotten the redirected route and start sending redirects again.
1220  *
1221  * This algorithm is much cheaper and more intelligent than dumb load limiting
1222  * in icmp.c.
1223  *
1224  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1225  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1226  */
1227
1228 void ip_rt_send_redirect(struct sk_buff *skb)
1229 {
1230         struct rtable *rt = (struct rtable*)skb->dst;
1231         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1232
1233         if (!in_dev)
1234                 return;
1235
1236         if (!IN_DEV_TX_REDIRECTS(in_dev))
1237                 goto out;
1238
1239         /* No redirected packets during ip_rt_redirect_silence;
1240          * reset the algorithm.
1241          */
1242         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1243                 rt->u.dst.rate_tokens = 0;
1244
1245         /* Too many ignored redirects; do not send anything.
1246          * Set u.dst.rate_last to the last seen redirected packet.
1247          */
1248         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1249                 rt->u.dst.rate_last = jiffies;
1250                 goto out;
1251         }
1252
1253         /* Check for load limit; set rate_last to the latest sent
1254          * redirect.
1255          */
1256         if (rt->u.dst.rate_tokens == 0 ||
1257             time_after(jiffies,
1258                        (rt->u.dst.rate_last +
1259                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1260                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1261                 rt->u.dst.rate_last = jiffies;
1262                 ++rt->u.dst.rate_tokens;
1263 #ifdef CONFIG_IP_ROUTE_VERBOSE
1264                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1265                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1266                     net_ratelimit())
1267                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1268                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1269                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1270                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1271 #endif
1272         }
1273 out:
1274         in_dev_put(in_dev);
1275 }
1276
1277 static int ip_error(struct sk_buff *skb)
1278 {
1279         struct rtable *rt = (struct rtable*)skb->dst;
1280         unsigned long now;
1281         int code;
1282
1283         switch (rt->u.dst.error) {
1284                 case EINVAL:
1285                 default:
1286                         goto out;
1287                 case EHOSTUNREACH:
1288                         code = ICMP_HOST_UNREACH;
1289                         break;
1290                 case ENETUNREACH:
1291                         code = ICMP_NET_UNREACH;
1292                         IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1293                         break;
1294                 case EACCES:
1295                         code = ICMP_PKT_FILTERED;
1296                         break;
1297         }
1298
1299         now = jiffies;
1300         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1301         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1302                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1303         rt->u.dst.rate_last = now;
1304         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1305                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1306                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1307         }
1308
1309 out:    kfree_skb(skb);
1310         return 0;
1311 }
1312
1313 /*
1314  *      The last two values are not from the RFC but
1315  *      are needed for AMPRnet AX.25 paths.
1316  */
1317
1318 static const unsigned short mtu_plateau[] =
1319 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1320
1321 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1322 {
1323         int i;
1324
1325         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1326                 if (old_mtu > mtu_plateau[i])
1327                         return mtu_plateau[i];
1328         return 68;
1329 }
1330
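/*
 * ip_rt_frag_needed() handles an incoming ICMP "fragmentation needed" for
 * the addresses in @iph: matching cached output routes get their MTU metric
 * lowered to @new_mtu (or a plateau guess when the advertised value is
 * bogus), clamped to ip_rt_min_pmtu with the metric locked when the clamp
 * applies, and set to expire after ip_rt_mtu_expires.  It returns the
 * estimated MTU, or @new_mtu if no route was updated.
 */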
1331 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1332 {
1333         int i;
1334         unsigned short old_mtu = ntohs(iph->tot_len);
1335         struct rtable *rth;
1336         __be32  skeys[2] = { iph->saddr, 0, };
1337         __be32  daddr = iph->daddr;
1338         unsigned short est_mtu = 0;
1339
1340         if (ipv4_config.no_pmtu_disc)
1341                 return 0;
1342
1343         for (i = 0; i < 2; i++) {
1344                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1345
1346                 rcu_read_lock();
1347                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1348                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1349                         if (rth->fl.fl4_dst == daddr &&
1350                             rth->fl.fl4_src == skeys[i] &&
1351                             rth->rt_dst  == daddr &&
1352                             rth->rt_src  == iph->saddr &&
1353                             rth->fl.iif == 0 &&
1354                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1355                                 unsigned short mtu = new_mtu;
1356
1357                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1358
1359                                         /* BSD 4.2 compatibility hack :-( */
1360                                         if (mtu == 0 &&
1361                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1362                                             old_mtu >= 68 + (iph->ihl << 2))
1363                                                 old_mtu -= iph->ihl << 2;
1364
1365                                         mtu = guess_mtu(old_mtu);
1366                                 }
1367                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1368                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1369                                                 dst_confirm(&rth->u.dst);
1370                                                 if (mtu < ip_rt_min_pmtu) {
1371                                                         mtu = ip_rt_min_pmtu;
1372                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1373                                                                 (1 << RTAX_MTU);
1374                                                 }
1375                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1376                                                 dst_set_expires(&rth->u.dst,
1377                                                         ip_rt_mtu_expires);
1378                                         }
1379                                         est_mtu = mtu;
1380                                 }
1381                         }
1382                 }
1383                 rcu_read_unlock();
1384         }
1385         return est_mtu ? : new_mtu;
1386 }
1387
1388 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1389 {
1390         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1391             !(dst_metric_locked(dst, RTAX_MTU))) {
1392                 if (mtu < ip_rt_min_pmtu) {
1393                         mtu = ip_rt_min_pmtu;
1394                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1395                 }
1396                 dst->metrics[RTAX_MTU-1] = mtu;
1397                 dst_set_expires(dst, ip_rt_mtu_expires);
1398                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1399         }
1400 }
1401
1402 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1403 {
1404         return NULL;
1405 }
1406
1407 static void ipv4_dst_destroy(struct dst_entry *dst)
1408 {
1409         struct rtable *rt = (struct rtable *) dst;
1410         struct inet_peer *peer = rt->peer;
1411         struct in_device *idev = rt->idev;
1412
1413         if (peer) {
1414                 rt->peer = NULL;
1415                 inet_putpeer(peer);
1416         }
1417
1418         if (idev) {
1419                 rt->idev = NULL;
1420                 in_dev_put(idev);
1421         }
1422 }
1423
1424 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1425                             int how)
1426 {
1427         struct rtable *rt = (struct rtable *) dst;
1428         struct in_device *idev = rt->idev;
1429         if (dev != init_net.loopback_dev && idev && idev->dev == dev) {
1430                 struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
1431                 if (loopback_idev) {
1432                         rt->idev = loopback_idev;
1433                         in_dev_put(idev);
1434                 }
1435         }
1436 }
1437
1438 static void ipv4_link_failure(struct sk_buff *skb)
1439 {
1440         struct rtable *rt;
1441
1442         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1443
1444         rt = (struct rtable *) skb->dst;
1445         if (rt)
1446                 dst_set_expires(&rt->u.dst, 0);
1447 }
1448
1449 static int ip_rt_bug(struct sk_buff *skb)
1450 {
1451         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1452                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1453                 skb->dev ? skb->dev->name : "?");
1454         kfree_skb(skb);
1455         return 0;
1456 }
1457
1458 /*
1459    We do not cache the source address of the outgoing interface,
1460    because it is used only by the IP RR, TS and SRR options,
1461    so it is out of the fast path.
1462
1463    BTW remember: "addr" is not guaranteed to be aligned
1464    in IP options!
1465  */
1466
1467 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1468 {
1469         __be32 src;
1470         struct fib_result res;
1471
1472         if (rt->fl.iif == 0)
1473                 src = rt->rt_src;
1474         else if (fib_lookup(&rt->fl, &res) == 0) {
1475                 src = FIB_RES_PREFSRC(res);
1476                 fib_res_put(&res);
1477         } else
1478                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1479                                         RT_SCOPE_UNIVERSE);
1480         memcpy(addr, &src, 4);
1481 }
1482
1483 #ifdef CONFIG_NET_CLS_ROUTE
1484 static void set_class_tag(struct rtable *rt, u32 tag)
1485 {
1486         if (!(rt->u.dst.tclassid & 0xFFFF))
1487                 rt->u.dst.tclassid |= tag & 0xFFFF;
1488         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1489                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1490 }
1491 #endif
1492
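/*
 * Fill in the nexthop-derived fields of a new routing cache entry:
 * copy the gateway and metrics from the FIB result, fall back to the
 * device MTU when the route has none, and apply defaults and limits
 * for the hoplimit and advertised MSS metrics.  The routing class tag
 * is also set here when CONFIG_NET_CLS_ROUTE is enabled, and rt_type
 * is taken from the FIB result.
 */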
1493 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1494 {
1495         struct fib_info *fi = res->fi;
1496
1497         if (fi) {
1498                 if (FIB_RES_GW(*res) &&
1499                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1500                         rt->rt_gateway = FIB_RES_GW(*res);
1501                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1502                        sizeof(rt->u.dst.metrics));
1503                 if (fi->fib_mtu == 0) {
1504                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1505                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1506                             rt->rt_gateway != rt->rt_dst &&
1507                             rt->u.dst.dev->mtu > 576)
1508                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1509                 }
1510 #ifdef CONFIG_NET_CLS_ROUTE
1511                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1512 #endif
1513         } else
1514                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1515
1516         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1517                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1518         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1519                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1520         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1521                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1522                                        ip_rt_min_advmss);
1523         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1524                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1525
1526 #ifdef CONFIG_NET_CLS_ROUTE
1527 #ifdef CONFIG_IP_MULTIPLE_TABLES
1528         set_class_tag(rt, fib_rules_tclass(res));
1529 #endif
1530         set_class_tag(rt, itag);
1531 #endif
1532         rt->rt_type = res->type;
1533 }
1534
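/*
 * Build a routing cache entry for a received multicast packet.  After
 * the source address passes the usual sanity checks, the new entry is
 * set up for local delivery (or multicast forwarding when mrouting is
 * configured) and inserted into the cache hashed on
 * (daddr, saddr, incoming ifindex).
 */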
1535 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1536                                 u8 tos, struct net_device *dev, int our)
1537 {
1538         unsigned hash;
1539         struct rtable *rth;
1540         __be32 spec_dst;
1541         struct in_device *in_dev = in_dev_get(dev);
1542         u32 itag = 0;
1543
1544         /* Primary sanity checks. */
1545
1546         if (in_dev == NULL)
1547                 return -EINVAL;
1548
1549         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1550             skb->protocol != htons(ETH_P_IP))
1551                 goto e_inval;
1552
1553         if (ZERONET(saddr)) {
1554                 if (!LOCAL_MCAST(daddr))
1555                         goto e_inval;
1556                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1557         } else if (fib_validate_source(saddr, 0, tos, 0,
1558                                         dev, &spec_dst, &itag) < 0)
1559                 goto e_inval;
1560
1561         rth = dst_alloc(&ipv4_dst_ops);
1562         if (!rth)
1563                 goto e_nobufs;
1564
1565         rth->u.dst.output= ip_rt_bug;
1566
1567         atomic_set(&rth->u.dst.__refcnt, 1);
1568         rth->u.dst.flags= DST_HOST;
1569         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1570                 rth->u.dst.flags |= DST_NOPOLICY;
1571         rth->fl.fl4_dst = daddr;
1572         rth->rt_dst     = daddr;
1573         rth->fl.fl4_tos = tos;
1574         rth->fl.mark    = skb->mark;
1575         rth->fl.fl4_src = saddr;
1576         rth->rt_src     = saddr;
1577 #ifdef CONFIG_NET_CLS_ROUTE
1578         rth->u.dst.tclassid = itag;
1579 #endif
1580         rth->rt_iif     =
1581         rth->fl.iif     = dev->ifindex;
1582         rth->u.dst.dev  = init_net.loopback_dev;
1583         dev_hold(rth->u.dst.dev);
1584         rth->idev       = in_dev_get(rth->u.dst.dev);
1585         rth->fl.oif     = 0;
1586         rth->rt_gateway = daddr;
1587         rth->rt_spec_dst= spec_dst;
1588         rth->rt_type    = RTN_MULTICAST;
1589         rth->rt_flags   = RTCF_MULTICAST;
1590         if (our) {
1591                 rth->u.dst.input= ip_local_deliver;
1592                 rth->rt_flags |= RTCF_LOCAL;
1593         }
1594
1595 #ifdef CONFIG_IP_MROUTE
1596         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1597                 rth->u.dst.input = ip_mr_input;
1598 #endif
1599         RT_CACHE_STAT_INC(in_slow_mc);
1600
1601         in_dev_put(in_dev);
1602         hash = rt_hash(daddr, saddr, dev->ifindex);
1603         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1604
1605 e_nobufs:
1606         in_dev_put(in_dev);
1607         return -ENOBUFS;
1608
1609 e_inval:
1610         in_dev_put(in_dev);
1611         return -EINVAL;
1612 }
1613
1614
1615 static void ip_handle_martian_source(struct net_device *dev,
1616                                      struct in_device *in_dev,
1617                                      struct sk_buff *skb,
1618                                      __be32 daddr,
1619                                      __be32 saddr)
1620 {
1621         RT_CACHE_STAT_INC(in_martian_src);
1622 #ifdef CONFIG_IP_ROUTE_VERBOSE
1623         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1624                 /*
1625                  *      RFC1812 recommendation: if the source is martian,
1626                  *      the only hint is the MAC header.
1627                  */
1628                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1629                         "%u.%u.%u.%u, on dev %s\n",
1630                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1631                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1632                         int i;
1633                         const unsigned char *p = skb_mac_header(skb);
1634                         printk(KERN_WARNING "ll header: ");
1635                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1636                                 printk("%02x", *p);
1637                                 if (i < (dev->hard_header_len - 1))
1638                                         printk(":");
1639                         }
1640                         printk("\n");
1641                 }
1642         }
1643 #endif
1644 }
1645
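/*
 * Create the routing cache entry for a forwarded packet: validate the
 * source address against the FIB (logging martians), decide whether a
 * redirect should be sent, and allocate and fill the dst with
 * ip_forward()/ip_output() as its input/output handlers.
 */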
1646 static inline int __mkroute_input(struct sk_buff *skb,
1647                                   struct fib_result* res,
1648                                   struct in_device *in_dev,
1649                                   __be32 daddr, __be32 saddr, u32 tos,
1650                                   struct rtable **result)
1651 {
1652
1653         struct rtable *rth;
1654         int err;
1655         struct in_device *out_dev;
1656         unsigned flags = 0;
1657         __be32 spec_dst;
1658         u32 itag;
1659
1660         /* get a working reference to the output device */
1661         out_dev = in_dev_get(FIB_RES_DEV(*res));
1662         if (out_dev == NULL) {
1663                 if (net_ratelimit())
1664                         printk(KERN_CRIT "Bug in ip_route_input" \
1665                                "_slow(). Please, report\n");
1666                 return -EINVAL;
1667         }
1668
1669
1670         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1671                                   in_dev->dev, &spec_dst, &itag);
1672         if (err < 0) {
1673                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1674                                          saddr);
1675
1676                 err = -EINVAL;
1677                 goto cleanup;
1678         }
1679
1680         if (err)
1681                 flags |= RTCF_DIRECTSRC;
1682
1683         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1684             (IN_DEV_SHARED_MEDIA(out_dev) ||
1685              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1686                 flags |= RTCF_DOREDIRECT;
1687
1688         if (skb->protocol != htons(ETH_P_IP)) {
1689                 /* Not IP (i.e. ARP). Do not create a route if it is
1690                  * invalid for proxy arp. DNAT routes are always valid.
1691                  */
1692                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1693                         err = -EINVAL;
1694                         goto cleanup;
1695                 }
1696         }
1697
1698
1699         rth = dst_alloc(&ipv4_dst_ops);
1700         if (!rth) {
1701                 err = -ENOBUFS;
1702                 goto cleanup;
1703         }
1704
1705         atomic_set(&rth->u.dst.__refcnt, 1);
1706         rth->u.dst.flags= DST_HOST;
1707         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1708                 rth->u.dst.flags |= DST_NOPOLICY;
1709         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1710                 rth->u.dst.flags |= DST_NOXFRM;
1711         rth->fl.fl4_dst = daddr;
1712         rth->rt_dst     = daddr;
1713         rth->fl.fl4_tos = tos;
1714         rth->fl.mark    = skb->mark;
1715         rth->fl.fl4_src = saddr;
1716         rth->rt_src     = saddr;
1717         rth->rt_gateway = daddr;
1718         rth->rt_iif     =
1719                 rth->fl.iif     = in_dev->dev->ifindex;
1720         rth->u.dst.dev  = (out_dev)->dev;
1721         dev_hold(rth->u.dst.dev);
1722         rth->idev       = in_dev_get(rth->u.dst.dev);
1723         rth->fl.oif     = 0;
1724         rth->rt_spec_dst= spec_dst;
1725
1726         rth->u.dst.input = ip_forward;
1727         rth->u.dst.output = ip_output;
1728
1729         rt_set_nexthop(rth, res, itag);
1730
1731         rth->rt_flags = flags;
1732
1733         *result = rth;
1734         err = 0;
1735  cleanup:
1736         /* release the working reference to the output device */
1737         in_dev_put(out_dev);
1738         return err;
1739 }
1740
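/*
 * Wrapper around __mkroute_input(): pick a nexthop when the route is
 * multipath, build the cache entry and intern it into the hash chain
 * for (daddr, saddr, iif), attaching the result to skb->dst.
 */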
1741 static inline int ip_mkroute_input(struct sk_buff *skb,
1742                                    struct fib_result* res,
1743                                    const struct flowi *fl,
1744                                    struct in_device *in_dev,
1745                                    __be32 daddr, __be32 saddr, u32 tos)
1746 {
1747         struct rtable* rth = NULL;
1748         int err;
1749         unsigned hash;
1750
1751 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1752         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1753                 fib_select_multipath(fl, res);
1754 #endif
1755
1756         /* create a routing cache entry */
1757         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1758         if (err)
1759                 return err;
1760
1761         /* put it into the cache */
1762         hash = rt_hash(daddr, saddr, fl->iif);
1763         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1764 }
1765
1766 /*
1767  *      NOTE. We drop all packets that have local source
1768  *      addresses, because every properly looped-back packet
1769  *      must already have the correct destination attached by the output routine.
1770  *
1771  *      This approach solves two big problems:
1772  *      1. Non-simplex devices are handled properly.
1773  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1774  */
1775
1776 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1777                                u8 tos, struct net_device *dev)
1778 {
1779         struct fib_result res;
1780         struct in_device *in_dev = in_dev_get(dev);
1781         struct flowi fl = { .nl_u = { .ip4_u =
1782                                       { .daddr = daddr,
1783                                         .saddr = saddr,
1784                                         .tos = tos,
1785                                         .scope = RT_SCOPE_UNIVERSE,
1786                                       } },
1787                             .mark = skb->mark,
1788                             .iif = dev->ifindex };
1789         unsigned        flags = 0;
1790         u32             itag = 0;
1791         struct rtable * rth;
1792         unsigned        hash;
1793         __be32          spec_dst;
1794         int             err = -EINVAL;
1795         int             free_res = 0;
1796
1797         /* IP on this device is disabled. */
1798
1799         if (!in_dev)
1800                 goto out;
1801
1802         /* Check for the weirdest martians, which cannot be detected
1803            by fib_lookup.
1804          */
1805
1806         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1807                 goto martian_source;
1808
1809         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1810                 goto brd_input;
1811
1812         /* Accept zero addresses only for limited broadcast;
1813          * I do not even know whether to fix this or not. Waiting for complaints :-)
1814          */
1815         if (ZERONET(saddr))
1816                 goto martian_source;
1817
1818         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1819                 goto martian_destination;
1820
1821         /*
1822          *      Now we are ready to route packet.
1823          */
1824         if ((err = fib_lookup(&fl, &res)) != 0) {
1825                 if (!IN_DEV_FORWARD(in_dev))
1826                         goto e_hostunreach;
1827                 goto no_route;
1828         }
1829         free_res = 1;
1830
1831         RT_CACHE_STAT_INC(in_slow_tot);
1832
1833         if (res.type == RTN_BROADCAST)
1834                 goto brd_input;
1835
1836         if (res.type == RTN_LOCAL) {
1837                 int result;
1838                 result = fib_validate_source(saddr, daddr, tos,
1839                                              init_net.loopback_dev->ifindex,
1840                                              dev, &spec_dst, &itag);
1841                 if (result < 0)
1842                         goto martian_source;
1843                 if (result)
1844                         flags |= RTCF_DIRECTSRC;
1845                 spec_dst = daddr;
1846                 goto local_input;
1847         }
1848
1849         if (!IN_DEV_FORWARD(in_dev))
1850                 goto e_hostunreach;
1851         if (res.type != RTN_UNICAST)
1852                 goto martian_destination;
1853
1854         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1855 done:
1856         in_dev_put(in_dev);
1857         if (free_res)
1858                 fib_res_put(&res);
1859 out:    return err;
1860
1861 brd_input:
1862         if (skb->protocol != htons(ETH_P_IP))
1863                 goto e_inval;
1864
1865         if (ZERONET(saddr))
1866                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1867         else {
1868                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1869                                           &itag);
1870                 if (err < 0)
1871                         goto martian_source;
1872                 if (err)
1873                         flags |= RTCF_DIRECTSRC;
1874         }
1875         flags |= RTCF_BROADCAST;
1876         res.type = RTN_BROADCAST;
1877         RT_CACHE_STAT_INC(in_brd);
1878
1879 local_input:
1880         rth = dst_alloc(&ipv4_dst_ops);
1881         if (!rth)
1882                 goto e_nobufs;
1883
1884         rth->u.dst.output= ip_rt_bug;
1885
1886         atomic_set(&rth->u.dst.__refcnt, 1);
1887         rth->u.dst.flags= DST_HOST;
1888         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1889                 rth->u.dst.flags |= DST_NOPOLICY;
1890         rth->fl.fl4_dst = daddr;
1891         rth->rt_dst     = daddr;
1892         rth->fl.fl4_tos = tos;
1893         rth->fl.mark    = skb->mark;
1894         rth->fl.fl4_src = saddr;
1895         rth->rt_src     = saddr;
1896 #ifdef CONFIG_NET_CLS_ROUTE
1897         rth->u.dst.tclassid = itag;
1898 #endif
1899         rth->rt_iif     =
1900         rth->fl.iif     = dev->ifindex;
1901         rth->u.dst.dev  = init_net.loopback_dev;
1902         dev_hold(rth->u.dst.dev);
1903         rth->idev       = in_dev_get(rth->u.dst.dev);
1904         rth->rt_gateway = daddr;
1905         rth->rt_spec_dst= spec_dst;
1906         rth->u.dst.input= ip_local_deliver;
1907         rth->rt_flags   = flags|RTCF_LOCAL;
1908         if (res.type == RTN_UNREACHABLE) {
1909                 rth->u.dst.input= ip_error;
1910                 rth->u.dst.error= -err;
1911                 rth->rt_flags   &= ~RTCF_LOCAL;
1912         }
1913         rth->rt_type    = res.type;
1914         hash = rt_hash(daddr, saddr, fl.iif);
1915         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1916         goto done;
1917
1918 no_route:
1919         RT_CACHE_STAT_INC(in_no_route);
1920         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1921         res.type = RTN_UNREACHABLE;
1922         if (err == -ESRCH)
1923                 err = -ENETUNREACH;
1924         goto local_input;
1925
1926         /*
1927          *      Do not cache martian addresses: they should be logged (RFC1812)
1928          */
1929 martian_destination:
1930         RT_CACHE_STAT_INC(in_martian_dst);
1931 #ifdef CONFIG_IP_ROUTE_VERBOSE
1932         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1933                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1934                         "%u.%u.%u.%u, dev %s\n",
1935                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1936 #endif
1937
1938 e_hostunreach:
1939         err = -EHOSTUNREACH;
1940         goto done;
1941
1942 e_inval:
1943         err = -EINVAL;
1944         goto done;
1945
1946 e_nobufs:
1947         err = -ENOBUFS;
1948         goto done;
1949
1950 martian_source:
1951         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1952         goto e_inval;
1953 }
1954
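/*
 * Input routing fast path.  Look the flow up in the route cache first,
 * keyed on (daddr, saddr, iif, oif == 0, mark, tos); on a hit the
 * cached dst is attached to the skb and we are done.  Multicast
 * destinations are special-cased below; everything else falls through
 * to ip_route_input_slow().
 */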
1955 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1956                    u8 tos, struct net_device *dev)
1957 {
1958         struct rtable * rth;
1959         unsigned        hash;
1960         int iif = dev->ifindex;
1961
1962         tos &= IPTOS_RT_MASK;
1963         hash = rt_hash(daddr, saddr, iif);
1964
1965         rcu_read_lock();
1966         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1967              rth = rcu_dereference(rth->u.dst.rt_next)) {
1968                 if (rth->fl.fl4_dst == daddr &&
1969                     rth->fl.fl4_src == saddr &&
1970                     rth->fl.iif == iif &&
1971                     rth->fl.oif == 0 &&
1972                     rth->fl.mark == skb->mark &&
1973                     rth->fl.fl4_tos == tos) {
1974                         dst_use(&rth->u.dst, jiffies);
1975                         RT_CACHE_STAT_INC(in_hit);
1976                         rcu_read_unlock();
1977                         skb->dst = (struct dst_entry*)rth;
1978                         return 0;
1979                 }
1980                 RT_CACHE_STAT_INC(in_hlist_search);
1981         }
1982         rcu_read_unlock();
1983
1984         /* Multicast recognition logic has been moved from the route cache to here.
1985            The problem was that too many Ethernet cards have broken/missing
1986            hardware multicast filters :-( As a result, a host on a multicast
1987            network acquires a lot of useless route cache entries, e.g. from
1988            SDR messages from all over the world. Now we try to get rid of them.
1989            Really, provided the software IP multicast filter is organized
1990            reasonably (at least, hashed), it does not cause a slowdown
1991            compared with route cache reject entries.
1992            Note that multicast routers are not affected, because a
1993            route cache entry is created for them eventually.
1994          */
1995         if (MULTICAST(daddr)) {
1996                 struct in_device *in_dev;
1997
1998                 rcu_read_lock();
1999                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2000                         int our = ip_check_mc(in_dev, daddr, saddr,
2001                                 ip_hdr(skb)->protocol);
2002                         if (our
2003 #ifdef CONFIG_IP_MROUTE
2004                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2005 #endif
2006                             ) {
2007                                 rcu_read_unlock();
2008                                 return ip_route_input_mc(skb, daddr, saddr,
2009                                                          tos, dev, our);
2010                         }
2011                 }
2012                 rcu_read_unlock();
2013                 return -EINVAL;
2014         }
2015         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2016 }
2017
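/*
 * Create a routing cache entry for locally generated traffic.  The
 * route type (local/broadcast/multicast/unicast) determines which
 * RTCF_* flags are set and which input/output handlers the dst gets;
 * nexthop information from the FIB result is applied at the end via
 * rt_set_nexthop().
 */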
2018 static inline int __mkroute_output(struct rtable **result,
2019                                    struct fib_result* res,
2020                                    const struct flowi *fl,
2021                                    const struct flowi *oldflp,
2022                                    struct net_device *dev_out,
2023                                    unsigned flags)
2024 {
2025         struct rtable *rth;
2026         struct in_device *in_dev;
2027         u32 tos = RT_FL_TOS(oldflp);
2028         int err = 0;
2029
2030         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2031                 return -EINVAL;
2032
2033         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2034                 res->type = RTN_BROADCAST;
2035         else if (MULTICAST(fl->fl4_dst))
2036                 res->type = RTN_MULTICAST;
2037         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2038                 return -EINVAL;
2039
2040         if (dev_out->flags & IFF_LOOPBACK)
2041                 flags |= RTCF_LOCAL;
2042
2043         /* get work reference to inet device */
2044         in_dev = in_dev_get(dev_out);
2045         if (!in_dev)
2046                 return -EINVAL;
2047
2048         if (res->type == RTN_BROADCAST) {
2049                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2050                 if (res->fi) {
2051                         fib_info_put(res->fi);
2052                         res->fi = NULL;
2053                 }
2054         } else if (res->type == RTN_MULTICAST) {
2055                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2056                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2057                                  oldflp->proto))
2058                         flags &= ~RTCF_LOCAL;
2059                 /* If a multicast route does not exist, use the
2060                    default one, but do not gateway in this case.
2061                    Yes, it is a hack.
2062                  */
2063                 if (res->fi && res->prefixlen < 4) {
2064                         fib_info_put(res->fi);
2065                         res->fi = NULL;
2066                 }
2067         }
2068
2069
2070         rth = dst_alloc(&ipv4_dst_ops);
2071         if (!rth) {
2072                 err = -ENOBUFS;
2073                 goto cleanup;
2074         }
2075
2076         atomic_set(&rth->u.dst.__refcnt, 1);
2077         rth->u.dst.flags= DST_HOST;
2078         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2079                 rth->u.dst.flags |= DST_NOXFRM;
2080         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2081                 rth->u.dst.flags |= DST_NOPOLICY;
2082
2083         rth->fl.fl4_dst = oldflp->fl4_dst;
2084         rth->fl.fl4_tos = tos;
2085         rth->fl.fl4_src = oldflp->fl4_src;
2086         rth->fl.oif     = oldflp->oif;
2087         rth->fl.mark    = oldflp->mark;
2088         rth->rt_dst     = fl->fl4_dst;
2089         rth->rt_src     = fl->fl4_src;
2090         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2091         /* get references to the devices that are to be held by the routing
2092            cache entry */
2093         rth->u.dst.dev  = dev_out;
2094         dev_hold(dev_out);
2095         rth->idev       = in_dev_get(dev_out);
2096         rth->rt_gateway = fl->fl4_dst;
2097         rth->rt_spec_dst= fl->fl4_src;
2098
2099         rth->u.dst.output=ip_output;
2100
2101         RT_CACHE_STAT_INC(out_slow_tot);
2102
2103         if (flags & RTCF_LOCAL) {
2104                 rth->u.dst.input = ip_local_deliver;
2105                 rth->rt_spec_dst = fl->fl4_dst;
2106         }
2107         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2108                 rth->rt_spec_dst = fl->fl4_src;
2109                 if (flags & RTCF_LOCAL &&
2110                     !(dev_out->flags & IFF_LOOPBACK)) {
2111                         rth->u.dst.output = ip_mc_output;
2112                         RT_CACHE_STAT_INC(out_slow_mc);
2113                 }
2114 #ifdef CONFIG_IP_MROUTE
2115                 if (res->type == RTN_MULTICAST) {
2116                         if (IN_DEV_MFORWARD(in_dev) &&
2117                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2118                                 rth->u.dst.input = ip_mr_input;
2119                                 rth->u.dst.output = ip_mc_output;
2120                         }
2121                 }
2122 #endif
2123         }
2124
2125         rt_set_nexthop(rth, res, 0);
2126
2127         rth->rt_flags = flags;
2128
2129         *result = rth;
2130  cleanup:
2131         /* release work reference to inet device */
2132         in_dev_put(in_dev);
2133
2134         return err;
2135 }
2136
2137 static inline int ip_mkroute_output(struct rtable **rp,
2138                                     struct fib_result* res,
2139                                     const struct flowi *fl,
2140                                     const struct flowi *oldflp,
2141                                     struct net_device *dev_out,
2142                                     unsigned flags)
2143 {
2144         struct rtable *rth = NULL;
2145         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2146         unsigned hash;
2147         if (err == 0) {
2148                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2149                 err = rt_intern_hash(hash, rth, rp);
2150         }
2151
2152         return err;
2153 }
2154
2155 /*
2156  * Major route resolver routine.
2157  */
2158
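/*
 * Sanity-check and resolve the source address and output device from
 * the flow key, consult the FIB (with on-link and loopback special
 * cases), then hand off to ip_mkroute_output() to build and cache the
 * entry.
 */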
2159 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2160 {
2161         u32 tos = RT_FL_TOS(oldflp);
2162         struct flowi fl = { .nl_u = { .ip4_u =
2163                                       { .daddr = oldflp->fl4_dst,
2164                                         .saddr = oldflp->fl4_src,
2165                                         .tos = tos & IPTOS_RT_MASK,
2166                                         .scope = ((tos & RTO_ONLINK) ?
2167                                                   RT_SCOPE_LINK :
2168                                                   RT_SCOPE_UNIVERSE),
2169                                       } },
2170                             .mark = oldflp->mark,
2171                             .iif = init_net.loopback_dev->ifindex,
2172                             .oif = oldflp->oif };
2173         struct fib_result res;
2174         unsigned flags = 0;
2175         struct net_device *dev_out = NULL;
2176         int free_res = 0;
2177         int err;
2178
2179
2180         res.fi          = NULL;
2181 #ifdef CONFIG_IP_MULTIPLE_TABLES
2182         res.r           = NULL;
2183 #endif
2184
2185         if (oldflp->fl4_src) {
2186                 err = -EINVAL;
2187                 if (MULTICAST(oldflp->fl4_src) ||
2188                     BADCLASS(oldflp->fl4_src) ||
2189                     ZERONET(oldflp->fl4_src))
2190                         goto out;
2191
2192                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2193                 dev_out = ip_dev_find(oldflp->fl4_src);
2194                 if (dev_out == NULL)
2195                         goto out;
2196
2197                 /* I removed the check for oif == dev_out->oif here.
2198                    It was wrong for two reasons:
2199                    1. ip_dev_find(saddr) can return the wrong iface if saddr is
2200                       assigned to multiple interfaces.
2201                    2. Moreover, we are allowed to send packets with the saddr
2202                       of another iface. --ANK
2203                  */
2204
2205                 if (oldflp->oif == 0
2206                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2207                         /* Special hack: the user can direct multicasts
2208                            and limited broadcast via the desired interface
2209                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2210                            This hack is not just for fun, it allows
2211                            vic, vat and friends to work.
2212                            They bind the socket to loopback, set the ttl to zero
2213                            and expect that it will work.
2214                            From the viewpoint of the routing cache they are broken,
2215                            because we are not allowed to build a multicast path
2216                            with a loopback source addr (look, the routing cache
2217                            cannot know that the ttl is zero, so the packet
2218                            will not leave this host and the route is valid).
2219                            Luckily, this hack is a good workaround.
2220                          */
2221
2222                         fl.oif = dev_out->ifindex;
2223                         goto make_route;
2224                 }
2225                 if (dev_out)
2226                         dev_put(dev_out);
2227                 dev_out = NULL;
2228         }
2229
2230
2231         if (oldflp->oif) {
2232                 dev_out = dev_get_by_index(&init_net, oldflp->oif);
2233                 err = -ENODEV;
2234                 if (dev_out == NULL)
2235                         goto out;
2236
2237                 /* RACE: Check return value of inet_select_addr instead. */
2238                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2239                         dev_put(dev_out);
2240                         goto out;       /* Wrong error code */
2241                 }
2242
2243                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2244                         if (!fl.fl4_src)
2245                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2246                                                               RT_SCOPE_LINK);
2247                         goto make_route;
2248                 }
2249                 if (!fl.fl4_src) {
2250                         if (MULTICAST(oldflp->fl4_dst))
2251                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2252                                                               fl.fl4_scope);
2253                         else if (!oldflp->fl4_dst)
2254                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2255                                                               RT_SCOPE_HOST);
2256                 }
2257         }
2258
2259         if (!fl.fl4_dst) {
2260                 fl.fl4_dst = fl.fl4_src;
2261                 if (!fl.fl4_dst)
2262                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2263                 if (dev_out)
2264                         dev_put(dev_out);
2265                 dev_out = init_net.loopback_dev;
2266                 dev_hold(dev_out);
2267                 fl.oif = init_net.loopback_dev->ifindex;
2268                 res.type = RTN_LOCAL;
2269                 flags |= RTCF_LOCAL;
2270                 goto make_route;
2271         }
2272
2273         if (fib_lookup(&fl, &res)) {
2274                 res.fi = NULL;
2275                 if (oldflp->oif) {
2276                         /* Apparently, the routing tables are wrong. Assume
2277                            that the destination is on-link.
2278
2279                            WHY? DW.
2280                            Because we are allowed to send to an iface
2281                            even if it has NO routes and NO assigned
2282                            addresses. When oif is specified, the routing
2283                            tables are looked up with only one purpose:
2284                            to catch whether the destination is gatewayed, rather than
2285                            direct. Moreover, if MSG_DONTROUTE is set,
2286                            we send the packet, ignoring both routing tables
2287                            and ifaddr state. --ANK
2288
2289
2290                            We could do this even if oif is unknown,
2291                            as IPv6 likely does, but we do not.
2292                          */
2293
2294                         if (fl.fl4_src == 0)
2295                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2296                                                               RT_SCOPE_LINK);
2297                         res.type = RTN_UNICAST;
2298                         goto make_route;
2299                 }
2300                 if (dev_out)
2301                         dev_put(dev_out);
2302                 err = -ENETUNREACH;
2303                 goto out;
2304         }
2305         free_res = 1;
2306
2307         if (res.type == RTN_LOCAL) {
2308                 if (!fl.fl4_src)
2309                         fl.fl4_src = fl.fl4_dst;
2310                 if (dev_out)
2311                         dev_put(dev_out);
2312                 dev_out = init_net.loopback_dev;
2313                 dev_hold(dev_out);
2314                 fl.oif = dev_out->ifindex;
2315                 if (res.fi)
2316                         fib_info_put(res.fi);
2317                 res.fi = NULL;
2318                 flags |= RTCF_LOCAL;
2319                 goto make_route;
2320         }
2321
2322 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2323         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2324                 fib_select_multipath(&fl, &res);
2325         else
2326 #endif
2327         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2328                 fib_select_default(&fl, &res);
2329
2330         if (!fl.fl4_src)
2331                 fl.fl4_src = FIB_RES_PREFSRC(res);
2332
2333         if (dev_out)
2334                 dev_put(dev_out);
2335         dev_out = FIB_RES_DEV(res);
2336         dev_hold(dev_out);
2337         fl.oif = dev_out->ifindex;
2338
2339
2340 make_route:
2341         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2342
2343
2344         if (free_res)
2345                 fib_res_put(&res);
2346         if (dev_out)
2347                 dev_put(dev_out);
2348 out:    return err;
2349 }
2350
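/*
 * Output routing fast path.  Search the cache chain for an entry whose
 * flow key (daddr, saddr, oif, mark, tos) matches and which was created
 * for output (iif == 0); fall back to ip_route_output_slow() on a miss.
 */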
2351 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2352 {
2353         unsigned hash;
2354         struct rtable *rth;
2355
2356         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2357
2358         rcu_read_lock_bh();
2359         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2360                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2361                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2362                     rth->fl.fl4_src == flp->fl4_src &&
2363                     rth->fl.iif == 0 &&
2364                     rth->fl.oif == flp->oif &&
2365                     rth->fl.mark == flp->mark &&
2366                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2367                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2368                         dst_use(&rth->u.dst, jiffies);
2369                         RT_CACHE_STAT_INC(out_hit);
2370                         rcu_read_unlock_bh();
2371                         *rp = rth;
2372                         return 0;
2373                 }
2374                 RT_CACHE_STAT_INC(out_hlist_search);
2375         }
2376         rcu_read_unlock_bh();
2377
2378         return ip_route_output_slow(rp, flp);
2379 }
2380
2381 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2382
2383 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2384 {
2385 }
2386
2387 static struct dst_ops ipv4_dst_blackhole_ops = {
2388         .family                 =       AF_INET,
2389         .protocol               =       __constant_htons(ETH_P_IP),
2390         .destroy                =       ipv4_dst_destroy,
2391         .check                  =       ipv4_dst_check,
2392         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2393         .entry_size             =       sizeof(struct rtable),
2394 };
2395
2396
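/*
 * Replace *rp with a "blackhole" copy of the route: the flow key,
 * metrics and addresses are preserved so callers still see a sensible
 * route, but input and output are wired to dst_discard, so any packet
 * using it is silently dropped.  Used by ip_route_output_flow() when
 * __xfrm_lookup() returns -EREMOTE.
 */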
2397 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2398 {
2399         struct rtable *ort = *rp;
2400         struct rtable *rt = (struct rtable *)
2401                 dst_alloc(&ipv4_dst_blackhole_ops);
2402
2403         if (rt) {
2404                 struct dst_entry *new = &rt->u.dst;
2405
2406                 atomic_set(&new->__refcnt, 1);
2407                 new->__use = 1;
2408                 new->input = dst_discard;
2409                 new->output = dst_discard;
2410                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2411
2412                 new->dev = ort->u.dst.dev;
2413                 if (new->dev)
2414                         dev_hold(new->dev);
2415
2416                 rt->fl = ort->fl;
2417
2418                 rt->idev = ort->idev;
2419                 if (rt->idev)
2420                         in_dev_hold(rt->idev);
2421                 rt->rt_flags = ort->rt_flags;
2422                 rt->rt_type = ort->rt_type;
2423                 rt->rt_dst = ort->rt_dst;
2424                 rt->rt_src = ort->rt_src;
2425                 rt->rt_iif = ort->rt_iif;
2426                 rt->rt_gateway = ort->rt_gateway;
2427                 rt->rt_spec_dst = ort->rt_spec_dst;
2428                 rt->peer = ort->peer;
2429                 if (rt->peer)
2430                         atomic_inc(&rt->peer->refcnt);
2431
2432                 dst_free(new);
2433         }
2434
2435         dst_release(&(*rp)->u.dst);
2436         *rp = rt;
2437         return (rt ? 0 : -ENOMEM);
2438 }
2439
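/*
 * Resolve an output route and then, if a transport protocol is set in
 * the flow, run the result through the xfrm (IPsec) lookup so that any
 * matching transformation bundle replaces the plain route.  A lookup
 * that ends in -EREMOTE is converted into a blackhole route instead of
 * being returned as an error.
 */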
2440 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2441 {
2442         int err;
2443
2444         if ((err = __ip_route_output_key(rp, flp)) != 0)
2445                 return err;
2446
2447         if (flp->proto) {
2448                 if (!flp->fl4_src)
2449                         flp->fl4_src = (*rp)->rt_src;
2450                 if (!flp->fl4_dst)
2451                         flp->fl4_dst = (*rp)->rt_dst;
2452                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2453                 if (err == -EREMOTE)
2454                         err = ipv4_dst_blackhole(rp, flp, sk);
2455
2456                 return err;
2457         }
2458
2459         return 0;
2460 }
2461
2462 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2463
2464 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2465 {
2466         return ip_route_output_flow(rp, flp, NULL, 0);
2467 }
2468
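/*
 * Translate a routing cache entry into an RTM_NEWROUTE netlink message:
 * the rtmsg header plus RTA_DST/RTA_SRC/RTA_OIF/RTA_GATEWAY/... attributes,
 * metrics and cache information (expiry, error, peer id/timestamps).
 * For multicast input routes the multicast routing code may be asked
 * to resolve the actual forwarding state.
 */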
2469 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2470                         int nowait, unsigned int flags)
2471 {
2472         struct rtable *rt = (struct rtable*)skb->dst;
2473         struct rtmsg *r;
2474         struct nlmsghdr *nlh;
2475         long expires;
2476         u32 id = 0, ts = 0, tsage = 0, error;
2477
2478         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2479         if (nlh == NULL)
2480                 return -EMSGSIZE;
2481
2482         r = nlmsg_data(nlh);
2483         r->rtm_family    = AF_INET;
2484         r->rtm_dst_len  = 32;
2485         r->rtm_src_len  = 0;
2486         r->rtm_tos      = rt->fl.fl4_tos;
2487         r->rtm_table    = RT_TABLE_MAIN;
2488         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2489         r->rtm_type     = rt->rt_type;
2490         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2491         r->rtm_protocol = RTPROT_UNSPEC;
2492         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2493         if (rt->rt_flags & RTCF_NOTIFY)
2494                 r->rtm_flags |= RTM_F_NOTIFY;
2495
2496         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2497
2498         if (rt->fl.fl4_src) {
2499                 r->rtm_src_len = 32;
2500                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2501         }
2502         if (rt->u.dst.dev)
2503                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2504 #ifdef CONFIG_NET_CLS_ROUTE
2505         if (rt->u.dst.tclassid)
2506                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2507 #endif
2508         if (rt->fl.iif)
2509                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2510         else if (rt->rt_src != rt->fl.fl4_src)
2511                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2512
2513         if (rt->rt_dst != rt->rt_gateway)
2514                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2515
2516         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2517                 goto nla_put_failure;
2518
2519         error = rt->u.dst.error;
2520         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2521         if (rt->peer) {
2522                 id = rt->peer->ip_id_count;
2523                 if (rt->peer->tcp_ts_stamp) {
2524                         ts = rt->peer->tcp_ts;
2525                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2526                 }
2527         }
2528
2529         if (rt->fl.iif) {
2530 #ifdef CONFIG_IP_MROUTE
2531                 __be32 dst = rt->rt_dst;
2532
2533                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2534                     IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2535                         int err = ipmr_get_route(skb, r, nowait);
2536                         if (err <= 0) {
2537                                 if (!nowait) {
2538                                         if (err == 0)
2539                                                 return 0;
2540                                         goto nla_put_failure;
2541                                 } else {
2542                                         if (err == -EMSGSIZE)
2543                                                 goto nla_put_failure;
2544                                         error = err;
2545                                 }
2546                         }
2547                 } else
2548 #endif
2549                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2550         }
2551
2552         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2553                                expires, error) < 0)
2554                 goto nla_put_failure;
2555
2556         return nlmsg_end(skb, nlh);
2557
2558 nla_put_failure:
2559         nlmsg_cancel(skb, nlh);
2560         return -EMSGSIZE;
2561 }
2562
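/*
 * RTM_GETROUTE handler.  Build a dummy skb, resolve the requested
 * destination either through ip_route_input() (when an input interface
 * is given) or ip_route_output_key(), and answer with the rt_fill_info()
 * dump of the resulting cache entry.
 */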
2563 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2564 {
2565         struct net *net = in_skb->sk->sk_net;
2566         struct rtmsg *rtm;
2567         struct nlattr *tb[RTA_MAX+1];
2568         struct rtable *rt = NULL;
2569         __be32 dst = 0;
2570         __be32 src = 0;
2571         u32 iif;
2572         int err;
2573         struct sk_buff *skb;
2574
2575         if (net != &init_net)
2576                 return -EINVAL;
2577
2578         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2579         if (err < 0)
2580                 goto errout;
2581
2582         rtm = nlmsg_data(nlh);
2583
2584         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2585         if (skb == NULL) {
2586                 err = -ENOBUFS;
2587                 goto errout;
2588         }
2589
2590         /* Reserve room for dummy headers; this skb can pass
2591            through a good chunk of the routing engine.
2592          */
2593         skb_reset_mac_header(skb);
2594         skb_reset_network_header(skb);
2595
2596         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2597         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2598         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2599
2600         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2601         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2602         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2603
2604         if (iif) {
2605                 struct net_device *dev;
2606
2607                 dev = __dev_get_by_index(&init_net, iif);
2608                 if (dev == NULL) {
2609                         err = -ENODEV;
2610                         goto errout_free;
2611                 }
2612
2613                 skb->protocol   = htons(ETH_P_IP);
2614                 skb->dev        = dev;
2615                 local_bh_disable();
2616                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2617                 local_bh_enable();
2618
2619                 rt = (struct rtable*) skb->dst;
2620                 if (err == 0 && rt->u.dst.error)
2621                         err = -rt->u.dst.error;
2622         } else {
2623                 struct flowi fl = {
2624                         .nl_u = {
2625                                 .ip4_u = {
2626                                         .daddr = dst,
2627                                         .saddr = src,
2628                                         .tos = rtm->rtm_tos,
2629                                 },
2630                         },
2631                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2632                 };
2633                 err = ip_route_output_key(&rt, &fl);
2634         }
2635
2636         if (err)
2637                 goto errout_free;
2638
2639         skb->dst = &rt->u.dst;
2640         if (rtm->rtm_flags & RTM_F_NOTIFY)
2641                 rt->rt_flags |= RTCF_NOTIFY;
2642
2643         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2644                                 RTM_NEWROUTE, 0, 0);
2645         if (err <= 0)
2646                 goto errout_free;
2647
2648         err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
2649 errout:
2650         return err;
2651
2652 errout_free:
2653         kfree_skb(skb);
2654         goto errout;
2655 }
2656
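/*
 * Netlink dump callback: walk every hash chain of the route cache under
 * rcu_read_lock_bh() and emit one RTM_NEWROUTE message per entry,
 * remembering the current bucket and index in cb->args so a partial
 * dump can be resumed.
 */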
2657 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2658 {
2659         struct rtable *rt;
2660         int h, s_h;
2661         int idx, s_idx;
2662
2663         s_h = cb->args[0];
2664         if (s_h < 0)
2665                 s_h = 0;
2666         s_idx = idx = cb->args[1];
2667         for (h = s_h; h <= rt_hash_mask; h++) {
2668                 rcu_read_lock_bh();
2669                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2670                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2671                         if (idx < s_idx)
2672                                 continue;
2673                         skb->dst = dst_clone(&rt->u.dst);
2674                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2675                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2676                                          1, NLM_F_MULTI) <= 0) {
2677                                 dst_release(xchg(&skb->dst, NULL));
2678                                 rcu_read_unlock_bh();
2679                                 goto done;
2680                         }
2681                         dst_release(xchg(&skb->dst, NULL));
2682                 }
2683                 rcu_read_unlock_bh();
2684                 s_idx = 0;
2685         }
2686
2687 done:
2688         cb->args[0] = h;
2689         cb->args[1] = idx;
2690         return skb->len;
2691 }
2692
2693 void ip_rt_multicast_event(struct in_device *in_dev)
2694 {
2695         rt_cache_flush(0);
2696 }
2697
2698 #ifdef CONFIG_SYSCTL
2699 static int flush_delay;
2700
2701 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2702                                         struct file *filp, void __user *buffer,
2703                                         size_t *lenp, loff_t *ppos)
2704 {
2705         if (write) {
2706                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2707                 rt_cache_flush(flush_delay);
2708                 return 0;
2709         }
2710
2711         return -EINVAL;
2712 }
2713
2714 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2715                                                 int __user *name,
2716                                                 int nlen,
2717                                                 void __user *oldval,
2718                                                 size_t __user *oldlenp,
2719                                                 void __user *newval,
2720                                                 size_t newlen)
2721 {
2722         int delay;
2723         if (newlen != sizeof(int))
2724                 return -EINVAL;
2725         if (get_user(delay, (int __user *)newval))
2726                 return -EFAULT;
2727         rt_cache_flush(delay);
2728         return 0;
2729 }
2730
2731 ctl_table ipv4_route_table[] = {
2732         {
2733                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2734                 .procname       = "flush",
2735                 .data           = &flush_delay,
2736                 .maxlen         = sizeof(int),
2737                 .mode           = 0200,
2738                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2739                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2740         },
2741         {
2742                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2743                 .procname       = "min_delay",
2744                 .data           = &ip_rt_min_delay,
2745                 .maxlen         = sizeof(int),
2746                 .mode           = 0644,
2747                 .proc_handler   = &proc_dointvec_jiffies,
2748                 .strategy       = &sysctl_jiffies,
2749         },
2750         {
2751                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2752                 .procname       = "max_delay",
2753                 .data           = &ip_rt_max_delay,
2754                 .maxlen         = sizeof(int),
2755                 .mode           = 0644,
2756                 .proc_handler   = &proc_dointvec_jiffies,
2757                 .strategy       = &sysctl_jiffies,
2758         },
2759         {
2760                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2761                 .procname       = "gc_thresh",
2762                 .data           = &ipv4_dst_ops.gc_thresh,
2763                 .maxlen         = sizeof(int),
2764                 .mode           = 0644,
2765                 .proc_handler   = &proc_dointvec,
2766         },
2767         {
2768                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2769                 .procname       = "max_size",
2770                 .data           = &ip_rt_max_size,
2771                 .maxlen         = sizeof(int),
2772                 .mode           = 0644,
2773                 .proc_handler   = &proc_dointvec,
2774         },
2775         {
2776                 /*  Deprecated. Use gc_min_interval_ms */
2777
2778                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2779                 .procname       = "gc_min_interval",
2780                 .data           = &ip_rt_gc_min_interval,
2781                 .maxlen         = sizeof(int),
2782                 .mode           = 0644,
2783                 .proc_handler   = &proc_dointvec_jiffies,
2784                 .strategy       = &sysctl_jiffies,
2785         },
2786         {
2787                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2788                 .procname       = "gc_min_interval_ms",
2789                 .data           = &ip_rt_gc_min_interval,
2790                 .maxlen         = sizeof(int),
2791                 .mode           = 0644,
2792                 .proc_handler   = &proc_dointvec_ms_jiffies,
2793                 .strategy       = &sysctl_ms_jiffies,
2794         },
2795         {
2796                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2797                 .procname       = "gc_timeout",
2798                 .data           = &ip_rt_gc_timeout,
2799                 .maxlen         = sizeof(int),
2800                 .mode           = 0644,
2801                 .proc_handler   = &proc_dointvec_jiffies,
2802                 .strategy       = &sysctl_jiffies,
2803         },
2804         {
2805                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2806                 .procname       = "gc_interval",
2807                 .data           = &ip_rt_gc_interval,
2808                 .maxlen         = sizeof(int),
2809                 .mode           = 0644,
2810                 .proc_handler   = &proc_dointvec_jiffies,
2811                 .strategy       = &sysctl_jiffies,
2812         },
2813         {
2814                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2815                 .procname       = "redirect_load",
2816                 .data           = &ip_rt_redirect_load,
2817                 .maxlen         = sizeof(int),
2818                 .mode           = 0644,
2819                 .proc_handler   = &proc_dointvec,
2820         },
2821         {
2822                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2823                 .procname       = "redirect_number",
2824                 .data           = &ip_rt_redirect_number,
2825                 .maxlen         = sizeof(int),
2826                 .mode           = 0644,
2827                 .proc_handler   = &proc_dointvec,
2828         },
2829         {
2830                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2831                 .procname       = "redirect_silence",
2832                 .data           = &ip_rt_redirect_silence,
2833                 .maxlen         = sizeof(int),
2834                 .mode           = 0644,
2835                 .proc_handler   = &proc_dointvec,
2836         },
2837         {
2838                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2839                 .procname       = "error_cost",
2840                 .data           = &ip_rt_error_cost,
2841                 .maxlen         = sizeof(int),
2842                 .mode           = 0644,
2843                 .proc_handler   = &proc_dointvec,
2844         },
2845         {
2846                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2847                 .procname       = "error_burst",
2848                 .data           = &ip_rt_error_burst,
2849                 .maxlen         = sizeof(int),
2850                 .mode           = 0644,
2851                 .proc_handler   = &proc_dointvec,
2852         },
2853         {
2854                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2855                 .procname       = "gc_elasticity",
2856                 .data           = &ip_rt_gc_elasticity,
2857                 .maxlen         = sizeof(int),
2858                 .mode           = 0644,
2859                 .proc_handler   = &proc_dointvec,
2860         },
2861         {
2862                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2863                 .procname       = "mtu_expires",
2864                 .data           = &ip_rt_mtu_expires,
2865                 .maxlen         = sizeof(int),
2866                 .mode           = 0644,
2867                 .proc_handler   = &proc_dointvec_jiffies,
2868                 .strategy       = &sysctl_jiffies,
2869         },
2870         {
2871                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2872                 .procname       = "min_pmtu",
2873                 .data           = &ip_rt_min_pmtu,
2874                 .maxlen         = sizeof(int),
2875                 .mode           = 0644,
2876                 .proc_handler   = &proc_dointvec,
2877         },
2878         {
2879                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2880                 .procname       = "min_adv_mss",
2881                 .data           = &ip_rt_min_advmss,
2882                 .maxlen         = sizeof(int),
2883                 .mode           = 0644,
2884                 .proc_handler   = &proc_dointvec,
2885         },
2886         {
2887                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2888                 .procname       = "secret_interval",
2889                 .data           = &ip_rt_secret_interval,
2890                 .maxlen         = sizeof(int),
2891                 .mode           = 0644,
2892                 .proc_handler   = &proc_dointvec_jiffies,
2893                 .strategy       = &sysctl_jiffies,
2894         },
2895         { .ctl_name = 0 }
2896 };
2897 #endif
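/*
 * Illustrative note: with CONFIG_SYSCTL the table above is exposed under
 * /proc/sys/net/ipv4/route/.  Entries handled by proc_dointvec_jiffies are
 * read and written in seconds and converted to jiffies internally, e.g.
 *
 *      cat /proc/sys/net/ipv4/route/gc_timeout
 *      echo 600 > /proc/sys/net/ipv4/route/secret_interval
 *
 * gc_min_interval_ms exposes the same ip_rt_gc_min_interval variable in
 * milliseconds via proc_dointvec_ms_jiffies.
 */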
2898
2899 #ifdef CONFIG_NET_CLS_ROUTE
2900 struct ip_rt_acct *ip_rt_acct __read_mostly;
2901
2902 /* IP route accounting ptr for this logical cpu number. */
2903 #define IP_RT_ACCT_CPU(cpu) (per_cpu_ptr(ip_rt_acct, cpu))
2904
2905 #ifdef CONFIG_PROC_FS
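/*
 * Read handler for /proc/net/rt_acct: sums the per-CPU ip_rt_acct tables
 * into the caller's buffer.  Offset and length must be 32-bit aligned and
 * are clamped to the 256-entry accounting array.
 */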
2906 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2907                            int length, int *eof, void *data)
2908 {
2909         unsigned int i;
2910
2911         if ((offset & 3) || (length & 3))
2912                 return -EIO;
2913
2914         if (offset >= sizeof(struct ip_rt_acct) * 256) {
2915                 *eof = 1;
2916                 return 0;
2917         }
2918
2919         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2920                 length = sizeof(struct ip_rt_acct) * 256 - offset;
2921                 *eof = 1;
2922         }
2923
2924         offset /= sizeof(u32);
2925
2926         if (length > 0) {
2927                 u32 *dst = (u32 *) buffer;
2928
2929                 *start = buffer;
2930                 memset(dst, 0, length);
2931
2932                 for_each_possible_cpu(i) {
2933                         unsigned int j;
2934                         u32 *src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2935
2936                         for (j = 0; j < length/4; j++)
2937                                 dst[j] += src[j];
2938                 }
2939         }
2940         return length;
2941 }
2942 #endif /* CONFIG_PROC_FS */
2943 #endif /* CONFIG_NET_CLS_ROUTE */
2944
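/*
 * "rhash_entries=N" on the kernel command line overrides the automatic,
 * memory-based sizing of the route cache hash table; the value is handed
 * to alloc_large_system_hash() in ip_rt_init() below.
 */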
2945 static __initdata unsigned long rhash_entries;
2946 static int __init set_rhash_entries(char *str)
2947 {
2948         if (!str)
2949                 return 0;
2950         rhash_entries = simple_strtoul(str, &str, 0);
2951         return 1;
2952 }
2953 __setup("rhash_entries=", set_rhash_entries);
2954
2955 int __init ip_rt_init(void)
2956 {
2957         int rc = 0;
2958
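        /*
         * Boot-time seed for the route cache hash function; the periodic
         * secret rebuild (rt_secret_rebuild() -> rt_run_flush()) later
         * replaces it with fresh random bytes so hash chains cannot be
         * predicted indefinitely.
         */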
2959         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2960                              (jiffies ^ (jiffies >> 7)));
2961
2962 #ifdef CONFIG_NET_CLS_ROUTE
2963         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
2964         if (!ip_rt_acct)
2965                 panic("IP: failed to allocate ip_rt_acct\n");
2966 #endif
2967
2968         ipv4_dst_ops.kmem_cachep =
2969                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2970                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2971
2972         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2973
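        /*
         * Size the route cache hash from total memory unless rhash_entries=
         * was given: scale 15 on machines with at least 128K physical pages
         * (512 MB with 4 KiB pages), 17 otherwise; a smaller scale makes
         * alloc_large_system_hash() allocate more buckets.  rt_hash_log and
         * rt_hash_mask are filled in from the resulting table size.
         */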
2974         rt_hash_table = (struct rt_hash_bucket *)
2975                 alloc_large_system_hash("IP route cache",
2976                                         sizeof(struct rt_hash_bucket),
2977                                         rhash_entries,
2978                                         (num_physpages >= 128 * 1024) ?
2979                                         15 : 17,
2980                                         0,
2981                                         &rt_hash_log,
2982                                         &rt_hash_mask,
2983                                         0);
2984         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
2985         rt_hash_lock_init();
2986
2987         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2988         ip_rt_max_size = (rt_hash_mask + 1) * 16;
2989
2990         devinet_init();
2991         ip_fib_init();
2992
2993         setup_timer(&rt_flush_timer, rt_run_flush, 0);
2994         setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
2995
2996         /* All timers started at system startup tend
2997            to synchronize. Perturb them a bit.
2998          */
2999         schedule_delayed_work(&expires_work,
3000                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3001
3002         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3003                 ip_rt_secret_interval;
3004         add_timer(&rt_secret_timer);
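        /*
         * Both initial delays above land uniformly in [interval, 2*interval):
         * the flush/GC work runs after ip_rt_gc_interval..2*ip_rt_gc_interval
         * jiffies, and the secret rebuild timer fires after
         * ip_rt_secret_interval..2*ip_rt_secret_interval jiffies.
         */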
3005
3006 #ifdef CONFIG_PROC_FS
3007         {
3008         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3009         if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3010             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3011                                              init_net.proc_net_stat))) {
3012                 return -ENOMEM;
3013         }
3014         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3015         }
3016 #ifdef CONFIG_NET_CLS_ROUTE
3017         create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
3018 #endif
3019 #endif
3020 #ifdef CONFIG_XFRM
3021         xfrm_init();
3022         xfrm4_init();
3023 #endif
3024         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3025
3026         return rc;
3027 }
3028
3029 EXPORT_SYMBOL(__ip_select_ident);
3030 EXPORT_SYMBOL(ip_route_input);
3031 EXPORT_SYMBOL(ip_route_output_key);