/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_min_delay		= 2 * HZ;
static int ip_rt_max_delay		= 10 * HZ;
static int ip_rt_max_size;
static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
static int ip_rt_gc_interval		= 60 * HZ;
static int ip_rt_gc_min_interval	= HZ / 2;
static int ip_rt_redirect_number	= 9;
static int ip_rt_redirect_load		= HZ / 50;
static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost		= HZ;
static int ip_rt_error_burst		= 5 * HZ;
static int ip_rt_gc_elasticity		= 8;
static int ip_rt_mtu_expires		= 10 * 60 * HZ;
static int ip_rt_min_pmtu		= 512 + 20 + 20;
static int ip_rt_min_advmss		= 256;
static int ip_rt_secret_interval	= 10 * 60 * HZ;
static unsigned long rt_deadline;

#define RTprint(a...)	printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		__constant_htons(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.entry_size =		sizeof(struct rtable),
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};

/*
 * Route cache.
 */

/* The locking scheme is rather straight forward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
	struct rtable	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
# define rt_hash_lock_init()	{ \
		int i; \
		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
			spin_lock_init(&rt_hash_locks[i]); \
		}
#else
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif

static struct rt_hash_bucket	*rt_hash_table;
static unsigned			rt_hash_mask;
static int			rt_hash_log;
static unsigned int		rt_hash_rnd;

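/* Per-cpu route cache statistics; RT_CACHE_STAT_INC() bumps a counter
 * on the local CPU without taking any lock. */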
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)

static int rt_intern_hash(unsigned hash, struct rtable *rth,
				struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr)
{
	return (jhash_2words(daddr, saddr, rt_hash_rnd)
		& rt_hash_mask);
}

#define rt_hash(daddr, saddr, idx) \
	rt_hash_code((__force u32)(__be32)(daddr),\
		(__force u32)(__be32)(saddr) ^ ((idx) << 5))

1da177e4
LT
267#ifdef CONFIG_PROC_FS
268struct rt_cache_iter_state {
269 int bucket;
270};
271
272static struct rtable *rt_cache_get_first(struct seq_file *seq)
273{
274 struct rtable *r = NULL;
275 struct rt_cache_iter_state *st = seq->private;
276
277 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
278 rcu_read_lock_bh();
279 r = rt_hash_table[st->bucket].chain;
280 if (r)
281 break;
282 rcu_read_unlock_bh();
283 }
284 return r;
285}
286
287static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
288{
289 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
290
093c2ca4 291 r = r->u.dst.rt_next;
1da177e4
LT
292 while (!r) {
293 rcu_read_unlock_bh();
294 if (--st->bucket < 0)
295 break;
296 rcu_read_lock_bh();
297 r = rt_hash_table[st->bucket].chain;
298 }
299 return r;
300}
301
302static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
303{
304 struct rtable *r = rt_cache_get_first(seq);
305
306 if (r)
307 while (pos && (r = rt_cache_get_next(seq, r)))
308 --pos;
309 return pos ? NULL : r;
310}
311
312static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
313{
314 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
315}
316
317static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
318{
319 struct rtable *r = NULL;
320
321 if (v == SEQ_START_TOKEN)
322 r = rt_cache_get_first(seq);
323 else
324 r = rt_cache_get_next(seq, v);
325 ++*pos;
326 return r;
327}
328
329static void rt_cache_seq_stop(struct seq_file *seq, void *v)
330{
331 if (v && v != SEQ_START_TOKEN)
332 rcu_read_unlock_bh();
333}
334
335static int rt_cache_seq_show(struct seq_file *seq, void *v)
336{
337 if (v == SEQ_START_TOKEN)
338 seq_printf(seq, "%-127s\n",
339 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
340 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
341 "HHUptod\tSpecDst");
342 else {
343 struct rtable *r = v;
344 char temp[256];
345
346 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
347 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
348 r->u.dst.dev ? r->u.dst.dev->name : "*",
349 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
350 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
351 r->u.dst.__use, 0, (unsigned long)r->rt_src,
352 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
353 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
354 dst_metric(&r->u.dst, RTAX_WINDOW),
355 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
356 dst_metric(&r->u.dst, RTAX_RTTVAR)),
357 r->fl.fl4_tos,
358 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
359 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
360 dev_queue_xmit) : 0,
361 r->rt_spec_dst);
362 seq_printf(seq, "%-127s\n", temp);
e905a9ed
YH
363 }
364 return 0;
1da177e4
LT
365}
366
f690808e 367static const struct seq_operations rt_cache_seq_ops = {
1da177e4
LT
368 .start = rt_cache_seq_start,
369 .next = rt_cache_seq_next,
370 .stop = rt_cache_seq_stop,
371 .show = rt_cache_seq_show,
372};
373
374static int rt_cache_seq_open(struct inode *inode, struct file *file)
375{
376 struct seq_file *seq;
377 int rc = -ENOMEM;
1bcabbdb 378 struct rt_cache_iter_state *s;
1da177e4 379
1bcabbdb 380 s = kzalloc(sizeof(*s), GFP_KERNEL);
1da177e4
LT
381 if (!s)
382 goto out;
383 rc = seq_open(file, &rt_cache_seq_ops);
384 if (rc)
385 goto out_kfree;
386 seq = file->private_data;
387 seq->private = s;
1da177e4
LT
388out:
389 return rc;
390out_kfree:
391 kfree(s);
392 goto out;
393}
394
9a32144e 395static const struct file_operations rt_cache_seq_fops = {
1da177e4
LT
396 .owner = THIS_MODULE,
397 .open = rt_cache_seq_open,
398 .read = seq_read,
399 .llseek = seq_lseek,
400 .release = seq_release_private,
401};
402

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#endif /* CONFIG_PROC_FS */

static __inline__ void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.dst.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}

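/* Two cached flows are treated as identical only when daddr, saddr, mark,
 * oif, iif and the 16 bits holding the TOS all match; ORing the XORed
 * fields lets the whole comparison collapse to a single test against zero. */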
static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}

576static void rt_check_expire(unsigned long dummy)
577{
bb1d23b0
ED
578 static unsigned int rover;
579 unsigned int i = rover, goal;
1da177e4
LT
580 struct rtable *rth, **rthp;
581 unsigned long now = jiffies;
bb1d23b0
ED
582 u64 mult;
583
584 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
585 if (ip_rt_gc_timeout > 1)
586 do_div(mult, ip_rt_gc_timeout);
587 goal = (unsigned int)mult;
588 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
589 for (; goal > 0; goal--) {
1da177e4
LT
590 unsigned long tmo = ip_rt_gc_timeout;
591
592 i = (i + 1) & rt_hash_mask;
593 rthp = &rt_hash_table[i].chain;
594
bb1d23b0
ED
595 if (*rthp == 0)
596 continue;
22c047cc 597 spin_lock(rt_hash_lock_addr(i));
1da177e4
LT
598 while ((rth = *rthp) != NULL) {
599 if (rth->u.dst.expires) {
600 /* Entry is expired even if it is in use */
601 if (time_before_eq(now, rth->u.dst.expires)) {
602 tmo >>= 1;
093c2ca4 603 rthp = &rth->u.dst.rt_next;
1da177e4
LT
604 continue;
605 }
606 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
607 tmo >>= 1;
093c2ca4 608 rthp = &rth->u.dst.rt_next;
1da177e4
LT
609 continue;
610 }
611
612 /* Cleanup aged off entries. */
093c2ca4 613 *rthp = rth->u.dst.rt_next;
e905a9ed 614 rt_free(rth);
1da177e4 615 }
22c047cc 616 spin_unlock(rt_hash_lock_addr(i));
1da177e4
LT
617
618 /* Fallback loop breaker. */
619 if (time_after(jiffies, now))
620 break;
621 }
622 rover = i;
bb1d23b0 623 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
1da177e4
LT
624}
625
/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
	int i;
	struct rtable *rth, *next;

	rt_deadline = 0;

	get_random_bytes(&rt_hash_rnd, 4);

	for (i = rt_hash_mask; i >= 0; i--) {
		spin_lock_bh(rt_hash_lock_addr(i));
		rth = rt_hash_table[i].chain;
		if (rth)
			rt_hash_table[i].chain = NULL;
		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; rth; rth = next) {
			next = rth->u.dst.rt_next;
			rt_free(rth);
		}
	}
}

static DEFINE_SPINLOCK(rt_flush_lock);

void rt_cache_flush(int delay)
{
	unsigned long now = jiffies;
	int user_mode = !in_softirq();

	if (delay < 0)
		delay = ip_rt_min_delay;

	spin_lock_bh(&rt_flush_lock);

	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
		long tmo = (long)(rt_deadline - now);

		/* If flush timer is already running
		   and flush request is not immediate (delay > 0):

		   if deadline is not achieved, prolongate timer to "delay",
		   otherwise fire it at deadline time.
		 */

		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
			tmo = 0;

		if (delay > tmo)
			delay = tmo;
	}

	if (delay <= 0) {
		spin_unlock_bh(&rt_flush_lock);
		rt_run_flush(0);
		return;
	}

	if (rt_deadline == 0)
		rt_deadline = now + ip_rt_max_delay;

	mod_timer(&rt_flush_timer, now+delay);
	spin_unlock_bh(&rt_flush_lock);
}

static void rt_secret_rebuild(unsigned long dummy)
{
	unsigned long now = jiffies;

	rt_cache_flush(0);
	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expire is large enough to keep enough warm entries,
   and when load increases it is reduced to limit the cache size.
 */

static int rt_garbage_collect(void)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate number of entries, which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire is reduced to zero; otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}

836static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
837{
838 struct rtable *rth, **rthp;
839 unsigned long now;
840 struct rtable *cand, **candp;
841 u32 min_score;
842 int chain_length;
843 int attempts = !in_softirq();
844
845restart:
846 chain_length = 0;
847 min_score = ~(u32)0;
848 cand = NULL;
849 candp = NULL;
850 now = jiffies;
851
852 rthp = &rt_hash_table[hash].chain;
853
22c047cc 854 spin_lock_bh(rt_hash_lock_addr(hash));
1da177e4 855 while ((rth = *rthp) != NULL) {
1da177e4 856 if (compare_keys(&rth->fl, &rt->fl)) {
1da177e4 857 /* Put it first */
093c2ca4 858 *rthp = rth->u.dst.rt_next;
1da177e4
LT
859 /*
860 * Since lookup is lockfree, the deletion
861 * must be visible to another weakly ordered CPU before
862 * the insertion at the start of the hash chain.
863 */
093c2ca4 864 rcu_assign_pointer(rth->u.dst.rt_next,
1da177e4
LT
865 rt_hash_table[hash].chain);
866 /*
867 * Since lookup is lockfree, the update writes
868 * must be ordered for consistency on SMP.
869 */
870 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
871
872 rth->u.dst.__use++;
873 dst_hold(&rth->u.dst);
874 rth->u.dst.lastuse = now;
22c047cc 875 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
876
877 rt_drop(rt);
878 *rp = rth;
879 return 0;
880 }
881
882 if (!atomic_read(&rth->u.dst.__refcnt)) {
883 u32 score = rt_score(rth);
884
885 if (score <= min_score) {
886 cand = rth;
887 candp = rthp;
888 min_score = score;
889 }
890 }
891
892 chain_length++;
893
093c2ca4 894 rthp = &rth->u.dst.rt_next;
1da177e4
LT
895 }
896
897 if (cand) {
898 /* ip_rt_gc_elasticity used to be average length of chain
899 * length, when exceeded gc becomes really aggressive.
900 *
901 * The second limit is less certain. At the moment it allows
902 * only 2 entries per bucket. We will see.
903 */
904 if (chain_length > ip_rt_gc_elasticity) {
093c2ca4 905 *candp = cand->u.dst.rt_next;
1da177e4
LT
906 rt_free(cand);
907 }
908 }
909
910 /* Try to bind route to arp only if it is output
911 route or unicast forwarding path.
912 */
913 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
914 int err = arp_bind_neighbour(&rt->u.dst);
915 if (err) {
22c047cc 916 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
917
918 if (err != -ENOBUFS) {
919 rt_drop(rt);
920 return err;
921 }
922
923 /* Neighbour tables are full and nothing
924 can be released. Try to shrink route cache,
925 it is most likely it holds some neighbour records.
926 */
927 if (attempts-- > 0) {
928 int saved_elasticity = ip_rt_gc_elasticity;
929 int saved_int = ip_rt_gc_min_interval;
930 ip_rt_gc_elasticity = 1;
931 ip_rt_gc_min_interval = 0;
932 rt_garbage_collect();
933 ip_rt_gc_min_interval = saved_int;
934 ip_rt_gc_elasticity = saved_elasticity;
935 goto restart;
936 }
937
938 if (net_ratelimit())
939 printk(KERN_WARNING "Neighbour table overflow.\n");
940 rt_drop(rt);
941 return -ENOBUFS;
942 }
943 }
944
093c2ca4 945 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1da177e4 946#if RT_CACHE_DEBUG >= 2
093c2ca4 947 if (rt->u.dst.rt_next) {
1da177e4
LT
948 struct rtable *trt;
949 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
950 NIPQUAD(rt->rt_dst));
093c2ca4 951 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1da177e4
LT
952 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
953 printk("\n");
954 }
955#endif
956 rt_hash_table[hash].chain = rt;
22c047cc 957 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
958 *rp = rt;
959 return 0;
960}
961
962void rt_bind_peer(struct rtable *rt, int create)
963{
964 static DEFINE_SPINLOCK(rt_peer_lock);
965 struct inet_peer *peer;
966
967 peer = inet_getpeer(rt->rt_dst, create);
968
969 spin_lock_bh(&rt_peer_lock);
970 if (rt->peer == NULL) {
971 rt->peer = peer;
972 peer = NULL;
973 }
974 spin_unlock_bh(&rt_peer_lock);
975 if (peer)
976 inet_putpeer(peer);
977}
978
979/*
980 * Peer allocation may fail only in serious out-of-memory conditions. However
981 * we still can generate some output.
982 * Random ID selection looks a bit dangerous because we have no chances to
983 * select ID being unique in a reasonable period of time.
984 * But broken packet identifier may be better than no packet at all.
985 */
986static void ip_select_fb_ident(struct iphdr *iph)
987{
988 static DEFINE_SPINLOCK(ip_fb_id_lock);
989 static u32 ip_fallback_id;
990 u32 salt;
991
992 spin_lock_bh(&ip_fb_id_lock);
e448515c 993 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1da177e4
LT
994 iph->id = htons(salt & 0xFFFF);
995 ip_fallback_id = salt;
996 spin_unlock_bh(&ip_fb_id_lock);
997}
998
999void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1000{
1001 struct rtable *rt = (struct rtable *) dst;
1002
1003 if (rt) {
1004 if (rt->peer == NULL)
1005 rt_bind_peer(rt, 1);
1006
1007 /* If peer is attached to destination, it is never detached,
1008 so that we need not to grab a lock to dereference it.
1009 */
1010 if (rt->peer) {
1011 iph->id = htons(inet_getid(rt->peer, more));
1012 return;
1013 }
1014 } else
e905a9ed 1015 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
9c2b3328 1016 __builtin_return_address(0));
1da177e4
LT
1017
1018 ip_select_fb_ident(iph);
1019}
1020
1021static void rt_del(unsigned hash, struct rtable *rt)
1022{
1023 struct rtable **rthp;
1024
22c047cc 1025 spin_lock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1026 ip_rt_put(rt);
1027 for (rthp = &rt_hash_table[hash].chain; *rthp;
093c2ca4 1028 rthp = &(*rthp)->u.dst.rt_next)
1da177e4 1029 if (*rthp == rt) {
093c2ca4 1030 *rthp = rt->u.dst.rt_next;
1da177e4
LT
1031 rt_free(rt);
1032 break;
1033 }
22c047cc 1034 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1035}
1036
f7655229
AV
1037void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1038 __be32 saddr, struct net_device *dev)
1da177e4
LT
1039{
1040 int i, k;
1041 struct in_device *in_dev = in_dev_get(dev);
1042 struct rtable *rth, **rthp;
f7655229 1043 __be32 skeys[2] = { saddr, 0 };
1da177e4 1044 int ikeys[2] = { dev->ifindex, 0 };
8d71740c 1045 struct netevent_redirect netevent;
1da177e4 1046
1da177e4
LT
1047 if (!in_dev)
1048 return;
1049
1050 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1051 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1052 goto reject_redirect;
1053
1054 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1055 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1056 goto reject_redirect;
1057 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1058 goto reject_redirect;
1059 } else {
1060 if (inet_addr_type(new_gw) != RTN_UNICAST)
1061 goto reject_redirect;
1062 }
1063
1064 for (i = 0; i < 2; i++) {
1065 for (k = 0; k < 2; k++) {
8c7bc840 1066 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1da177e4
LT
1067
1068 rthp=&rt_hash_table[hash].chain;
1069
1070 rcu_read_lock();
1071 while ((rth = rcu_dereference(*rthp)) != NULL) {
1072 struct rtable *rt;
1073
1074 if (rth->fl.fl4_dst != daddr ||
1075 rth->fl.fl4_src != skeys[i] ||
1da177e4
LT
1076 rth->fl.oif != ikeys[k] ||
1077 rth->fl.iif != 0) {
093c2ca4 1078 rthp = &rth->u.dst.rt_next;
1da177e4
LT
1079 continue;
1080 }
1081
1082 if (rth->rt_dst != daddr ||
1083 rth->rt_src != saddr ||
1084 rth->u.dst.error ||
1085 rth->rt_gateway != old_gw ||
1086 rth->u.dst.dev != dev)
1087 break;
1088
1089 dst_hold(&rth->u.dst);
1090 rcu_read_unlock();
1091
1092 rt = dst_alloc(&ipv4_dst_ops);
1093 if (rt == NULL) {
1094 ip_rt_put(rth);
1095 in_dev_put(in_dev);
1096 return;
1097 }
1098
1099 /* Copy all the information. */
1100 *rt = *rth;
e905a9ed 1101 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1da177e4
LT
1102 rt->u.dst.__use = 1;
1103 atomic_set(&rt->u.dst.__refcnt, 1);
1104 rt->u.dst.child = NULL;
1105 if (rt->u.dst.dev)
1106 dev_hold(rt->u.dst.dev);
1107 if (rt->idev)
1108 in_dev_hold(rt->idev);
1109 rt->u.dst.obsolete = 0;
1110 rt->u.dst.lastuse = jiffies;
1111 rt->u.dst.path = &rt->u.dst;
1112 rt->u.dst.neighbour = NULL;
1113 rt->u.dst.hh = NULL;
1114 rt->u.dst.xfrm = NULL;
1115
1116 rt->rt_flags |= RTCF_REDIRECTED;
1117
1118 /* Gateway is different ... */
1119 rt->rt_gateway = new_gw;
1120
1121 /* Redirect received -> path was valid */
1122 dst_confirm(&rth->u.dst);
1123
1124 if (rt->peer)
1125 atomic_inc(&rt->peer->refcnt);
1126
1127 if (arp_bind_neighbour(&rt->u.dst) ||
1128 !(rt->u.dst.neighbour->nud_state &
1129 NUD_VALID)) {
1130 if (rt->u.dst.neighbour)
1131 neigh_event_send(rt->u.dst.neighbour, NULL);
1132 ip_rt_put(rth);
1133 rt_drop(rt);
1134 goto do_next;
1135 }
e905a9ed 1136
8d71740c
TT
1137 netevent.old = &rth->u.dst;
1138 netevent.new = &rt->u.dst;
e905a9ed
YH
1139 call_netevent_notifiers(NETEVENT_REDIRECT,
1140 &netevent);
1da177e4
LT
1141
1142 rt_del(hash, rth);
1143 if (!rt_intern_hash(hash, rt, &rt))
1144 ip_rt_put(rt);
1145 goto do_next;
1146 }
1147 rcu_read_unlock();
1148 do_next:
1149 ;
1150 }
1151 }
1152 in_dev_put(in_dev);
1153 return;
1154
1155reject_redirect:
1156#ifdef CONFIG_IP_ROUTE_VERBOSE
1157 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1158 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1159 "%u.%u.%u.%u ignored.\n"
cef2685e 1160 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1da177e4 1161 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
cef2685e 1162 NIPQUAD(saddr), NIPQUAD(daddr));
1da177e4
LT
1163#endif
1164 in_dev_put(in_dev);
1165}
1166
1167static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1168{
1169 struct rtable *rt = (struct rtable*)dst;
1170 struct dst_entry *ret = dst;
1171
1172 if (rt) {
1173 if (dst->obsolete) {
1174 ip_rt_put(rt);
1175 ret = NULL;
1176 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1177 rt->u.dst.expires) {
8c7bc840
AV
1178 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1179 rt->fl.oif);
1da177e4
LT
1180#if RT_CACHE_DEBUG >= 1
1181 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1182 "%u.%u.%u.%u/%02x dropped\n",
1183 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1184#endif
1185 rt_del(hash, rt);
1186 ret = NULL;
1187 }
1188 }
1189 return ret;
1190}
1191
1192/*
1193 * Algorithm:
1194 * 1. The first ip_rt_redirect_number redirects are sent
1195 * with exponential backoff, then we stop sending them at all,
1196 * assuming that the host ignores our redirects.
1197 * 2. If we did not see packets requiring redirects
1198 * during ip_rt_redirect_silence, we assume that the host
1199 * forgot redirected route and start to send redirects again.
1200 *
1201 * This algorithm is much cheaper and more intelligent than dumb load limiting
1202 * in icmp.c.
1203 *
1204 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1205 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1206 */
1207
1208void ip_rt_send_redirect(struct sk_buff *skb)
1209{
1210 struct rtable *rt = (struct rtable*)skb->dst;
1211 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1212
1213 if (!in_dev)
1214 return;
1215
1216 if (!IN_DEV_TX_REDIRECTS(in_dev))
1217 goto out;
1218
1219 /* No redirected packets during ip_rt_redirect_silence;
1220 * reset the algorithm.
1221 */
1222 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1223 rt->u.dst.rate_tokens = 0;
1224
1225 /* Too many ignored redirects; do not send anything
1226 * set u.dst.rate_last to the last seen redirected packet.
1227 */
1228 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1229 rt->u.dst.rate_last = jiffies;
1230 goto out;
1231 }
1232
1233 /* Check for load limit; set rate_last to the latest sent
1234 * redirect.
1235 */
14fb8a76
LY
1236 if (rt->u.dst.rate_tokens == 0 ||
1237 time_after(jiffies,
1da177e4
LT
1238 (rt->u.dst.rate_last +
1239 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1240 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1241 rt->u.dst.rate_last = jiffies;
1242 ++rt->u.dst.rate_tokens;
1243#ifdef CONFIG_IP_ROUTE_VERBOSE
1244 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1245 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1246 net_ratelimit())
1247 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1248 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1249 NIPQUAD(rt->rt_src), rt->rt_iif,
1250 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1251#endif
1252 }
1253out:
e905a9ed 1254 in_dev_put(in_dev);
1da177e4
LT
1255}
1256
1257static int ip_error(struct sk_buff *skb)
1258{
1259 struct rtable *rt = (struct rtable*)skb->dst;
1260 unsigned long now;
1261 int code;
1262
1263 switch (rt->u.dst.error) {
1264 case EINVAL:
1265 default:
1266 goto out;
1267 case EHOSTUNREACH:
1268 code = ICMP_HOST_UNREACH;
1269 break;
1270 case ENETUNREACH:
1271 code = ICMP_NET_UNREACH;
1272 break;
1273 case EACCES:
1274 code = ICMP_PKT_FILTERED;
1275 break;
1276 }
1277
1278 now = jiffies;
1279 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1280 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1281 rt->u.dst.rate_tokens = ip_rt_error_burst;
1282 rt->u.dst.rate_last = now;
1283 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1284 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1285 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1286 }
1287
1288out: kfree_skb(skb);
1289 return 0;
e905a9ed 1290}
1da177e4
LT
1291
1292/*
1293 * The last two values are not from the RFC but
1294 * are needed for AMPRnet AX.25 paths.
1295 */
1296
9b5b5cff 1297static const unsigned short mtu_plateau[] =
1da177e4
LT
1298{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1299
1300static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1301{
1302 int i;
e905a9ed 1303
1da177e4
LT
1304 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1305 if (old_mtu > mtu_plateau[i])
1306 return mtu_plateau[i];
1307 return 68;
1308}
1309
1310unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1311{
1312 int i;
1313 unsigned short old_mtu = ntohs(iph->tot_len);
1314 struct rtable *rth;
e448515c
AV
1315 __be32 skeys[2] = { iph->saddr, 0, };
1316 __be32 daddr = iph->daddr;
1da177e4
LT
1317 unsigned short est_mtu = 0;
1318
1319 if (ipv4_config.no_pmtu_disc)
1320 return 0;
1321
1322 for (i = 0; i < 2; i++) {
8c7bc840 1323 unsigned hash = rt_hash(daddr, skeys[i], 0);
1da177e4
LT
1324
1325 rcu_read_lock();
1326 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
093c2ca4 1327 rth = rcu_dereference(rth->u.dst.rt_next)) {
1da177e4
LT
1328 if (rth->fl.fl4_dst == daddr &&
1329 rth->fl.fl4_src == skeys[i] &&
1330 rth->rt_dst == daddr &&
1331 rth->rt_src == iph->saddr &&
1da177e4
LT
1332 rth->fl.iif == 0 &&
1333 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1334 unsigned short mtu = new_mtu;
1335
1336 if (new_mtu < 68 || new_mtu >= old_mtu) {
1337
1338 /* BSD 4.2 compatibility hack :-( */
1339 if (mtu == 0 &&
1340 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1341 old_mtu >= 68 + (iph->ihl << 2))
1342 old_mtu -= iph->ihl << 2;
1343
1344 mtu = guess_mtu(old_mtu);
1345 }
1346 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
e905a9ed 1347 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1da177e4
LT
1348 dst_confirm(&rth->u.dst);
1349 if (mtu < ip_rt_min_pmtu) {
1350 mtu = ip_rt_min_pmtu;
1351 rth->u.dst.metrics[RTAX_LOCK-1] |=
1352 (1 << RTAX_MTU);
1353 }
1354 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1355 dst_set_expires(&rth->u.dst,
1356 ip_rt_mtu_expires);
1357 }
1358 est_mtu = mtu;
1359 }
1360 }
1361 }
1362 rcu_read_unlock();
1363 }
1364 return est_mtu ? : new_mtu;
1365}
1366
1367static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1368{
1369 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1370 !(dst_metric_locked(dst, RTAX_MTU))) {
1371 if (mtu < ip_rt_min_pmtu) {
1372 mtu = ip_rt_min_pmtu;
1373 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1374 }
1375 dst->metrics[RTAX_MTU-1] = mtu;
1376 dst_set_expires(dst, ip_rt_mtu_expires);
8d71740c 1377 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1da177e4
LT
1378 }
1379}
1380
1381static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1382{
1383 return NULL;
1384}
1385
1386static void ipv4_dst_destroy(struct dst_entry *dst)
1387{
1388 struct rtable *rt = (struct rtable *) dst;
1389 struct inet_peer *peer = rt->peer;
1390 struct in_device *idev = rt->idev;
1391
1392 if (peer) {
1393 rt->peer = NULL;
1394 inet_putpeer(peer);
1395 }
1396
1397 if (idev) {
1398 rt->idev = NULL;
1399 in_dev_put(idev);
1400 }
1401}
1402
1403static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1404 int how)
1405{
1406 struct rtable *rt = (struct rtable *) dst;
1407 struct in_device *idev = rt->idev;
1408 if (dev != &loopback_dev && idev && idev->dev == dev) {
1409 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1410 if (loopback_idev) {
1411 rt->idev = loopback_idev;
1412 in_dev_put(idev);
1413 }
1414 }
1415}
1416
1417static void ipv4_link_failure(struct sk_buff *skb)
1418{
1419 struct rtable *rt;
1420
1421 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1422
1423 rt = (struct rtable *) skb->dst;
1424 if (rt)
1425 dst_set_expires(&rt->u.dst, 0);
1426}
1427
1428static int ip_rt_bug(struct sk_buff *skb)
1429{
1430 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
eddc9ec5 1431 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1da177e4
LT
1432 skb->dev ? skb->dev->name : "?");
1433 kfree_skb(skb);
1434 return 0;
1435}
1436
1437/*
1438 We do not cache source address of outgoing interface,
1439 because it is used only by IP RR, TS and SRR options,
1440 so that it out of fast path.
1441
1442 BTW remember: "addr" is allowed to be not aligned
1443 in IP options!
1444 */
1445
1446void ip_rt_get_source(u8 *addr, struct rtable *rt)
1447{
a61ced5d 1448 __be32 src;
1da177e4
LT
1449 struct fib_result res;
1450
1451 if (rt->fl.iif == 0)
1452 src = rt->rt_src;
1453 else if (fib_lookup(&rt->fl, &res) == 0) {
1454 src = FIB_RES_PREFSRC(res);
1455 fib_res_put(&res);
1456 } else
1457 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1458 RT_SCOPE_UNIVERSE);
1459 memcpy(addr, &src, 4);
1460}
1461
1462#ifdef CONFIG_NET_CLS_ROUTE
1463static void set_class_tag(struct rtable *rt, u32 tag)
1464{
1465 if (!(rt->u.dst.tclassid & 0xFFFF))
1466 rt->u.dst.tclassid |= tag & 0xFFFF;
1467 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1468 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1469}
1470#endif
1471
1472static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1473{
1474 struct fib_info *fi = res->fi;
1475
1476 if (fi) {
1477 if (FIB_RES_GW(*res) &&
1478 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1479 rt->rt_gateway = FIB_RES_GW(*res);
1480 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1481 sizeof(rt->u.dst.metrics));
1482 if (fi->fib_mtu == 0) {
1483 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1484 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1485 rt->rt_gateway != rt->rt_dst &&
1486 rt->u.dst.dev->mtu > 576)
1487 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1488 }
1489#ifdef CONFIG_NET_CLS_ROUTE
1490 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1491#endif
1492 } else
1493 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1494
1495 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1496 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1497 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1498 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1499 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1500 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1501 ip_rt_min_advmss);
1502 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1503 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1504
1505#ifdef CONFIG_NET_CLS_ROUTE
1506#ifdef CONFIG_IP_MULTIPLE_TABLES
1507 set_class_tag(rt, fib_rules_tclass(res));
1508#endif
1509 set_class_tag(rt, itag);
1510#endif
e905a9ed 1511 rt->rt_type = res->type;
1da177e4
LT
1512}
1513
9e12bb22 1514static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1515 u8 tos, struct net_device *dev, int our)
1516{
1517 unsigned hash;
1518 struct rtable *rth;
a61ced5d 1519 __be32 spec_dst;
1da177e4
LT
1520 struct in_device *in_dev = in_dev_get(dev);
1521 u32 itag = 0;
1522
1523 /* Primary sanity checks. */
1524
1525 if (in_dev == NULL)
1526 return -EINVAL;
1527
1528 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1529 skb->protocol != htons(ETH_P_IP))
1530 goto e_inval;
1531
1532 if (ZERONET(saddr)) {
1533 if (!LOCAL_MCAST(daddr))
1534 goto e_inval;
1535 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1536 } else if (fib_validate_source(saddr, 0, tos, 0,
1537 dev, &spec_dst, &itag) < 0)
1538 goto e_inval;
1539
1540 rth = dst_alloc(&ipv4_dst_ops);
1541 if (!rth)
1542 goto e_nobufs;
1543
1544 rth->u.dst.output= ip_rt_bug;
1545
1546 atomic_set(&rth->u.dst.__refcnt, 1);
1547 rth->u.dst.flags= DST_HOST;
42f811b8 1548 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4
LT
1549 rth->u.dst.flags |= DST_NOPOLICY;
1550 rth->fl.fl4_dst = daddr;
1551 rth->rt_dst = daddr;
1552 rth->fl.fl4_tos = tos;
47dcf0cb 1553 rth->fl.mark = skb->mark;
1da177e4
LT
1554 rth->fl.fl4_src = saddr;
1555 rth->rt_src = saddr;
1556#ifdef CONFIG_NET_CLS_ROUTE
1557 rth->u.dst.tclassid = itag;
1558#endif
1559 rth->rt_iif =
1560 rth->fl.iif = dev->ifindex;
1561 rth->u.dst.dev = &loopback_dev;
1562 dev_hold(rth->u.dst.dev);
1563 rth->idev = in_dev_get(rth->u.dst.dev);
1564 rth->fl.oif = 0;
1565 rth->rt_gateway = daddr;
1566 rth->rt_spec_dst= spec_dst;
1567 rth->rt_type = RTN_MULTICAST;
1568 rth->rt_flags = RTCF_MULTICAST;
1569 if (our) {
1570 rth->u.dst.input= ip_local_deliver;
1571 rth->rt_flags |= RTCF_LOCAL;
1572 }
1573
1574#ifdef CONFIG_IP_MROUTE
1575 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1576 rth->u.dst.input = ip_mr_input;
1577#endif
1578 RT_CACHE_STAT_INC(in_slow_mc);
1579
1580 in_dev_put(in_dev);
8c7bc840 1581 hash = rt_hash(daddr, saddr, dev->ifindex);
1da177e4
LT
1582 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1583
1584e_nobufs:
1585 in_dev_put(in_dev);
1586 return -ENOBUFS;
1587
1588e_inval:
1589 in_dev_put(in_dev);
1590 return -EINVAL;
1591}
1592
1593
1594static void ip_handle_martian_source(struct net_device *dev,
1595 struct in_device *in_dev,
1596 struct sk_buff *skb,
9e12bb22
AV
1597 __be32 daddr,
1598 __be32 saddr)
1da177e4
LT
1599{
1600 RT_CACHE_STAT_INC(in_martian_src);
1601#ifdef CONFIG_IP_ROUTE_VERBOSE
1602 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1603 /*
1604 * RFC1812 recommendation, if source is martian,
1605 * the only hint is MAC header.
1606 */
1607 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1608 "%u.%u.%u.%u, on dev %s\n",
1609 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
98e399f8 1610 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1da177e4 1611 int i;
98e399f8 1612 const unsigned char *p = skb_mac_header(skb);
1da177e4
LT
1613 printk(KERN_WARNING "ll header: ");
1614 for (i = 0; i < dev->hard_header_len; i++, p++) {
1615 printk("%02x", *p);
1616 if (i < (dev->hard_header_len - 1))
1617 printk(":");
1618 }
1619 printk("\n");
1620 }
1621 }
1622#endif
1623}
1624
e905a9ed
YH
1625static inline int __mkroute_input(struct sk_buff *skb,
1626 struct fib_result* res,
1627 struct in_device *in_dev,
9e12bb22 1628 __be32 daddr, __be32 saddr, u32 tos,
e905a9ed 1629 struct rtable **result)
1da177e4
LT
1630{
1631
1632 struct rtable *rth;
1633 int err;
1634 struct in_device *out_dev;
1635 unsigned flags = 0;
d9c9df8c
AV
1636 __be32 spec_dst;
1637 u32 itag;
1da177e4
LT
1638
1639 /* get a working reference to the output device */
1640 out_dev = in_dev_get(FIB_RES_DEV(*res));
1641 if (out_dev == NULL) {
1642 if (net_ratelimit())
1643 printk(KERN_CRIT "Bug in ip_route_input" \
1644 "_slow(). Please, report\n");
1645 return -EINVAL;
1646 }
1647
1648
e905a9ed 1649 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1da177e4
LT
1650 in_dev->dev, &spec_dst, &itag);
1651 if (err < 0) {
e905a9ed 1652 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1da177e4 1653 saddr);
e905a9ed 1654
1da177e4
LT
1655 err = -EINVAL;
1656 goto cleanup;
1657 }
1658
1659 if (err)
1660 flags |= RTCF_DIRECTSRC;
1661
1662 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1663 (IN_DEV_SHARED_MEDIA(out_dev) ||
1664 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1665 flags |= RTCF_DOREDIRECT;
1666
1667 if (skb->protocol != htons(ETH_P_IP)) {
1668 /* Not IP (i.e. ARP). Do not create route, if it is
1669 * invalid for proxy arp. DNAT routes are always valid.
1670 */
1671 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1672 err = -EINVAL;
1673 goto cleanup;
1674 }
1675 }
1676
1677
1678 rth = dst_alloc(&ipv4_dst_ops);
1679 if (!rth) {
1680 err = -ENOBUFS;
1681 goto cleanup;
1682 }
1683
ce723d8e 1684 atomic_set(&rth->u.dst.__refcnt, 1);
1da177e4 1685 rth->u.dst.flags= DST_HOST;
42f811b8 1686 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4 1687 rth->u.dst.flags |= DST_NOPOLICY;
42f811b8 1688 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1da177e4
LT
1689 rth->u.dst.flags |= DST_NOXFRM;
1690 rth->fl.fl4_dst = daddr;
1691 rth->rt_dst = daddr;
1692 rth->fl.fl4_tos = tos;
47dcf0cb 1693 rth->fl.mark = skb->mark;
1da177e4
LT
1694 rth->fl.fl4_src = saddr;
1695 rth->rt_src = saddr;
1696 rth->rt_gateway = daddr;
1697 rth->rt_iif =
1698 rth->fl.iif = in_dev->dev->ifindex;
1699 rth->u.dst.dev = (out_dev)->dev;
1700 dev_hold(rth->u.dst.dev);
1701 rth->idev = in_dev_get(rth->u.dst.dev);
1702 rth->fl.oif = 0;
1703 rth->rt_spec_dst= spec_dst;
1704
1705 rth->u.dst.input = ip_forward;
1706 rth->u.dst.output = ip_output;
1707
1708 rt_set_nexthop(rth, res, itag);
1709
1710 rth->rt_flags = flags;
1711
1712 *result = rth;
1713 err = 0;
1714 cleanup:
1715 /* release the working reference to the output device */
1716 in_dev_put(out_dev);
1717 return err;
e905a9ed 1718}
1da177e4 1719
e06e7c61
DM
1720static inline int ip_mkroute_input(struct sk_buff *skb,
1721 struct fib_result* res,
1722 const struct flowi *fl,
1723 struct in_device *in_dev,
1724 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1725{
7abaa27c 1726 struct rtable* rth = NULL;
1da177e4
LT
1727 int err;
1728 unsigned hash;
1729
1730#ifdef CONFIG_IP_ROUTE_MULTIPATH
1731 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1732 fib_select_multipath(fl, res);
1733#endif
1734
1735 /* create a routing cache entry */
1736 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1737 if (err)
1738 return err;
1da177e4
LT
1739
1740 /* put it into the cache */
8c7bc840 1741 hash = rt_hash(daddr, saddr, fl->iif);
e905a9ed 1742 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1da177e4
LT
1743}
1744
1da177e4
LT
1745/*
1746 * NOTE. We drop all the packets that has local source
1747 * addresses, because every properly looped back packet
1748 * must have correct destination already attached by output routine.
1749 *
1750 * Such approach solves two big problems:
1751 * 1. Not simplex devices are handled properly.
1752 * 2. IP spoofing attempts are filtered with 100% of guarantee.
1753 */
1754
9e12bb22 1755static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1756 u8 tos, struct net_device *dev)
1757{
1758 struct fib_result res;
1759 struct in_device *in_dev = in_dev_get(dev);
1760 struct flowi fl = { .nl_u = { .ip4_u =
1761 { .daddr = daddr,
1762 .saddr = saddr,
1763 .tos = tos,
1764 .scope = RT_SCOPE_UNIVERSE,
1da177e4 1765 } },
47dcf0cb 1766 .mark = skb->mark,
1da177e4
LT
1767 .iif = dev->ifindex };
1768 unsigned flags = 0;
1769 u32 itag = 0;
1770 struct rtable * rth;
1771 unsigned hash;
9e12bb22 1772 __be32 spec_dst;
1da177e4
LT
1773 int err = -EINVAL;
1774 int free_res = 0;
1775
1776 /* IP on this device is disabled. */
1777
1778 if (!in_dev)
1779 goto out;
1780
1781 /* Check for the most weird martians, which can be not detected
1782 by fib_lookup.
1783 */
1784
1785 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1786 goto martian_source;
1787
e448515c 1788 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1da177e4
LT
1789 goto brd_input;
1790
1791 /* Accept zero addresses only to limited broadcast;
1792 * I even do not know to fix it or not. Waiting for complains :-)
1793 */
1794 if (ZERONET(saddr))
1795 goto martian_source;
1796
1797 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1798 goto martian_destination;
1799
1800 /*
1801 * Now we are ready to route packet.
1802 */
1803 if ((err = fib_lookup(&fl, &res)) != 0) {
1804 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 1805 goto e_hostunreach;
1da177e4
LT
1806 goto no_route;
1807 }
1808 free_res = 1;
1809
1810 RT_CACHE_STAT_INC(in_slow_tot);
1811
1812 if (res.type == RTN_BROADCAST)
1813 goto brd_input;
1814
1815 if (res.type == RTN_LOCAL) {
1816 int result;
1817 result = fib_validate_source(saddr, daddr, tos,
1818 loopback_dev.ifindex,
1819 dev, &spec_dst, &itag);
1820 if (result < 0)
1821 goto martian_source;
1822 if (result)
1823 flags |= RTCF_DIRECTSRC;
1824 spec_dst = daddr;
1825 goto local_input;
1826 }
1827
1828 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 1829 goto e_hostunreach;
1da177e4
LT
1830 if (res.type != RTN_UNICAST)
1831 goto martian_destination;
1832
1833 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1834 if (err == -ENOBUFS)
1835 goto e_nobufs;
1836 if (err == -EINVAL)
1837 goto e_inval;
e905a9ed 1838
1da177e4
LT
1839done:
1840 in_dev_put(in_dev);
1841 if (free_res)
1842 fib_res_put(&res);
1843out: return err;
1844
1845brd_input:
1846 if (skb->protocol != htons(ETH_P_IP))
1847 goto e_inval;
1848
1849 if (ZERONET(saddr))
1850 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1851 else {
1852 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1853 &itag);
1854 if (err < 0)
1855 goto martian_source;
1856 if (err)
1857 flags |= RTCF_DIRECTSRC;
1858 }
1859 flags |= RTCF_BROADCAST;
1860 res.type = RTN_BROADCAST;
1861 RT_CACHE_STAT_INC(in_brd);
1862
1863local_input:
1864 rth = dst_alloc(&ipv4_dst_ops);
1865 if (!rth)
1866 goto e_nobufs;
1867
1868 rth->u.dst.output= ip_rt_bug;
1869
1870 atomic_set(&rth->u.dst.__refcnt, 1);
1871 rth->u.dst.flags= DST_HOST;
42f811b8 1872 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4
LT
1873 rth->u.dst.flags |= DST_NOPOLICY;
1874 rth->fl.fl4_dst = daddr;
1875 rth->rt_dst = daddr;
1876 rth->fl.fl4_tos = tos;
47dcf0cb 1877 rth->fl.mark = skb->mark;
1da177e4
LT
1878 rth->fl.fl4_src = saddr;
1879 rth->rt_src = saddr;
1880#ifdef CONFIG_NET_CLS_ROUTE
1881 rth->u.dst.tclassid = itag;
1882#endif
1883 rth->rt_iif =
1884 rth->fl.iif = dev->ifindex;
1885 rth->u.dst.dev = &loopback_dev;
1886 dev_hold(rth->u.dst.dev);
1887 rth->idev = in_dev_get(rth->u.dst.dev);
1888 rth->rt_gateway = daddr;
1889 rth->rt_spec_dst= spec_dst;
1890 rth->u.dst.input= ip_local_deliver;
1891 rth->rt_flags = flags|RTCF_LOCAL;
1892 if (res.type == RTN_UNREACHABLE) {
1893 rth->u.dst.input= ip_error;
1894 rth->u.dst.error= -err;
1895 rth->rt_flags &= ~RTCF_LOCAL;
1896 }
1897 rth->rt_type = res.type;
8c7bc840 1898 hash = rt_hash(daddr, saddr, fl.iif);
1da177e4
LT
1899 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1900 goto done;
1901
1902no_route:
1903 RT_CACHE_STAT_INC(in_no_route);
1904 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1905 res.type = RTN_UNREACHABLE;
1906 goto local_input;
1907
1908 /*
1909 * Do not cache martian addresses: they should be logged (RFC1812)
1910 */
1911martian_destination:
1912 RT_CACHE_STAT_INC(in_martian_dst);
1913#ifdef CONFIG_IP_ROUTE_VERBOSE
1914 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1915 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1916 "%u.%u.%u.%u, dev %s\n",
1917 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1918#endif
2c2910a4
DE
1919
1920e_hostunreach:
e905a9ed
YH
1921 err = -EHOSTUNREACH;
1922 goto done;
2c2910a4 1923
1da177e4
LT
1924e_inval:
1925 err = -EINVAL;
1926 goto done;
1927
1928e_nobufs:
1929 err = -ENOBUFS;
1930 goto done;
1931
1932martian_source:
1933 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1934 goto e_inval;
1935}
1936
9e12bb22 1937int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1938 u8 tos, struct net_device *dev)
1939{
1940 struct rtable * rth;
1941 unsigned hash;
1942 int iif = dev->ifindex;
1943
1944 tos &= IPTOS_RT_MASK;
8c7bc840 1945 hash = rt_hash(daddr, saddr, iif);
1946
1947 rcu_read_lock();
1948 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
093c2ca4 1949 rth = rcu_dereference(rth->u.dst.rt_next)) {
1950 if (rth->fl.fl4_dst == daddr &&
1951 rth->fl.fl4_src == saddr &&
1952 rth->fl.iif == iif &&
1953 rth->fl.oif == 0 &&
47dcf0cb 1954 rth->fl.mark == skb->mark &&
1955 rth->fl.fl4_tos == tos) {
1956 rth->u.dst.lastuse = jiffies;
1957 dst_hold(&rth->u.dst);
1958 rth->u.dst.__use++;
1959 RT_CACHE_STAT_INC(in_hit);
1960 rcu_read_unlock();
1961 skb->dst = (struct dst_entry*)rth;
1962 return 0;
1963 }
1964 RT_CACHE_STAT_INC(in_hlist_search);
1965 }
1966 rcu_read_unlock();
1967
 1968	/* Multicast recognition logic has been moved from the route cache
 1969	   to here.  The problem was that too many Ethernet cards have
 1970	   broken/missing hardware multicast filters :-( As a result, a host
 1971	   on a multicast network acquires a lot of useless route cache
 1972	   entries, e.g. from SDR announcements arriving from all over the world.
 1973	   We now try to get rid of them.  Provided the software IP multicast
 1974	   filter is organized reasonably (at least, hashed), this does not
 1975	   cause a slowdown compared with route cache reject entries.
 1976	   Note that multicast routers are not affected, because a
 1977	   route cache entry is created for them eventually.
 1978	 */
1979 if (MULTICAST(daddr)) {
1980 struct in_device *in_dev;
1981
1982 rcu_read_lock();
e5ed6399 1983 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
1da177e4 1984 int our = ip_check_mc(in_dev, daddr, saddr,
eddc9ec5 1985 ip_hdr(skb)->protocol);
1986 if (our
1987#ifdef CONFIG_IP_MROUTE
1988 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1989#endif
1990 ) {
1991 rcu_read_unlock();
1992 return ip_route_input_mc(skb, daddr, saddr,
1993 tos, dev, our);
1994 }
1995 }
1996 rcu_read_unlock();
1997 return -EINVAL;
1998 }
1999 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2000}
2001
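/*
 * Usage sketch (illustrative only; modelled on the typical receive-path
 * caller, not copied from a specific call site):
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
 *				 skb->dev);
 *	if (err)
 *		kfree_skb(skb);		-- martian source, no route, ...
 *
 * On a cache hit ip_route_input() fills skb->dst and returns 0; on a miss
 * it falls through to ip_route_input_slow(), except that multicast
 * destinations are screened here first (see the comment above) so that
 * useless entries are kept out of the cache.
 */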
2002static inline int __mkroute_output(struct rtable **result,
e905a9ed 2003 struct fib_result* res,
1da177e4 2004 const struct flowi *fl,
2005 const struct flowi *oldflp,
2006 struct net_device *dev_out,
2007 unsigned flags)
2008{
2009 struct rtable *rth;
2010 struct in_device *in_dev;
2011 u32 tos = RT_FL_TOS(oldflp);
2012 int err = 0;
2013
2014 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2015 return -EINVAL;
2016
e448515c 2017 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2018 res->type = RTN_BROADCAST;
2019 else if (MULTICAST(fl->fl4_dst))
2020 res->type = RTN_MULTICAST;
2021 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2022 return -EINVAL;
2023
2024 if (dev_out->flags & IFF_LOOPBACK)
2025 flags |= RTCF_LOCAL;
2026
2027 /* get work reference to inet device */
2028 in_dev = in_dev_get(dev_out);
2029 if (!in_dev)
2030 return -EINVAL;
2031
2032 if (res->type == RTN_BROADCAST) {
2033 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2034 if (res->fi) {
2035 fib_info_put(res->fi);
2036 res->fi = NULL;
2037 }
2038 } else if (res->type == RTN_MULTICAST) {
2039 flags |= RTCF_MULTICAST|RTCF_LOCAL;
e905a9ed 2040 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2041 oldflp->proto))
2042 flags &= ~RTCF_LOCAL;
 2043		/* If a multicast route does not exist, use
 2044		   the default one, but do not add a gateway in that case.
 2045		   Yes, it is a hack.
 2046		 */
2047 if (res->fi && res->prefixlen < 4) {
2048 fib_info_put(res->fi);
2049 res->fi = NULL;
2050 }
2051 }
2052
2053
2054 rth = dst_alloc(&ipv4_dst_ops);
2055 if (!rth) {
2056 err = -ENOBUFS;
2057 goto cleanup;
e905a9ed 2058 }
1da177e4 2059
ce723d8e 2060 atomic_set(&rth->u.dst.__refcnt, 1);
1da177e4 2061 rth->u.dst.flags= DST_HOST;
42f811b8 2062 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
1da177e4 2063 rth->u.dst.flags |= DST_NOXFRM;
42f811b8 2064 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2065 rth->u.dst.flags |= DST_NOPOLICY;
2066
2067 rth->fl.fl4_dst = oldflp->fl4_dst;
2068 rth->fl.fl4_tos = tos;
2069 rth->fl.fl4_src = oldflp->fl4_src;
2070 rth->fl.oif = oldflp->oif;
47dcf0cb 2071 rth->fl.mark = oldflp->mark;
2072 rth->rt_dst = fl->fl4_dst;
2073 rth->rt_src = fl->fl4_src;
2074 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
e905a9ed 2075	/* get references to the devices that are to be held by the routing
2076 cache entry */
2077 rth->u.dst.dev = dev_out;
2078 dev_hold(dev_out);
2079 rth->idev = in_dev_get(dev_out);
2080 rth->rt_gateway = fl->fl4_dst;
2081 rth->rt_spec_dst= fl->fl4_src;
2082
2083 rth->u.dst.output=ip_output;
2084
2085 RT_CACHE_STAT_INC(out_slow_tot);
2086
2087 if (flags & RTCF_LOCAL) {
2088 rth->u.dst.input = ip_local_deliver;
2089 rth->rt_spec_dst = fl->fl4_dst;
2090 }
2091 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2092 rth->rt_spec_dst = fl->fl4_src;
e905a9ed 2093 if (flags & RTCF_LOCAL &&
2094 !(dev_out->flags & IFF_LOOPBACK)) {
2095 rth->u.dst.output = ip_mc_output;
2096 RT_CACHE_STAT_INC(out_slow_mc);
2097 }
2098#ifdef CONFIG_IP_MROUTE
2099 if (res->type == RTN_MULTICAST) {
2100 if (IN_DEV_MFORWARD(in_dev) &&
2101 !LOCAL_MCAST(oldflp->fl4_dst)) {
2102 rth->u.dst.input = ip_mr_input;
2103 rth->u.dst.output = ip_mc_output;
2104 }
2105 }
2106#endif
2107 }
2108
2109 rt_set_nexthop(rth, res, 0);
2110
2111 rth->rt_flags = flags;
2112
2113 *result = rth;
2114 cleanup:
2115 /* release work reference to inet device */
2116 in_dev_put(in_dev);
2117
2118 return err;
2119}
2120
2121static inline int ip_mkroute_output(struct rtable **rp,
2122 struct fib_result* res,
2123 const struct flowi *fl,
2124 const struct flowi *oldflp,
2125 struct net_device *dev_out,
2126 unsigned flags)
1da177e4 2127{
7abaa27c 2128 struct rtable *rth = NULL;
2129 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2130 unsigned hash;
2131 if (err == 0) {
8c7bc840 2132 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2133 err = rt_intern_hash(hash, rth, rp);
2134 }
 2135
2136 return err;
2137}
2138
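/*
 * Summary of the two helpers above: __mkroute_output() classifies the
 * destination (broadcast/multicast/unicast), allocates the rtable with
 * dst_alloc(), copies the flow keys, takes references on dev_out and its
 * in_device, and wires up the handlers (ip_output by default, with
 * ip_mc_output and/or ip_local_deliver for multicast and local deliveries)
 * before filling the next hop via rt_set_nexthop().  ip_mkroute_output() is
 * the thin wrapper that interns the finished entry into rt_hash_table under
 * rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif).
 */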
2139/*
2140 * Major route resolver routine.
2141 */
2142
2143static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2144{
2145 u32 tos = RT_FL_TOS(oldflp);
2146 struct flowi fl = { .nl_u = { .ip4_u =
2147 { .daddr = oldflp->fl4_dst,
2148 .saddr = oldflp->fl4_src,
2149 .tos = tos & IPTOS_RT_MASK,
2150 .scope = ((tos & RTO_ONLINK) ?
2151 RT_SCOPE_LINK :
2152 RT_SCOPE_UNIVERSE),
1da177e4 2153 } },
47dcf0cb 2154 .mark = oldflp->mark,
2155 .iif = loopback_dev.ifindex,
2156 .oif = oldflp->oif };
2157 struct fib_result res;
2158 unsigned flags = 0;
2159 struct net_device *dev_out = NULL;
2160 int free_res = 0;
2161 int err;
2162
2163
2164 res.fi = NULL;
2165#ifdef CONFIG_IP_MULTIPLE_TABLES
2166 res.r = NULL;
2167#endif
2168
2169 if (oldflp->fl4_src) {
2170 err = -EINVAL;
2171 if (MULTICAST(oldflp->fl4_src) ||
2172 BADCLASS(oldflp->fl4_src) ||
2173 ZERONET(oldflp->fl4_src))
2174 goto out;
2175
2176 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2177 dev_out = ip_dev_find(oldflp->fl4_src);
f6c5d736 2178 if (dev_out == NULL)
2179 goto out;
2180
2181 /* I removed check for oif == dev_out->oif here.
2182 It was wrong for two reasons:
2183 1. ip_dev_find(saddr) can return wrong iface, if saddr is
2184 assigned to multiple interfaces.
2185 2. Moreover, we are allowed to send packets with saddr
2186 of another iface. --ANK
2187 */
2188
f6c5d736 2189 if (oldflp->oif == 0
e448515c 2190 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
 2191			/* Special hack: the user can direct multicasts
 2192			   and limited broadcasts via the desired interface
 2193			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
 2194			   This hack is not just for fun; it allows
 2195			   vic, vat and friends to work.
 2196			   They bind the socket to loopback, set ttl to zero
 2197			   and expect that it will work.
 2198			   From the viewpoint of the routing cache they are broken,
 2199			   because we are not allowed to build a multicast path
 2200			   with a loopback source address (the routing cache
 2201			   cannot know that ttl is zero, in which case the packet
 2202			   will never leave this host and the route is in fact valid).
 2203			   Luckily, this hack is a good workaround.
 2204			 */
2205
2206 fl.oif = dev_out->ifindex;
2207 goto make_route;
2208 }
2209 if (dev_out)
2210 dev_put(dev_out);
2211 dev_out = NULL;
2212 }
2213
2214
2215 if (oldflp->oif) {
881d966b 2216 dev_out = dev_get_by_index(&init_net, oldflp->oif);
2217 err = -ENODEV;
2218 if (dev_out == NULL)
2219 goto out;
2220
2221 /* RACE: Check return value of inet_select_addr instead. */
2222 if (__in_dev_get_rtnl(dev_out) == NULL) {
2223 dev_put(dev_out);
2224 goto out; /* Wrong error code */
2225 }
2226
e448515c 2227 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2228 if (!fl.fl4_src)
2229 fl.fl4_src = inet_select_addr(dev_out, 0,
2230 RT_SCOPE_LINK);
2231 goto make_route;
2232 }
2233 if (!fl.fl4_src) {
2234 if (MULTICAST(oldflp->fl4_dst))
2235 fl.fl4_src = inet_select_addr(dev_out, 0,
2236 fl.fl4_scope);
2237 else if (!oldflp->fl4_dst)
2238 fl.fl4_src = inet_select_addr(dev_out, 0,
2239 RT_SCOPE_HOST);
2240 }
2241 }
2242
2243 if (!fl.fl4_dst) {
2244 fl.fl4_dst = fl.fl4_src;
2245 if (!fl.fl4_dst)
2246 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2247 if (dev_out)
2248 dev_put(dev_out);
2249 dev_out = &loopback_dev;
2250 dev_hold(dev_out);
2251 fl.oif = loopback_dev.ifindex;
2252 res.type = RTN_LOCAL;
2253 flags |= RTCF_LOCAL;
2254 goto make_route;
2255 }
2256
2257 if (fib_lookup(&fl, &res)) {
2258 res.fi = NULL;
2259 if (oldflp->oif) {
 2260		/* Apparently, the routing tables are wrong. Assume
 2261		   that the destination is on-link.
 2262	
 2263		   WHY? DW.
 2264		   Because we are allowed to send to an iface
 2265		   even if it has NO routes and NO assigned
 2266		   addresses. When oif is specified, the routing
 2267		   tables are looked up with only one purpose:
 2268		   to catch whether the destination is gatewayed rather
 2269		   than direct. Moreover, if MSG_DONTROUTE is set,
 2270		   we send the packet, ignoring both the routing tables
 2271		   and the ifaddr state. --ANK
 2272	
 2273	
 2274		   We could do the same even if oif is unknown,
 2275		   likely as IPv6 does, but we do not.
 2276		 */
2277
2278 if (fl.fl4_src == 0)
2279 fl.fl4_src = inet_select_addr(dev_out, 0,
2280 RT_SCOPE_LINK);
2281 res.type = RTN_UNICAST;
2282 goto make_route;
2283 }
2284 if (dev_out)
2285 dev_put(dev_out);
2286 err = -ENETUNREACH;
2287 goto out;
2288 }
2289 free_res = 1;
2290
2291 if (res.type == RTN_LOCAL) {
2292 if (!fl.fl4_src)
2293 fl.fl4_src = fl.fl4_dst;
2294 if (dev_out)
2295 dev_put(dev_out);
2296 dev_out = &loopback_dev;
2297 dev_hold(dev_out);
2298 fl.oif = dev_out->ifindex;
2299 if (res.fi)
2300 fib_info_put(res.fi);
2301 res.fi = NULL;
2302 flags |= RTCF_LOCAL;
2303 goto make_route;
2304 }
2305
2306#ifdef CONFIG_IP_ROUTE_MULTIPATH
2307 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2308 fib_select_multipath(&fl, &res);
2309 else
2310#endif
2311 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2312 fib_select_default(&fl, &res);
2313
2314 if (!fl.fl4_src)
2315 fl.fl4_src = FIB_RES_PREFSRC(res);
2316
2317 if (dev_out)
2318 dev_put(dev_out);
2319 dev_out = FIB_RES_DEV(res);
2320 dev_hold(dev_out);
2321 fl.oif = dev_out->ifindex;
2322
2323
2324make_route:
2325 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2326
2327
2328 if (free_res)
2329 fib_res_put(&res);
2330 if (dev_out)
2331 dev_put(dev_out);
2332out: return err;
2333}
2334
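/*
 * Outline of ip_route_output_slow() above: validate an explicitly given
 * source address, honour an explicit output interface, fall back to a
 * loopback/local route when no destination was supplied, then consult
 * fib_lookup().  A failed lookup with an oif is treated as "destination is
 * on-link"; otherwise multipath/default-route selection runs, the preferred
 * source address is chosen, and make_route builds and hashes the entry via
 * ip_mkroute_output().
 */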
2335int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2336{
2337 unsigned hash;
2338 struct rtable *rth;
2339
8c7bc840 2340 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2341
2342 rcu_read_lock_bh();
2343 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
093c2ca4 2344 rth = rcu_dereference(rth->u.dst.rt_next)) {
2345 if (rth->fl.fl4_dst == flp->fl4_dst &&
2346 rth->fl.fl4_src == flp->fl4_src &&
2347 rth->fl.iif == 0 &&
2348 rth->fl.oif == flp->oif &&
47dcf0cb 2349 rth->fl.mark == flp->mark &&
2350 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2351 (IPTOS_RT_MASK | RTO_ONLINK))) {
2352 rth->u.dst.lastuse = jiffies;
2353 dst_hold(&rth->u.dst);
2354 rth->u.dst.__use++;
2355 RT_CACHE_STAT_INC(out_hit);
2356 rcu_read_unlock_bh();
2357 *rp = rth;
2358 return 0;
2359 }
2360 RT_CACHE_STAT_INC(out_hlist_search);
2361 }
2362 rcu_read_unlock_bh();
2363
2364 return ip_route_output_slow(rp, flp);
2365}
2366
2367EXPORT_SYMBOL_GPL(__ip_route_output_key);
2368
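/*
 * Note on the fast path above: an output-route cache hit requires matching
 * fl4_dst, fl4_src, oif and mark, iif == 0, and a TOS that is equal once
 * masked with (IPTOS_RT_MASK | RTO_ONLINK).  The hash chain is walked under
 * rcu_read_lock_bh(); only a miss takes the slow path through
 * ip_route_output_slow().
 */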
2369static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2370{
2371}
2372
2373static struct dst_ops ipv4_dst_blackhole_ops = {
2374 .family = AF_INET,
2375 .protocol = __constant_htons(ETH_P_IP),
2376 .destroy = ipv4_dst_destroy,
2377 .check = ipv4_dst_check,
2378 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2379 .entry_size = sizeof(struct rtable),
2380};
2381
2382
2383static int ipv4_blackhole_output(struct sk_buff *skb)
2384{
2385 kfree_skb(skb);
2386 return 0;
2387}
2388
2389static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2390{
2391 struct rtable *ort = *rp;
2392 struct rtable *rt = (struct rtable *)
2393 dst_alloc(&ipv4_dst_blackhole_ops);
2394
2395 if (rt) {
2396 struct dst_entry *new = &rt->u.dst;
2397
2398 atomic_set(&new->__refcnt, 1);
2399 new->__use = 1;
2400 new->input = ipv4_blackhole_output;
2401 new->output = ipv4_blackhole_output;
2402 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2403
2404 new->dev = ort->u.dst.dev;
2405 if (new->dev)
2406 dev_hold(new->dev);
2407
2408 rt->fl = ort->fl;
2409
2410 rt->idev = ort->idev;
2411 if (rt->idev)
2412 in_dev_hold(rt->idev);
2413 rt->rt_flags = ort->rt_flags;
2414 rt->rt_type = ort->rt_type;
2415 rt->rt_dst = ort->rt_dst;
2416 rt->rt_src = ort->rt_src;
2417 rt->rt_iif = ort->rt_iif;
2418 rt->rt_gateway = ort->rt_gateway;
2419 rt->rt_spec_dst = ort->rt_spec_dst;
2420 rt->peer = ort->peer;
2421 if (rt->peer)
2422 atomic_inc(&rt->peer->refcnt);
2423
2424 dst_free(new);
2425 }
2426
2427 dst_release(&(*rp)->u.dst);
2428 *rp = rt;
2429 return (rt ? 0 : -ENOMEM);
2430}
2431
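/*
 * The blackhole dst built above copies the original route's addresses and
 * metrics but points both dst.input and dst.output at
 * ipv4_blackhole_output(), which simply frees the skb.  It is used from
 * ip_route_output_flow() below when __xfrm_lookup() returns -EREMOTE,
 * i.e. (broadly, as a hedged reading of that path) while the transformation
 * state for the flow is not yet resolved, so packets for that flow are
 * silently discarded rather than failing the route lookup.
 */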
2432int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2433{
2434 int err;
2435
2436 if ((err = __ip_route_output_key(rp, flp)) != 0)
2437 return err;
2438
2439 if (flp->proto) {
2440 if (!flp->fl4_src)
2441 flp->fl4_src = (*rp)->rt_src;
2442 if (!flp->fl4_dst)
2443 flp->fl4_dst = (*rp)->rt_dst;
2444 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2445 if (err == -EREMOTE)
2446 err = ipv4_dst_blackhole(rp, flp, sk);
2447
2448 return err;
2449 }
2450
2451 return 0;
2452}
2453
2454EXPORT_SYMBOL_GPL(ip_route_output_flow);
2455
2456int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2457{
2458 return ip_route_output_flow(rp, flp, NULL, 0);
2459}
2460
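/*
 * Usage sketch for the output lookup (illustrative only; dst_ip and tos
 * stand in for caller-supplied values, and error handling is simplified):
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst_ip,
 *						 .tos = RT_TOS(tos) } },
 *			    .oif = 0 };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(&rt, &fl))
 *		return -EHOSTUNREACH;
 *	...
 *	ip_rt_put(rt);			-- drop the reference the lookup took
 */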
2461static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
b6544c0b 2462 int nowait, unsigned int flags)
2463{
2464 struct rtable *rt = (struct rtable*)skb->dst;
2465 struct rtmsg *r;
be403ea1 2466 struct nlmsghdr *nlh;
2467 long expires;
2468 u32 id = 0, ts = 0, tsage = 0, error;
2469
2470 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2471 if (nlh == NULL)
26932566 2472 return -EMSGSIZE;
2473
2474 r = nlmsg_data(nlh);
2475 r->rtm_family = AF_INET;
2476 r->rtm_dst_len = 32;
2477 r->rtm_src_len = 0;
2478 r->rtm_tos = rt->fl.fl4_tos;
2479 r->rtm_table = RT_TABLE_MAIN;
be403ea1 2480 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2481 r->rtm_type = rt->rt_type;
2482 r->rtm_scope = RT_SCOPE_UNIVERSE;
2483 r->rtm_protocol = RTPROT_UNSPEC;
2484 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2485 if (rt->rt_flags & RTCF_NOTIFY)
2486 r->rtm_flags |= RTM_F_NOTIFY;
be403ea1 2487
17fb2c64 2488 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
be403ea1 2489
2490 if (rt->fl.fl4_src) {
2491 r->rtm_src_len = 32;
17fb2c64 2492 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2493 }
2494 if (rt->u.dst.dev)
be403ea1 2495 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2496#ifdef CONFIG_NET_CLS_ROUTE
2497 if (rt->u.dst.tclassid)
be403ea1 2498 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2499#endif
2500 if (rt->fl.iif)
17fb2c64 2501 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
1da177e4 2502 else if (rt->rt_src != rt->fl.fl4_src)
17fb2c64 2503 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
be403ea1 2504
1da177e4 2505 if (rt->rt_dst != rt->rt_gateway)
17fb2c64 2506 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
be403ea1 2507
1da177e4 2508 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2509 goto nla_put_failure;
2510
2511 error = rt->u.dst.error;
2512 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
1da177e4 2513 if (rt->peer) {
e3703b3d 2514 id = rt->peer->ip_id_count;
1da177e4 2515 if (rt->peer->tcp_ts_stamp) {
e3703b3d 2516 ts = rt->peer->tcp_ts;
9d729f72 2517 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2518 }
2519 }
be403ea1 2520
2521 if (rt->fl.iif) {
2522#ifdef CONFIG_IP_MROUTE
e448515c 2523 __be32 dst = rt->rt_dst;
2524
2525 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
42f811b8 2526 IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2527 int err = ipmr_get_route(skb, r, nowait);
2528 if (err <= 0) {
2529 if (!nowait) {
2530 if (err == 0)
2531 return 0;
be403ea1 2532 goto nla_put_failure;
2533 } else {
2534 if (err == -EMSGSIZE)
be403ea1 2535 goto nla_put_failure;
e3703b3d 2536 error = err;
2537 }
2538 }
2539 } else
2540#endif
be403ea1 2541 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2542 }
2543
2544 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2545 expires, error) < 0)
2546 goto nla_put_failure;
2547
2548 return nlmsg_end(skb, nlh);
1da177e4 2549
be403ea1 2550nla_put_failure:
2551 nlmsg_cancel(skb, nlh);
2552 return -EMSGSIZE;
2553}
2554
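/*
 * rt_fill_info() above serialises one cached route as an RTM_NEWROUTE
 * message: the rtmsg header, RTA_TABLE and RTA_DST, then the optional
 * RTA_SRC, RTA_OIF, RTA_FLOW, RTA_PREFSRC and RTA_GATEWAY attributes,
 * the metrics, RTA_IIF (or an ipmr_get_route() lookup for forwarded
 * multicast), and finally the cache info (id, timestamp, expiry, error)
 * via rtnl_put_cacheinfo().
 */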
63f3444f 2555static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1da177e4 2556{
2557 struct rtmsg *rtm;
2558 struct nlattr *tb[RTA_MAX+1];
1da177e4 2559 struct rtable *rt = NULL;
2560 __be32 dst = 0;
2561 __be32 src = 0;
2562 u32 iif;
d889ce3b 2563 int err;
2564 struct sk_buff *skb;
2565
2566 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2567 if (err < 0)
2568 goto errout;
2569
2570 rtm = nlmsg_data(nlh);
2571
1da177e4 2572 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2573 if (skb == NULL) {
2574 err = -ENOBUFS;
2575 goto errout;
2576 }
2577
 2578	/* Reserve room for dummy headers; this skb can pass
 2579	   through a good chunk of the routing engine.
2580 */
459a98ed 2581 skb_reset_mac_header(skb);
c1d2bbe1 2582 skb_reset_network_header(skb);
2583
2584 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
eddc9ec5 2585 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2586 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2587
2588 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2589 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
d889ce3b 2590 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2591
2592 if (iif) {
2593 struct net_device *dev;
2594
881d966b 2595 dev = __dev_get_by_index(&init_net, iif);
2596 if (dev == NULL) {
2597 err = -ENODEV;
2598 goto errout_free;
2599 }
2600
2601 skb->protocol = htons(ETH_P_IP);
2602 skb->dev = dev;
2603 local_bh_disable();
2604 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2605 local_bh_enable();
2606
2607 rt = (struct rtable*) skb->dst;
2608 if (err == 0 && rt->u.dst.error)
2609 err = -rt->u.dst.error;
2610 } else {
2611 struct flowi fl = {
2612 .nl_u = {
2613 .ip4_u = {
2614 .daddr = dst,
2615 .saddr = src,
2616 .tos = rtm->rtm_tos,
2617 },
2618 },
2619 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2620 };
2621 err = ip_route_output_key(&rt, &fl);
2622 }
d889ce3b 2623
1da177e4 2624 if (err)
d889ce3b 2625 goto errout_free;
2626
2627 skb->dst = &rt->u.dst;
2628 if (rtm->rtm_flags & RTM_F_NOTIFY)
2629 rt->rt_flags |= RTCF_NOTIFY;
2630
1da177e4 2631 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
b6544c0b 2632 RTM_NEWROUTE, 0, 0);
2633 if (err <= 0)
2634 goto errout_free;
1da177e4 2635
2942e900 2636 err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
d889ce3b 2637errout:
2942e900 2638 return err;
1da177e4 2639
d889ce3b 2640errout_free:
1da177e4 2641 kfree_skb(skb);
d889ce3b 2642 goto errout;
2643}
2644
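/*
 * inet_rtm_getroute() above is the RTM_GETROUTE handler (what e.g.
 * iproute2's "ip route get" talks to).  It parses RTA_SRC/RTA_DST/RTA_IIF/
 * RTA_OIF, then either simulates an input lookup with ip_route_input() on a
 * dummy skb when an input interface was given, or performs an output lookup
 * with ip_route_output_key(), and unicasts a single rt_fill_info() reply
 * back to the requesting socket.
 */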
2645int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2646{
2647 struct rtable *rt;
2648 int h, s_h;
2649 int idx, s_idx;
2650
2651 s_h = cb->args[0];
2652 s_idx = idx = cb->args[1];
2653 for (h = 0; h <= rt_hash_mask; h++) {
2654 if (h < s_h) continue;
2655 if (h > s_h)
2656 s_idx = 0;
2657 rcu_read_lock_bh();
2658 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
093c2ca4 2659 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2660 if (idx < s_idx)
2661 continue;
2662 skb->dst = dst_clone(&rt->u.dst);
2663 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
e905a9ed 2664 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
b6544c0b 2665 1, NLM_F_MULTI) <= 0) {
2666 dst_release(xchg(&skb->dst, NULL));
2667 rcu_read_unlock_bh();
2668 goto done;
2669 }
2670 dst_release(xchg(&skb->dst, NULL));
2671 }
2672 rcu_read_unlock_bh();
2673 }
2674
2675done:
2676 cb->args[0] = h;
2677 cb->args[1] = idx;
2678 return skb->len;
2679}
2680
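/*
 * ip_rt_dump() above walks every rt_hash_table chain under
 * rcu_read_lock_bh() and emits one RTM_NEWROUTE per cached entry;
 * cb->args[0] and cb->args[1] remember the hash bucket and chain index so
 * an interrupted dump resumes where it stopped.
 */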
2681void ip_rt_multicast_event(struct in_device *in_dev)
2682{
2683 rt_cache_flush(0);
2684}
2685
2686#ifdef CONFIG_SYSCTL
2687static int flush_delay;
2688
2689static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2690 struct file *filp, void __user *buffer,
2691 size_t *lenp, loff_t *ppos)
2692{
2693 if (write) {
2694 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2695 rt_cache_flush(flush_delay);
2696 return 0;
e905a9ed 2697 }
2698
2699 return -EINVAL;
2700}
2701
2702static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2703 int __user *name,
2704 int nlen,
2705 void __user *oldval,
2706 size_t __user *oldlenp,
2707 void __user *newval,
1f29bcd7 2708 size_t newlen)
2709{
2710 int delay;
2711 if (newlen != sizeof(int))
2712 return -EINVAL;
2713 if (get_user(delay, (int __user *)newval))
2714 return -EFAULT;
2715 rt_cache_flush(delay);
2716 return 0;
2717}
2718
2719ctl_table ipv4_route_table[] = {
e905a9ed 2720 {
2721 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2722 .procname = "flush",
2723 .data = &flush_delay,
2724 .maxlen = sizeof(int),
7e3e0360 2725 .mode = 0200,
2726 .proc_handler = &ipv4_sysctl_rtcache_flush,
2727 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2728 },
2729 {
2730 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2731 .procname = "min_delay",
2732 .data = &ip_rt_min_delay,
2733 .maxlen = sizeof(int),
2734 .mode = 0644,
2735 .proc_handler = &proc_dointvec_jiffies,
2736 .strategy = &sysctl_jiffies,
2737 },
2738 {
2739 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2740 .procname = "max_delay",
2741 .data = &ip_rt_max_delay,
2742 .maxlen = sizeof(int),
2743 .mode = 0644,
2744 .proc_handler = &proc_dointvec_jiffies,
2745 .strategy = &sysctl_jiffies,
2746 },
2747 {
2748 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2749 .procname = "gc_thresh",
2750 .data = &ipv4_dst_ops.gc_thresh,
2751 .maxlen = sizeof(int),
2752 .mode = 0644,
2753 .proc_handler = &proc_dointvec,
2754 },
2755 {
2756 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2757 .procname = "max_size",
2758 .data = &ip_rt_max_size,
2759 .maxlen = sizeof(int),
2760 .mode = 0644,
2761 .proc_handler = &proc_dointvec,
2762 },
2763 {
2764 /* Deprecated. Use gc_min_interval_ms */
e905a9ed 2765
2766 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2767 .procname = "gc_min_interval",
2768 .data = &ip_rt_gc_min_interval,
2769 .maxlen = sizeof(int),
2770 .mode = 0644,
2771 .proc_handler = &proc_dointvec_jiffies,
2772 .strategy = &sysctl_jiffies,
2773 },
2774 {
2775 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2776 .procname = "gc_min_interval_ms",
2777 .data = &ip_rt_gc_min_interval,
2778 .maxlen = sizeof(int),
2779 .mode = 0644,
2780 .proc_handler = &proc_dointvec_ms_jiffies,
2781 .strategy = &sysctl_ms_jiffies,
2782 },
2783 {
2784 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2785 .procname = "gc_timeout",
2786 .data = &ip_rt_gc_timeout,
2787 .maxlen = sizeof(int),
2788 .mode = 0644,
2789 .proc_handler = &proc_dointvec_jiffies,
2790 .strategy = &sysctl_jiffies,
2791 },
2792 {
2793 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2794 .procname = "gc_interval",
2795 .data = &ip_rt_gc_interval,
2796 .maxlen = sizeof(int),
2797 .mode = 0644,
2798 .proc_handler = &proc_dointvec_jiffies,
2799 .strategy = &sysctl_jiffies,
2800 },
2801 {
2802 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2803 .procname = "redirect_load",
2804 .data = &ip_rt_redirect_load,
2805 .maxlen = sizeof(int),
2806 .mode = 0644,
2807 .proc_handler = &proc_dointvec,
2808 },
2809 {
2810 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2811 .procname = "redirect_number",
2812 .data = &ip_rt_redirect_number,
2813 .maxlen = sizeof(int),
2814 .mode = 0644,
2815 .proc_handler = &proc_dointvec,
2816 },
2817 {
2818 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2819 .procname = "redirect_silence",
2820 .data = &ip_rt_redirect_silence,
2821 .maxlen = sizeof(int),
2822 .mode = 0644,
2823 .proc_handler = &proc_dointvec,
2824 },
2825 {
2826 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2827 .procname = "error_cost",
2828 .data = &ip_rt_error_cost,
2829 .maxlen = sizeof(int),
2830 .mode = 0644,
2831 .proc_handler = &proc_dointvec,
2832 },
2833 {
2834 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2835 .procname = "error_burst",
2836 .data = &ip_rt_error_burst,
2837 .maxlen = sizeof(int),
2838 .mode = 0644,
2839 .proc_handler = &proc_dointvec,
2840 },
2841 {
2842 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2843 .procname = "gc_elasticity",
2844 .data = &ip_rt_gc_elasticity,
2845 .maxlen = sizeof(int),
2846 .mode = 0644,
2847 .proc_handler = &proc_dointvec,
2848 },
2849 {
2850 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2851 .procname = "mtu_expires",
2852 .data = &ip_rt_mtu_expires,
2853 .maxlen = sizeof(int),
2854 .mode = 0644,
2855 .proc_handler = &proc_dointvec_jiffies,
2856 .strategy = &sysctl_jiffies,
2857 },
2858 {
2859 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2860 .procname = "min_pmtu",
2861 .data = &ip_rt_min_pmtu,
2862 .maxlen = sizeof(int),
2863 .mode = 0644,
2864 .proc_handler = &proc_dointvec,
2865 },
2866 {
2867 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2868 .procname = "min_adv_mss",
2869 .data = &ip_rt_min_advmss,
2870 .maxlen = sizeof(int),
2871 .mode = 0644,
2872 .proc_handler = &proc_dointvec,
2873 },
2874 {
2875 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
2876 .procname = "secret_interval",
2877 .data = &ip_rt_secret_interval,
2878 .maxlen = sizeof(int),
2879 .mode = 0644,
2880 .proc_handler = &proc_dointvec_jiffies,
2881 .strategy = &sysctl_jiffies,
2882 },
2883 { .ctl_name = 0 }
2884};
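/*
 * These knobs are exposed under /proc/sys/net/ipv4/route/.  For example
 * (illustrative; the written value is the flush delay handed to
 * rt_cache_flush()):
 *
 *	# echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * forces an immediate flush of the routing cache.  Note that "flush" is
 * write-only (mode 0200), unlike the other entries.
 */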
2885#endif
2886
2887#ifdef CONFIG_NET_CLS_ROUTE
2888struct ip_rt_acct *ip_rt_acct;
2889
2890/* This code sucks. But you should have seen it before! --RR */
2891
2892/* IP route accounting ptr for this logical cpu number. */
2893#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
2894
2895#ifdef CONFIG_PROC_FS
2896static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2897 int length, int *eof, void *data)
2898{
2899 unsigned int i;
2900
2901 if ((offset & 3) || (length & 3))
2902 return -EIO;
2903
2904 if (offset >= sizeof(struct ip_rt_acct) * 256) {
2905 *eof = 1;
2906 return 0;
2907 }
2908
2909 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2910 length = sizeof(struct ip_rt_acct) * 256 - offset;
2911 *eof = 1;
2912 }
2913
2914 offset /= sizeof(u32);
2915
2916 if (length > 0) {
2917 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
2918 u32 *dst = (u32 *) buffer;
2919
2920 /* Copy first cpu. */
2921 *start = buffer;
2922 memcpy(dst, src, length);
2923
2924 /* Add the other cpus in, one int at a time */
6f912042 2925 for_each_possible_cpu(i) {
2926 unsigned int j;
2927
2928 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2929
2930 for (j = 0; j < length/4; j++)
2931 dst[j] += src[j];
2932 }
2933 }
2934 return length;
2935}
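/*
 * The read handler above backs /proc/net/rt_acct (registered in
 * ip_rt_init() below): it copies CPU 0's counters into the buffer and then
 * adds every other possible CPU's counters in word by word, so the reader
 * sees system-wide totals.  Both offset and length must be 4-byte aligned.
 */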
2936#endif /* CONFIG_PROC_FS */
2937#endif /* CONFIG_NET_CLS_ROUTE */
2938
2939static __initdata unsigned long rhash_entries;
2940static int __init set_rhash_entries(char *str)
2941{
2942 if (!str)
2943 return 0;
2944 rhash_entries = simple_strtoul(str, &str, 0);
2945 return 1;
2946}
2947__setup("rhash_entries=", set_rhash_entries);
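/*
 * Boot-time override (illustrative value): passing e.g. "rhash_entries=65536"
 * on the kernel command line feeds set_rhash_entries() above and replaces
 * the automatic sizing that alloc_large_system_hash() would otherwise derive
 * from available memory in ip_rt_init() below.
 */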
2948
2949int __init ip_rt_init(void)
2950{
424c4b70 2951 int rc = 0;
2952
2953 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2954 (jiffies ^ (jiffies >> 7)));
2955
2956#ifdef CONFIG_NET_CLS_ROUTE
2957 {
2958 int order;
2959 for (order = 0;
2960 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2961 /* NOTHING */;
2962 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2963 if (!ip_rt_acct)
2964 panic("IP: failed to allocate ip_rt_acct\n");
2965 memset(ip_rt_acct, 0, PAGE_SIZE << order);
424c4b70 2966 }
2967#endif
2968
2969 ipv4_dst_ops.kmem_cachep =
2970 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 2971 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 2972
2973 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2974
2975 rt_hash_table = (struct rt_hash_bucket *)
2976 alloc_large_system_hash("IP route cache",
2977 sizeof(struct rt_hash_bucket),
2978 rhash_entries,
2979 (num_physpages >= 128 * 1024) ?
18955cfc 2980 15 : 17,
8d1502de 2981 0,
2982 &rt_hash_log,
2983 &rt_hash_mask,
2984 0);
2985 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
2986 rt_hash_lock_init();
2987
2988 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2989 ip_rt_max_size = (rt_hash_mask + 1) * 16;
2990
2991 devinet_init();
2992 ip_fib_init();
2993
2994 init_timer(&rt_flush_timer);
2995 rt_flush_timer.function = rt_run_flush;
2996 init_timer(&rt_periodic_timer);
2997 rt_periodic_timer.function = rt_check_expire;
2998 init_timer(&rt_secret_timer);
2999 rt_secret_timer.function = rt_secret_rebuild;
3000
 3001	/* All the timers, started at system startup, tend
 3002	   to synchronize. Perturb them a bit.
3003 */
3004 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3005 ip_rt_gc_interval;
3006 add_timer(&rt_periodic_timer);
3007
3008 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3009 ip_rt_secret_interval;
3010 add_timer(&rt_secret_timer);
3011
3012#ifdef CONFIG_PROC_FS
3013 {
3014 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
457c4cbc 3015 if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
e905a9ed 3016 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
457c4cbc 3017 init_net.proc_net_stat))) {
3018 return -ENOMEM;
3019 }
3020 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3021 }
3022#ifdef CONFIG_NET_CLS_ROUTE
457c4cbc 3023 create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
3024#endif
3025#endif
3026#ifdef CONFIG_XFRM
3027 xfrm_init();
3028 xfrm4_init();
3029#endif
3030 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3031
3032 return rc;
3033}
3034
3035EXPORT_SYMBOL(__ip_select_ident);
3036EXPORT_SYMBOL(ip_route_input);
3037EXPORT_SYMBOL(ip_route_output_key);