1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9 *
02c30a84 10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 *
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
e905a9ed 23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
e905a9ed 41 *
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
58 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
60 *
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
65 */
66
67#include <linux/module.h>
68#include <asm/uaccess.h>
69#include <asm/system.h>
70#include <linux/bitops.h>
71#include <linux/types.h>
72#include <linux/kernel.h>
1da177e4 73#include <linux/mm.h>
424c4b70 74#include <linux/bootmem.h>
75#include <linux/string.h>
76#include <linux/socket.h>
77#include <linux/sockios.h>
78#include <linux/errno.h>
79#include <linux/in.h>
80#include <linux/inet.h>
81#include <linux/netdevice.h>
82#include <linux/proc_fs.h>
83#include <linux/init.h>
39c90ece 84#include <linux/workqueue.h>
1da177e4 85#include <linux/skbuff.h>
86#include <linux/inetdevice.h>
87#include <linux/igmp.h>
88#include <linux/pkt_sched.h>
89#include <linux/mroute.h>
90#include <linux/netfilter_ipv4.h>
91#include <linux/random.h>
92#include <linux/jhash.h>
93#include <linux/rcupdate.h>
94#include <linux/times.h>
352e512c 95#include <net/dst.h>
457c4cbc 96#include <net/net_namespace.h>
97#include <net/protocol.h>
98#include <net/ip.h>
99#include <net/route.h>
100#include <net/inetpeer.h>
101#include <net/sock.h>
102#include <net/ip_fib.h>
103#include <net/arp.h>
104#include <net/tcp.h>
105#include <net/icmp.h>
106#include <net/xfrm.h>
8d71740c 107#include <net/netevent.h>
63f3444f 108#include <net/rtnetlink.h>
109#ifdef CONFIG_SYSCTL
110#include <linux/sysctl.h>
111#endif
112
113#define RT_FL_TOS(oldflp) \
114 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116#define IP_MAX_MTU 0xFFF0
117
118#define RT_GC_TIMEOUT (300*HZ)
119
120static int ip_rt_min_delay = 2 * HZ;
121static int ip_rt_max_delay = 10 * HZ;
122static int ip_rt_max_size;
123static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
124static int ip_rt_gc_interval = 60 * HZ;
125static int ip_rt_gc_min_interval = HZ / 2;
126static int ip_rt_redirect_number = 9;
127static int ip_rt_redirect_load = HZ / 50;
128static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
129static int ip_rt_error_cost = HZ;
130static int ip_rt_error_burst = 5 * HZ;
131static int ip_rt_gc_elasticity = 8;
132static int ip_rt_mtu_expires = 10 * 60 * HZ;
133static int ip_rt_min_pmtu = 512 + 20 + 20;
134static int ip_rt_min_advmss = 256;
135static int ip_rt_secret_interval = 10 * 60 * HZ;
beb659bd 136static int ip_rt_flush_expected;
137static unsigned long rt_deadline;
138
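/*
 * Note (illustrative): most of the ip_rt_* tunables above are exported via
 * sysctl under /proc/sys/net/ipv4/route/.  With HZ = 1000 the defaults work
 * out to roughly: gc_timeout 300 s, gc_min_interval 0.5 s, redirect_load
 * 20 ms and redirect_silence (20 ms << 10) of about 20 s.
 */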
139#define RTprint(a...) printk(KERN_DEBUG a)
140
141static struct timer_list rt_flush_timer;
142static void rt_worker_func(struct work_struct *work);
143static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
144static struct timer_list rt_secret_timer;
145
146/*
147 * Interface to generic destination cache.
148 */
149
150static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
151static void ipv4_dst_destroy(struct dst_entry *dst);
152static void ipv4_dst_ifdown(struct dst_entry *dst,
153 struct net_device *dev, int how);
154static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
155static void ipv4_link_failure(struct sk_buff *skb);
156static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
157static int rt_garbage_collect(void);
158
159
160static struct dst_ops ipv4_dst_ops = {
161 .family = AF_INET,
162 .protocol = __constant_htons(ETH_P_IP),
163 .gc = rt_garbage_collect,
164 .check = ipv4_dst_check,
165 .destroy = ipv4_dst_destroy,
166 .ifdown = ipv4_dst_ifdown,
167 .negative_advice = ipv4_negative_advice,
168 .link_failure = ipv4_link_failure,
169 .update_pmtu = ip_rt_update_pmtu,
862b82c6 170 .local_out = ip_local_out,
171 .entry_size = sizeof(struct rtable),
172};
173
174#define ECN_OR_COST(class) TC_PRIO_##class
175
4839c52b 176const __u8 ip_tos2prio[16] = {
177 TC_PRIO_BESTEFFORT,
178 ECN_OR_COST(FILLER),
179 TC_PRIO_BESTEFFORT,
180 ECN_OR_COST(BESTEFFORT),
181 TC_PRIO_BULK,
182 ECN_OR_COST(BULK),
183 TC_PRIO_BULK,
184 ECN_OR_COST(BULK),
185 TC_PRIO_INTERACTIVE,
186 ECN_OR_COST(INTERACTIVE),
187 TC_PRIO_INTERACTIVE,
188 ECN_OR_COST(INTERACTIVE),
189 TC_PRIO_INTERACTIVE_BULK,
190 ECN_OR_COST(INTERACTIVE_BULK),
191 TC_PRIO_INTERACTIVE_BULK,
192 ECN_OR_COST(INTERACTIVE_BULK)
193};
194
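/*
 * Illustrative note: the table above is indexed by the TOS bits of the IP
 * header shifted right by one, roughly ip_tos2prio[IPTOS_TOS(tos) >> 1]
 * (see rt_tos2priority() in include/net/route.h), so e.g. IPTOS_LOWDELAY
 * (0x10) maps to TC_PRIO_INTERACTIVE and IPTOS_THROUGHPUT (0x08) to
 * TC_PRIO_BULK.
 */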
195
196/*
197 * Route cache.
198 */
199
200/* The locking scheme is rather straightforward:
201 *
202 * 1) Read-Copy Update protects the buckets of the central route hash.
203 * 2) Only writers remove entries, and they hold the lock
204 * as they look at rtable reference counts.
205 * 3) Only readers acquire references to rtable entries,
206 * they do so with atomic increments and with the
207 * lock held.
208 */
209
210struct rt_hash_bucket {
211 struct rtable *chain;
22c047cc 212};
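/*
 * Illustrative sketch of the resulting usage pattern (it mirrors the code
 * further down, e.g. ip_route_input() and rt_check_expire()):
 *
 *	reader (lockless):
 *		rcu_read_lock_bh();
 *		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *		     rth = rcu_dereference(rth->u.dst.rt_next))
 *			... match keys, dst_use() on hit ...
 *		rcu_read_unlock_bh();
 *
 *	writer (insert/remove):
 *		spin_lock_bh(rt_hash_lock_addr(hash));
 *		... unlink entries or rcu_assign_pointer() a new chain head ...
 *		spin_unlock_bh(rt_hash_lock_addr(hash));
 */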
213#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
214 defined(CONFIG_PROVE_LOCKING)
215/*
216 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
217 * The size of this table is a power of two and depends on the number of CPUs.
62051200 218 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
22c047cc 219 */
220#ifdef CONFIG_LOCKDEP
221# define RT_HASH_LOCK_SZ 256
22c047cc 222#else
223# if NR_CPUS >= 32
224# define RT_HASH_LOCK_SZ 4096
225# elif NR_CPUS >= 16
226# define RT_HASH_LOCK_SZ 2048
227# elif NR_CPUS >= 8
228# define RT_HASH_LOCK_SZ 1024
229# elif NR_CPUS >= 4
230# define RT_HASH_LOCK_SZ 512
231# else
232# define RT_HASH_LOCK_SZ 256
233# endif
234#endif
235
236static spinlock_t *rt_hash_locks;
237# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
238# define rt_hash_lock_init() { \
239 int i; \
240 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
241 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
242 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
243 spin_lock_init(&rt_hash_locks[i]); \
244 }
245#else
246# define rt_hash_lock_addr(slot) NULL
247# define rt_hash_lock_init()
248#endif
249
250static struct rt_hash_bucket *rt_hash_table;
251static unsigned rt_hash_mask;
cfcabdcc 252static unsigned int rt_hash_log;
253static unsigned int rt_hash_rnd;
254
2f970d83 255static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
dbd2915c 256#define RT_CACHE_STAT_INC(field) \
bfe5d834 257 (__raw_get_cpu_var(rt_cache_stat).field++)
258
259static int rt_intern_hash(unsigned hash, struct rtable *rth,
260 struct rtable **res);
261
cef2685e 262static unsigned int rt_hash_code(u32 daddr, u32 saddr)
1da177e4 263{
cef2685e 264 return (jhash_2words(daddr, saddr, rt_hash_rnd)
265 & rt_hash_mask);
266}
267
268#define rt_hash(daddr, saddr, idx) \
269 rt_hash_code((__force u32)(__be32)(daddr),\
270 (__force u32)(__be32)(saddr) ^ ((idx) << 5))
271
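/*
 * Example (illustrative) of how the hash is used throughout this file:
 *
 *	unsigned hash = rt_hash(daddr, saddr, dev->ifindex);
 *	... walk or insert into rt_hash_table[hash].chain ...
 *
 * i.e. the bucket is derived from the destination, the source mixed with
 * the interface index, and the per-boot random seed rt_hash_rnd.
 */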
272#ifdef CONFIG_PROC_FS
273struct rt_cache_iter_state {
274 int bucket;
275};
276
277static struct rtable *rt_cache_get_first(struct seq_file *seq)
278{
279 struct rtable *r = NULL;
280 struct rt_cache_iter_state *st = seq->private;
281
282 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
283 rcu_read_lock_bh();
284 r = rt_hash_table[st->bucket].chain;
285 if (r)
286 break;
287 rcu_read_unlock_bh();
288 }
0bcceadc 289 return rcu_dereference(r);
290}
291
292static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
293{
0bcceadc 294 struct rt_cache_iter_state *st = seq->private;
1da177e4 295
093c2ca4 296 r = r->u.dst.rt_next;
297 while (!r) {
298 rcu_read_unlock_bh();
299 if (--st->bucket < 0)
300 break;
301 rcu_read_lock_bh();
302 r = rt_hash_table[st->bucket].chain;
303 }
0bcceadc 304 return rcu_dereference(r);
305}
306
307static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
308{
309 struct rtable *r = rt_cache_get_first(seq);
310
311 if (r)
312 while (pos && (r = rt_cache_get_next(seq, r)))
313 --pos;
314 return pos ? NULL : r;
315}
316
317static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
318{
319 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
320}
321
322static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
323{
324 struct rtable *r = NULL;
325
326 if (v == SEQ_START_TOKEN)
327 r = rt_cache_get_first(seq);
328 else
329 r = rt_cache_get_next(seq, v);
330 ++*pos;
331 return r;
332}
333
334static void rt_cache_seq_stop(struct seq_file *seq, void *v)
335{
336 if (v && v != SEQ_START_TOKEN)
337 rcu_read_unlock_bh();
338}
339
340static int rt_cache_seq_show(struct seq_file *seq, void *v)
341{
342 if (v == SEQ_START_TOKEN)
343 seq_printf(seq, "%-127s\n",
344 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
345 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
346 "HHUptod\tSpecDst");
347 else {
348 struct rtable *r = v;
349 char temp[256];
350
351 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
352 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
353 r->u.dst.dev ? r->u.dst.dev->name : "*",
354 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
355 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
356 r->u.dst.__use, 0, (unsigned long)r->rt_src,
357 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
358 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
359 dst_metric(&r->u.dst, RTAX_WINDOW),
360 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
361 dst_metric(&r->u.dst, RTAX_RTTVAR)),
362 r->fl.fl4_tos,
363 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
364 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
365 dev_queue_xmit) : 0,
366 r->rt_spec_dst);
367 seq_printf(seq, "%-127s\n", temp);
368 }
369 return 0;
370}
371
f690808e 372static const struct seq_operations rt_cache_seq_ops = {
373 .start = rt_cache_seq_start,
374 .next = rt_cache_seq_next,
375 .stop = rt_cache_seq_stop,
376 .show = rt_cache_seq_show,
377};
378
379static int rt_cache_seq_open(struct inode *inode, struct file *file)
380{
381 return seq_open_private(file, &rt_cache_seq_ops,
382 sizeof(struct rt_cache_iter_state));
383}
384
9a32144e 385static const struct file_operations rt_cache_seq_fops = {
386 .owner = THIS_MODULE,
387 .open = rt_cache_seq_open,
388 .read = seq_read,
389 .llseek = seq_lseek,
390 .release = seq_release_private,
391};
392
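/*
 * Usage note (illustrative): the seq_file operations above back
 * /proc/net/rt_cache (registered later in this file); each line describes
 * one cached route, with Destination/Gateway/Source printed as hexadecimal
 * 32-bit values rather than dotted quads.
 */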
393
394static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
395{
396 int cpu;
397
398 if (*pos == 0)
399 return SEQ_START_TOKEN;
400
401 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
402 if (!cpu_possible(cpu))
403 continue;
404 *pos = cpu+1;
2f970d83 405 return &per_cpu(rt_cache_stat, cpu);
406 }
407 return NULL;
408}
409
410static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
411{
412 int cpu;
413
414 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
415 if (!cpu_possible(cpu))
416 continue;
417 *pos = cpu+1;
2f970d83 418 return &per_cpu(rt_cache_stat, cpu);
419 }
420 return NULL;
e905a9ed 421
422}
423
424static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
425{
426
427}
428
429static int rt_cpu_seq_show(struct seq_file *seq, void *v)
430{
431 struct rt_cache_stat *st = v;
432
433 if (v == SEQ_START_TOKEN) {
5bec0039 434 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
435 return 0;
436 }
e905a9ed 437
438 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
439 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
440 atomic_read(&ipv4_dst_ops.entries),
441 st->in_hit,
442 st->in_slow_tot,
443 st->in_slow_mc,
444 st->in_no_route,
445 st->in_brd,
446 st->in_martian_dst,
447 st->in_martian_src,
448
449 st->out_hit,
450 st->out_slow_tot,
e905a9ed 451 st->out_slow_mc,
452
453 st->gc_total,
454 st->gc_ignored,
455 st->gc_goal_miss,
456 st->gc_dst_overflow,
457 st->in_hlist_search,
458 st->out_hlist_search
459 );
460 return 0;
461}
462
f690808e 463static const struct seq_operations rt_cpu_seq_ops = {
464 .start = rt_cpu_seq_start,
465 .next = rt_cpu_seq_next,
466 .stop = rt_cpu_seq_stop,
467 .show = rt_cpu_seq_show,
468};
469
470
471static int rt_cpu_seq_open(struct inode *inode, struct file *file)
472{
473 return seq_open(file, &rt_cpu_seq_ops);
474}
475
9a32144e 476static const struct file_operations rt_cpu_seq_fops = {
477 .owner = THIS_MODULE,
478 .open = rt_cpu_seq_open,
479 .read = seq_read,
480 .llseek = seq_lseek,
481 .release = seq_release,
482};
483
484#ifdef CONFIG_NET_CLS_ROUTE
485static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
486 int length, int *eof, void *data)
487{
488 unsigned int i;
489
490 if ((offset & 3) || (length & 3))
491 return -EIO;
492
493 if (offset >= sizeof(struct ip_rt_acct) * 256) {
494 *eof = 1;
495 return 0;
496 }
497
498 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
499 length = sizeof(struct ip_rt_acct) * 256 - offset;
500 *eof = 1;
501 }
502
503 offset /= sizeof(u32);
504
505 if (length > 0) {
506 u32 *dst = (u32 *) buffer;
507
508 *start = buffer;
509 memset(dst, 0, length);
510
511 for_each_possible_cpu(i) {
512 unsigned int j;
513 u32 *src;
514
515 src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
516 for (j = 0; j < length/4; j++)
517 dst[j] += src[j];
518 }
519 }
520 return length;
521}
522#endif
1da177e4 523#endif /* CONFIG_PROC_FS */
e905a9ed 524
525static __inline__ void rt_free(struct rtable *rt)
526{
527 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
528}
529
530static __inline__ void rt_drop(struct rtable *rt)
531{
532 ip_rt_put(rt);
533 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
534}
535
536static __inline__ int rt_fast_clean(struct rtable *rth)
537{
538 /* Kill broadcast/multicast entries very aggressively, if they
539 collide in the hash table with more useful entries */
540 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
093c2ca4 541 rth->fl.iif && rth->u.dst.rt_next;
542}
543
544static __inline__ int rt_valuable(struct rtable *rth)
545{
546 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
547 rth->u.dst.expires;
548}
549
550static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
551{
552 unsigned long age;
553 int ret = 0;
554
555 if (atomic_read(&rth->u.dst.__refcnt))
556 goto out;
557
558 ret = 1;
559 if (rth->u.dst.expires &&
560 time_after_eq(jiffies, rth->u.dst.expires))
561 goto out;
562
563 age = jiffies - rth->u.dst.lastuse;
564 ret = 0;
565 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
566 (age <= tmo2 && rt_valuable(rth)))
567 goto out;
568 ret = 1;
569out: return ret;
570}
571
572/* Bits of score are:
573 * 31: very valuable
574 * 30: not quite useless
575 * 29..0: usage counter
576 */
577static inline u32 rt_score(struct rtable *rt)
578{
579 u32 score = jiffies - rt->u.dst.lastuse;
580
581 score = ~score & ~(3<<30);
582
583 if (rt_valuable(rt))
584 score |= (1<<31);
585
586 if (!rt->fl.iif ||
587 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
588 score |= (1<<30);
589
590 return score;
591}
592
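/*
 * Worked example (illustrative): an aged, unreferenced broadcast input
 * entry ends up with bits 31:30 clear and a small usage component, while a
 * recently used unicast output route gets bit 30 set and a large usage
 * component.  Since rt_intern_hash() evicts the unreferenced entry with the
 * lowest score when a chain grows beyond ip_rt_gc_elasticity, the broadcast
 * entry goes first.
 */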
593static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
594{
595 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
596 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
47dcf0cb 597 (fl1->mark ^ fl2->mark) |
598 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
599 *(u16 *)&fl2->nl_u.ip4_u.tos) |
600 (fl1->oif ^ fl2->oif) |
601 (fl1->iif ^ fl2->iif)) == 0;
602}
603
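/*
 * Note (illustrative): compare_keys() ORs together the XOR of every field
 * instead of using a short-circuit '&&' chain; the result is zero only
 * when all fields match, and the branch-free form is cheap for the common
 * miss case while walking a hash chain.
 */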
604/*
605 * Perform a full scan of the hash table and free all entries.
606 * Can be called from a softirq or from process context.
607 * In the latter case, we want to reschedule if necessary.
608 */
609static void rt_do_flush(int process_context)
610{
611 unsigned int i;
612 struct rtable *rth, *next;
613
614 for (i = 0; i <= rt_hash_mask; i++) {
615 if (process_context && need_resched())
616 cond_resched();
617 rth = rt_hash_table[i].chain;
618 if (!rth)
619 continue;
620
621 spin_lock_bh(rt_hash_lock_addr(i));
622 rth = rt_hash_table[i].chain;
623 rt_hash_table[i].chain = NULL;
624 spin_unlock_bh(rt_hash_lock_addr(i));
625
626 for (; rth; rth = next) {
627 next = rth->u.dst.rt_next;
628 rt_free(rth);
629 }
630 }
631}
632
633static void rt_check_expire(void)
1da177e4 634{
635 static unsigned int rover;
636 unsigned int i = rover, goal;
1da177e4 637 struct rtable *rth, **rthp;
638 u64 mult;
639
640 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
641 if (ip_rt_gc_timeout > 1)
642 do_div(mult, ip_rt_gc_timeout);
643 goal = (unsigned int)mult;
644 if (goal > rt_hash_mask)
645 goal = rt_hash_mask + 1;
bb1d23b0 646 for (; goal > 0; goal--) {
647 unsigned long tmo = ip_rt_gc_timeout;
648
649 i = (i + 1) & rt_hash_mask;
650 rthp = &rt_hash_table[i].chain;
651
652 if (need_resched())
653 cond_resched();
654
cfcabdcc 655 if (*rthp == NULL)
bb1d23b0 656 continue;
39c90ece 657 spin_lock_bh(rt_hash_lock_addr(i));
658 while ((rth = *rthp) != NULL) {
659 if (rth->u.dst.expires) {
660 /* Entry is expired even if it is in use */
39c90ece 661 if (time_before_eq(jiffies, rth->u.dst.expires)) {
1da177e4 662 tmo >>= 1;
093c2ca4 663 rthp = &rth->u.dst.rt_next;
664 continue;
665 }
666 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
667 tmo >>= 1;
093c2ca4 668 rthp = &rth->u.dst.rt_next;
669 continue;
670 }
671
672 /* Cleanup aged off entries. */
093c2ca4 673 *rthp = rth->u.dst.rt_next;
e905a9ed 674 rt_free(rth);
1da177e4 675 }
39c90ece 676 spin_unlock_bh(rt_hash_lock_addr(i));
677 }
678 rover = i;
679}
680
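/*
 * Worked example (illustrative, default sysctls): with ip_rt_gc_interval =
 * 60*HZ and ip_rt_gc_timeout = 300*HZ, the goal computed in
 * rt_check_expire() is (hash_size * 60) / 300, i.e. one fifth of the
 * buckets per invocation, so rt_worker_func() below sweeps the whole table
 * roughly every 5 * ip_rt_gc_interval = 5 minutes.
 */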
681/*
682 * rt_worker_func() is run in process context.
683 * If a whole flush was scheduled, it is done.
684 * Else, we call rt_check_expire() to scan part of the hash table
685 */
686static void rt_worker_func(struct work_struct *work)
687{
688 if (ip_rt_flush_expected) {
689 ip_rt_flush_expected = 0;
690 rt_do_flush(1);
691 } else
692 rt_check_expire();
39c90ece 693 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
694}
695
696/* This can run from both BH and non-BH contexts, the latter
697 * in the case of a forced flush event.
698 */
beb659bd 699static void rt_run_flush(unsigned long process_context)
1da177e4 700{
701 rt_deadline = 0;
702
703 get_random_bytes(&rt_hash_rnd, 4);
704
beb659bd 705 rt_do_flush(process_context);
706}
707
708static DEFINE_SPINLOCK(rt_flush_lock);
709
710void rt_cache_flush(int delay)
711{
712 unsigned long now = jiffies;
713 int user_mode = !in_softirq();
714
715 if (delay < 0)
716 delay = ip_rt_min_delay;
717
718 spin_lock_bh(&rt_flush_lock);
719
720 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
721 long tmo = (long)(rt_deadline - now);
722
723 /* If flush timer is already running
724 and flush request is not immediate (delay > 0):
725
726 if the deadline has not been reached, prolong the timer to "delay",
727 otherwise fire it at deadline time.
728 */
729
730 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
731 tmo = 0;
e905a9ed 732
733 if (delay > tmo)
734 delay = tmo;
735 }
736
737 if (delay <= 0) {
738 spin_unlock_bh(&rt_flush_lock);
beb659bd 739 rt_run_flush(user_mode);
740 return;
741 }
742
743 if (rt_deadline == 0)
744 rt_deadline = now + ip_rt_max_delay;
745
746 mod_timer(&rt_flush_timer, now+delay);
747 spin_unlock_bh(&rt_flush_lock);
748}
749
750/*
751 * We change rt_hash_rnd and ask next rt_worker_func() invocation
752 * to perform a flush in process context
753 */
754static void rt_secret_rebuild(unsigned long dummy)
755{
756 get_random_bytes(&rt_hash_rnd, 4);
757 ip_rt_flush_expected = 1;
758 cancel_delayed_work(&expires_work);
759 schedule_delayed_work(&expires_work, HZ/10);
760 mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
761}
762
763/*
764 Short description of GC goals.
765
766 We want an algorithm that keeps the routing cache
767 at an equilibrium point, where the number of aged-off entries
768 stays approximately equal to the number of newly generated ones.
769
770 The current expiration strength is the variable "expire".
771 We try to adjust it dynamically, so that when the network
772 is idle "expire" is large enough to keep enough warm entries,
773 and when load increases it shrinks to limit the cache size.
774 */
775
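/*
 * Worked example (illustrative): rt_garbage_collect() first computes
 *
 *	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 *
 * so with the default elasticity of 8 and, say, a table of 2^15 buckets the
 * cache may hold about 8 * 32768 = 262144 entries before forced expiry
 * starts; past that point "expire" is halved on every pass that misses its
 * goal.
 */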
776static int rt_garbage_collect(void)
777{
778 static unsigned long expire = RT_GC_TIMEOUT;
779 static unsigned long last_gc;
780 static int rover;
781 static int equilibrium;
782 struct rtable *rth, **rthp;
783 unsigned long now = jiffies;
784 int goal;
785
786 /*
787 * Garbage collection is pretty expensive,
788 * do not make it too frequently.
789 */
790
791 RT_CACHE_STAT_INC(gc_total);
792
793 if (now - last_gc < ip_rt_gc_min_interval &&
794 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
795 RT_CACHE_STAT_INC(gc_ignored);
796 goto out;
797 }
798
799 /* Calculate number of entries, which we want to expire now. */
800 goal = atomic_read(&ipv4_dst_ops.entries) -
801 (ip_rt_gc_elasticity << rt_hash_log);
802 if (goal <= 0) {
803 if (equilibrium < ipv4_dst_ops.gc_thresh)
804 equilibrium = ipv4_dst_ops.gc_thresh;
805 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
806 if (goal > 0) {
807 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
808 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
809 }
810 } else {
811 /* We are in dangerous area. Try to reduce cache really
812 * aggressively.
813 */
814 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
815 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
816 }
817
818 if (now - last_gc >= ip_rt_gc_min_interval)
819 last_gc = now;
820
821 if (goal <= 0) {
822 equilibrium += goal;
823 goto work_done;
824 }
825
826 do {
827 int i, k;
828
829 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
830 unsigned long tmo = expire;
831
832 k = (k + 1) & rt_hash_mask;
833 rthp = &rt_hash_table[k].chain;
22c047cc 834 spin_lock_bh(rt_hash_lock_addr(k));
835 while ((rth = *rthp) != NULL) {
836 if (!rt_may_expire(rth, tmo, expire)) {
837 tmo >>= 1;
093c2ca4 838 rthp = &rth->u.dst.rt_next;
839 continue;
840 }
093c2ca4 841 *rthp = rth->u.dst.rt_next;
842 rt_free(rth);
843 goal--;
1da177e4 844 }
22c047cc 845 spin_unlock_bh(rt_hash_lock_addr(k));
846 if (goal <= 0)
847 break;
848 }
849 rover = k;
850
851 if (goal <= 0)
852 goto work_done;
853
854 /* Goal is not achieved. We stop the process if:
855
856 - expire has been reduced to zero (otherwise, expire is halved).
857 - the table is not full.
858 - we are called from interrupt context.
859 - the jiffies check is just a fallback/debug loop breaker;
860 we will not spin here for a long time in any case.
861 */
862
863 RT_CACHE_STAT_INC(gc_goal_miss);
864
865 if (expire == 0)
866 break;
867
868 expire >>= 1;
869#if RT_CACHE_DEBUG >= 2
870 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
871 atomic_read(&ipv4_dst_ops.entries), goal, i);
872#endif
873
874 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
875 goto out;
876 } while (!in_softirq() && time_before_eq(jiffies, now));
877
878 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
879 goto out;
880 if (net_ratelimit())
881 printk(KERN_WARNING "dst cache overflow\n");
882 RT_CACHE_STAT_INC(gc_dst_overflow);
883 return 1;
884
885work_done:
886 expire += ip_rt_gc_min_interval;
887 if (expire > ip_rt_gc_timeout ||
888 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
889 expire = ip_rt_gc_timeout;
890#if RT_CACHE_DEBUG >= 2
891 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
892 atomic_read(&ipv4_dst_ops.entries), goal, rover);
893#endif
894out: return 0;
895}
896
897static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
898{
899 struct rtable *rth, **rthp;
900 unsigned long now;
901 struct rtable *cand, **candp;
902 u32 min_score;
903 int chain_length;
904 int attempts = !in_softirq();
905
906restart:
907 chain_length = 0;
908 min_score = ~(u32)0;
909 cand = NULL;
910 candp = NULL;
911 now = jiffies;
912
913 rthp = &rt_hash_table[hash].chain;
914
22c047cc 915 spin_lock_bh(rt_hash_lock_addr(hash));
1da177e4 916 while ((rth = *rthp) != NULL) {
1da177e4 917 if (compare_keys(&rth->fl, &rt->fl)) {
1da177e4 918 /* Put it first */
093c2ca4 919 *rthp = rth->u.dst.rt_next;
920 /*
921 * Since lookup is lockfree, the deletion
922 * must be visible to another weakly ordered CPU before
923 * the insertion at the start of the hash chain.
924 */
093c2ca4 925 rcu_assign_pointer(rth->u.dst.rt_next,
926 rt_hash_table[hash].chain);
927 /*
928 * Since lookup is lockfree, the update writes
929 * must be ordered for consistency on SMP.
930 */
931 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
932
03f49f34 933 dst_use(&rth->u.dst, now);
22c047cc 934 spin_unlock_bh(rt_hash_lock_addr(hash));
935
936 rt_drop(rt);
937 *rp = rth;
938 return 0;
939 }
940
941 if (!atomic_read(&rth->u.dst.__refcnt)) {
942 u32 score = rt_score(rth);
943
944 if (score <= min_score) {
945 cand = rth;
946 candp = rthp;
947 min_score = score;
948 }
949 }
950
951 chain_length++;
952
093c2ca4 953 rthp = &rth->u.dst.rt_next;
954 }
955
956 if (cand) {
957 /* ip_rt_gc_elasticity used to be the average chain
958 * length; when it is exceeded, gc becomes really aggressive.
959 *
960 * The second limit is less certain. At the moment it allows
961 * only 2 entries per bucket. We will see.
962 */
963 if (chain_length > ip_rt_gc_elasticity) {
093c2ca4 964 *candp = cand->u.dst.rt_next;
965 rt_free(cand);
966 }
967 }
968
969 /* Try to bind route to arp only if it is output
970 route or unicast forwarding path.
971 */
972 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
973 int err = arp_bind_neighbour(&rt->u.dst);
974 if (err) {
22c047cc 975 spin_unlock_bh(rt_hash_lock_addr(hash));
976
977 if (err != -ENOBUFS) {
978 rt_drop(rt);
979 return err;
980 }
981
982 /* Neighbour tables are full and nothing
983 can be released. Try to shrink the route cache;
984 most likely it holds some neighbour records.
985 */
986 if (attempts-- > 0) {
987 int saved_elasticity = ip_rt_gc_elasticity;
988 int saved_int = ip_rt_gc_min_interval;
989 ip_rt_gc_elasticity = 1;
990 ip_rt_gc_min_interval = 0;
991 rt_garbage_collect();
992 ip_rt_gc_min_interval = saved_int;
993 ip_rt_gc_elasticity = saved_elasticity;
994 goto restart;
995 }
996
997 if (net_ratelimit())
998 printk(KERN_WARNING "Neighbour table overflow.\n");
999 rt_drop(rt);
1000 return -ENOBUFS;
1001 }
1002 }
1003
093c2ca4 1004 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1da177e4 1005#if RT_CACHE_DEBUG >= 2
093c2ca4 1006 if (rt->u.dst.rt_next) {
1007 struct rtable *trt;
1008 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1009 NIPQUAD(rt->rt_dst));
093c2ca4 1010 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1011 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1012 printk("\n");
1013 }
1014#endif
1015 rt_hash_table[hash].chain = rt;
22c047cc 1016 spin_unlock_bh(rt_hash_lock_addr(hash));
1017 *rp = rt;
1018 return 0;
1019}
1020
1021void rt_bind_peer(struct rtable *rt, int create)
1022{
1023 static DEFINE_SPINLOCK(rt_peer_lock);
1024 struct inet_peer *peer;
1025
1026 peer = inet_getpeer(rt->rt_dst, create);
1027
1028 spin_lock_bh(&rt_peer_lock);
1029 if (rt->peer == NULL) {
1030 rt->peer = peer;
1031 peer = NULL;
1032 }
1033 spin_unlock_bh(&rt_peer_lock);
1034 if (peer)
1035 inet_putpeer(peer);
1036}
1037
1038/*
1039 * Peer allocation may fail only in serious out-of-memory conditions. However
1040 * we can still generate some output.
1041 * Random ID selection looks a bit dangerous because we have no chance of
1042 * selecting an ID that is unique within a reasonable period of time.
1043 * But a broken packet identifier may be better than no packet at all.
1044 */
1045static void ip_select_fb_ident(struct iphdr *iph)
1046{
1047 static DEFINE_SPINLOCK(ip_fb_id_lock);
1048 static u32 ip_fallback_id;
1049 u32 salt;
1050
1051 spin_lock_bh(&ip_fb_id_lock);
e448515c 1052 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1053 iph->id = htons(salt & 0xFFFF);
1054 ip_fallback_id = salt;
1055 spin_unlock_bh(&ip_fb_id_lock);
1056}
1057
1058void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1059{
1060 struct rtable *rt = (struct rtable *) dst;
1061
1062 if (rt) {
1063 if (rt->peer == NULL)
1064 rt_bind_peer(rt, 1);
1065
1066 /* If peer is attached to destination, it is never detached,
1067 so that we need not to grab a lock to dereference it.
1068 */
1069 if (rt->peer) {
1070 iph->id = htons(inet_getid(rt->peer, more));
1071 return;
1072 }
1073 } else
e905a9ed 1074 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
9c2b3328 1075 __builtin_return_address(0));
1076
1077 ip_select_fb_ident(iph);
1078}
1079
1080static void rt_del(unsigned hash, struct rtable *rt)
1081{
1082 struct rtable **rthp;
1083
22c047cc 1084 spin_lock_bh(rt_hash_lock_addr(hash));
1085 ip_rt_put(rt);
1086 for (rthp = &rt_hash_table[hash].chain; *rthp;
093c2ca4 1087 rthp = &(*rthp)->u.dst.rt_next)
1da177e4 1088 if (*rthp == rt) {
093c2ca4 1089 *rthp = rt->u.dst.rt_next;
1090 rt_free(rt);
1091 break;
1092 }
22c047cc 1093 spin_unlock_bh(rt_hash_lock_addr(hash));
1094}
1095
1096void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1097 __be32 saddr, struct net_device *dev)
1098{
1099 int i, k;
1100 struct in_device *in_dev = in_dev_get(dev);
1101 struct rtable *rth, **rthp;
f7655229 1102 __be32 skeys[2] = { saddr, 0 };
1da177e4 1103 int ikeys[2] = { dev->ifindex, 0 };
8d71740c 1104 struct netevent_redirect netevent;
1da177e4 1105
1106 if (!in_dev)
1107 return;
1108
1109 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1110 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1111 goto reject_redirect;
1112
1113 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1114 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1115 goto reject_redirect;
1116 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1117 goto reject_redirect;
1118 } else {
1119 if (inet_addr_type(new_gw) != RTN_UNICAST)
1120 goto reject_redirect;
1121 }
1122
1123 for (i = 0; i < 2; i++) {
1124 for (k = 0; k < 2; k++) {
8c7bc840 1125 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1126
1127 rthp=&rt_hash_table[hash].chain;
1128
1129 rcu_read_lock();
1130 while ((rth = rcu_dereference(*rthp)) != NULL) {
1131 struct rtable *rt;
1132
1133 if (rth->fl.fl4_dst != daddr ||
1134 rth->fl.fl4_src != skeys[i] ||
1135 rth->fl.oif != ikeys[k] ||
1136 rth->fl.iif != 0) {
093c2ca4 1137 rthp = &rth->u.dst.rt_next;
1138 continue;
1139 }
1140
1141 if (rth->rt_dst != daddr ||
1142 rth->rt_src != saddr ||
1143 rth->u.dst.error ||
1144 rth->rt_gateway != old_gw ||
1145 rth->u.dst.dev != dev)
1146 break;
1147
1148 dst_hold(&rth->u.dst);
1149 rcu_read_unlock();
1150
1151 rt = dst_alloc(&ipv4_dst_ops);
1152 if (rt == NULL) {
1153 ip_rt_put(rth);
1154 in_dev_put(in_dev);
1155 return;
1156 }
1157
1158 /* Copy all the information. */
1159 *rt = *rth;
e905a9ed 1160 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1161 rt->u.dst.__use = 1;
1162 atomic_set(&rt->u.dst.__refcnt, 1);
1163 rt->u.dst.child = NULL;
1164 if (rt->u.dst.dev)
1165 dev_hold(rt->u.dst.dev);
1166 if (rt->idev)
1167 in_dev_hold(rt->idev);
1168 rt->u.dst.obsolete = 0;
1169 rt->u.dst.lastuse = jiffies;
1170 rt->u.dst.path = &rt->u.dst;
1171 rt->u.dst.neighbour = NULL;
1172 rt->u.dst.hh = NULL;
1173 rt->u.dst.xfrm = NULL;
1174
1175 rt->rt_flags |= RTCF_REDIRECTED;
1176
1177 /* Gateway is different ... */
1178 rt->rt_gateway = new_gw;
1179
1180 /* Redirect received -> path was valid */
1181 dst_confirm(&rth->u.dst);
1182
1183 if (rt->peer)
1184 atomic_inc(&rt->peer->refcnt);
1185
1186 if (arp_bind_neighbour(&rt->u.dst) ||
1187 !(rt->u.dst.neighbour->nud_state &
1188 NUD_VALID)) {
1189 if (rt->u.dst.neighbour)
1190 neigh_event_send(rt->u.dst.neighbour, NULL);
1191 ip_rt_put(rth);
1192 rt_drop(rt);
1193 goto do_next;
1194 }
e905a9ed 1195
1196 netevent.old = &rth->u.dst;
1197 netevent.new = &rt->u.dst;
1198 call_netevent_notifiers(NETEVENT_REDIRECT,
1199 &netevent);
1200
1201 rt_del(hash, rth);
1202 if (!rt_intern_hash(hash, rt, &rt))
1203 ip_rt_put(rt);
1204 goto do_next;
1205 }
1206 rcu_read_unlock();
1207 do_next:
1208 ;
1209 }
1210 }
1211 in_dev_put(in_dev);
1212 return;
1213
1214reject_redirect:
1215#ifdef CONFIG_IP_ROUTE_VERBOSE
1216 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1217 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1218 "%u.%u.%u.%u ignored.\n"
cef2685e 1219 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1da177e4 1220 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
cef2685e 1221 NIPQUAD(saddr), NIPQUAD(daddr));
1222#endif
1223 in_dev_put(in_dev);
1224}
1225
1226static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1227{
1228 struct rtable *rt = (struct rtable*)dst;
1229 struct dst_entry *ret = dst;
1230
1231 if (rt) {
1232 if (dst->obsolete) {
1233 ip_rt_put(rt);
1234 ret = NULL;
1235 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1236 rt->u.dst.expires) {
1237 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1238 rt->fl.oif);
1da177e4 1239#if RT_CACHE_DEBUG >= 1
56c99d04 1240 printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1241 "%u.%u.%u.%u/%02x dropped\n",
1242 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1243#endif
1244 rt_del(hash, rt);
1245 ret = NULL;
1246 }
1247 }
1248 return ret;
1249}
1250
1251/*
1252 * Algorithm:
1253 * 1. The first ip_rt_redirect_number redirects are sent
1254 * with exponential backoff, then we stop sending them at all,
1255 * assuming that the host ignores our redirects.
1256 * 2. If we did not see packets requiring redirects
1257 * during ip_rt_redirect_silence, we assume that the host
1258 * forgot redirected route and start to send redirects again.
1259 *
1260 * This algorithm is much cheaper and more intelligent than dumb load limiting
1261 * in icmp.c.
1262 *
1263 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1264 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1265 */
1266
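/*
 * Worked example (illustrative, HZ = 1000): ip_rt_redirect_load = HZ/50 is
 * 20 ms, and the next redirect is only sent once
 * rate_last + (20 ms << rate_tokens) has passed, so the required gap doubles
 * with every redirect already sent; after ip_rt_redirect_number = 9 of them
 * we go silent, and ip_rt_redirect_silence = (HZ/50) << 10 (about 20 s) of
 * quiet resets the counter.
 */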
1267void ip_rt_send_redirect(struct sk_buff *skb)
1268{
1269 struct rtable *rt = (struct rtable*)skb->dst;
1270 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1271
1272 if (!in_dev)
1273 return;
1274
1275 if (!IN_DEV_TX_REDIRECTS(in_dev))
1276 goto out;
1277
1278 /* No redirected packets during ip_rt_redirect_silence;
1279 * reset the algorithm.
1280 */
1281 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1282 rt->u.dst.rate_tokens = 0;
1283
1284 /* Too many ignored redirects; do not send anything
1285 * set u.dst.rate_last to the last seen redirected packet.
1286 */
1287 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1288 rt->u.dst.rate_last = jiffies;
1289 goto out;
1290 }
1291
1292 /* Check for load limit; set rate_last to the latest sent
1293 * redirect.
1294 */
1295 if (rt->u.dst.rate_tokens == 0 ||
1296 time_after(jiffies,
1297 (rt->u.dst.rate_last +
1298 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1299 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1300 rt->u.dst.rate_last = jiffies;
1301 ++rt->u.dst.rate_tokens;
1302#ifdef CONFIG_IP_ROUTE_VERBOSE
1303 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1304 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1305 net_ratelimit())
1306 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1307 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1308 NIPQUAD(rt->rt_src), rt->rt_iif,
1309 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1310#endif
1311 }
1312out:
e905a9ed 1313 in_dev_put(in_dev);
1314}
1315
1316static int ip_error(struct sk_buff *skb)
1317{
1318 struct rtable *rt = (struct rtable*)skb->dst;
1319 unsigned long now;
1320 int code;
1321
1322 switch (rt->u.dst.error) {
1323 case EINVAL:
1324 default:
1325 goto out;
1326 case EHOSTUNREACH:
1327 code = ICMP_HOST_UNREACH;
1328 break;
1329 case ENETUNREACH:
1330 code = ICMP_NET_UNREACH;
7f53878d 1331 IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1da177e4
LT
1332 break;
1333 case EACCES:
1334 code = ICMP_PKT_FILTERED;
1335 break;
1336 }
1337
1338 now = jiffies;
1339 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1340 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1341 rt->u.dst.rate_tokens = ip_rt_error_burst;
1342 rt->u.dst.rate_last = now;
1343 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1344 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1345 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1346 }
1347
1348out: kfree_skb(skb);
1349 return 0;
e905a9ed 1350}
1351
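/*
 * Worked example (illustrative): ip_error() above is a token bucket counted
 * in jiffies: rate_tokens grows with the idle time, is capped at
 * ip_rt_error_burst (5*HZ) and each ICMP_DEST_UNREACH sent costs
 * ip_rt_error_cost (HZ), i.e. at most about one ICMP error per second in
 * steady state, with bursts of up to five.
 */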
1352/*
1353 * The last two values are not from the RFC but
1354 * are needed for AMPRnet AX.25 paths.
1355 */
1356
9b5b5cff 1357static const unsigned short mtu_plateau[] =
1358{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1359
1360static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1361{
1362 int i;
e905a9ed 1363
1364 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1365 if (old_mtu > mtu_plateau[i])
1366 return mtu_plateau[i];
1367 return 68;
1368}
1369
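/*
 * Worked example (illustrative): if a Fragmentation Needed message arrives
 * with a zero or bogus next-hop MTU while the path MTU is 1500,
 * guess_mtu(1500) walks the plateau table and returns 1492, the next lower
 * plateau; 68 is the absolute floor required of any IPv4 link.
 */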
1370unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1371{
1372 int i;
1373 unsigned short old_mtu = ntohs(iph->tot_len);
1374 struct rtable *rth;
1375 __be32 skeys[2] = { iph->saddr, 0, };
1376 __be32 daddr = iph->daddr;
1377 unsigned short est_mtu = 0;
1378
1379 if (ipv4_config.no_pmtu_disc)
1380 return 0;
1381
1382 for (i = 0; i < 2; i++) {
8c7bc840 1383 unsigned hash = rt_hash(daddr, skeys[i], 0);
1384
1385 rcu_read_lock();
1386 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
093c2ca4 1387 rth = rcu_dereference(rth->u.dst.rt_next)) {
1388 if (rth->fl.fl4_dst == daddr &&
1389 rth->fl.fl4_src == skeys[i] &&
1390 rth->rt_dst == daddr &&
1391 rth->rt_src == iph->saddr &&
1392 rth->fl.iif == 0 &&
1393 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1394 unsigned short mtu = new_mtu;
1395
1396 if (new_mtu < 68 || new_mtu >= old_mtu) {
1397
1398 /* BSD 4.2 compatibility hack :-( */
1399 if (mtu == 0 &&
1400 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1401 old_mtu >= 68 + (iph->ihl << 2))
1402 old_mtu -= iph->ihl << 2;
1403
1404 mtu = guess_mtu(old_mtu);
1405 }
1406 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
e905a9ed 1407 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1408 dst_confirm(&rth->u.dst);
1409 if (mtu < ip_rt_min_pmtu) {
1410 mtu = ip_rt_min_pmtu;
1411 rth->u.dst.metrics[RTAX_LOCK-1] |=
1412 (1 << RTAX_MTU);
1413 }
1414 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1415 dst_set_expires(&rth->u.dst,
1416 ip_rt_mtu_expires);
1417 }
1418 est_mtu = mtu;
1419 }
1420 }
1421 }
1422 rcu_read_unlock();
1423 }
1424 return est_mtu ? : new_mtu;
1425}
1426
1427static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1428{
1429 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1430 !(dst_metric_locked(dst, RTAX_MTU))) {
1431 if (mtu < ip_rt_min_pmtu) {
1432 mtu = ip_rt_min_pmtu;
1433 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1434 }
1435 dst->metrics[RTAX_MTU-1] = mtu;
1436 dst_set_expires(dst, ip_rt_mtu_expires);
8d71740c 1437 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1438 }
1439}
1440
1441static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1442{
1443 return NULL;
1444}
1445
1446static void ipv4_dst_destroy(struct dst_entry *dst)
1447{
1448 struct rtable *rt = (struct rtable *) dst;
1449 struct inet_peer *peer = rt->peer;
1450 struct in_device *idev = rt->idev;
1451
1452 if (peer) {
1453 rt->peer = NULL;
1454 inet_putpeer(peer);
1455 }
1456
1457 if (idev) {
1458 rt->idev = NULL;
1459 in_dev_put(idev);
1460 }
1461}
1462
1463static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1464 int how)
1465{
1466 struct rtable *rt = (struct rtable *) dst;
1467 struct in_device *idev = rt->idev;
1468 if (dev != init_net.loopback_dev && idev && idev->dev == dev) {
1469 struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
1470 if (loopback_idev) {
1471 rt->idev = loopback_idev;
1472 in_dev_put(idev);
1473 }
1474 }
1475}
1476
1477static void ipv4_link_failure(struct sk_buff *skb)
1478{
1479 struct rtable *rt;
1480
1481 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1482
1483 rt = (struct rtable *) skb->dst;
1484 if (rt)
1485 dst_set_expires(&rt->u.dst, 0);
1486}
1487
1488static int ip_rt_bug(struct sk_buff *skb)
1489{
1490 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
eddc9ec5 1491 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1492 skb->dev ? skb->dev->name : "?");
1493 kfree_skb(skb);
1494 return 0;
1495}
1496
1497/*
1498 We do not cache the source address of the outgoing interface,
1499 because it is used only by the IP RR, TS and SRR options,
1500 so it is out of the fast path.
1501
1502 BTW remember: "addr" is allowed to be unaligned
1503 in IP options!
1504 */
1505
1506void ip_rt_get_source(u8 *addr, struct rtable *rt)
1507{
a61ced5d 1508 __be32 src;
1509 struct fib_result res;
1510
1511 if (rt->fl.iif == 0)
1512 src = rt->rt_src;
1513 else if (fib_lookup(&rt->fl, &res) == 0) {
1514 src = FIB_RES_PREFSRC(res);
1515 fib_res_put(&res);
1516 } else
1517 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1518 RT_SCOPE_UNIVERSE);
1519 memcpy(addr, &src, 4);
1520}
1521
1522#ifdef CONFIG_NET_CLS_ROUTE
1523static void set_class_tag(struct rtable *rt, u32 tag)
1524{
1525 if (!(rt->u.dst.tclassid & 0xFFFF))
1526 rt->u.dst.tclassid |= tag & 0xFFFF;
1527 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1528 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1529}
1530#endif
1531
1532static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1533{
1534 struct fib_info *fi = res->fi;
1535
1536 if (fi) {
1537 if (FIB_RES_GW(*res) &&
1538 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1539 rt->rt_gateway = FIB_RES_GW(*res);
1540 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1541 sizeof(rt->u.dst.metrics));
1542 if (fi->fib_mtu == 0) {
1543 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1544 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1545 rt->rt_gateway != rt->rt_dst &&
1546 rt->u.dst.dev->mtu > 576)
1547 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1548 }
1549#ifdef CONFIG_NET_CLS_ROUTE
1550 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1551#endif
1552 } else
1553 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1554
1555 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1556 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1557 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1558 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1559 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1560 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1561 ip_rt_min_advmss);
1562 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1563 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1564
1565#ifdef CONFIG_NET_CLS_ROUTE
1566#ifdef CONFIG_IP_MULTIPLE_TABLES
1567 set_class_tag(rt, fib_rules_tclass(res));
1568#endif
1569 set_class_tag(rt, itag);
1570#endif
e905a9ed 1571 rt->rt_type = res->type;
1572}
1573
9e12bb22 1574static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1575 u8 tos, struct net_device *dev, int our)
1576{
1577 unsigned hash;
1578 struct rtable *rth;
a61ced5d 1579 __be32 spec_dst;
1580 struct in_device *in_dev = in_dev_get(dev);
1581 u32 itag = 0;
1582
1583 /* Primary sanity checks. */
1584
1585 if (in_dev == NULL)
1586 return -EINVAL;
1587
1588 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1589 skb->protocol != htons(ETH_P_IP))
1590 goto e_inval;
1591
1592 if (ZERONET(saddr)) {
1593 if (!LOCAL_MCAST(daddr))
1594 goto e_inval;
1595 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1596 } else if (fib_validate_source(saddr, 0, tos, 0,
1597 dev, &spec_dst, &itag) < 0)
1598 goto e_inval;
1599
1600 rth = dst_alloc(&ipv4_dst_ops);
1601 if (!rth)
1602 goto e_nobufs;
1603
1604 rth->u.dst.output= ip_rt_bug;
1605
1606 atomic_set(&rth->u.dst.__refcnt, 1);
1607 rth->u.dst.flags= DST_HOST;
42f811b8 1608 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1609 rth->u.dst.flags |= DST_NOPOLICY;
1610 rth->fl.fl4_dst = daddr;
1611 rth->rt_dst = daddr;
1612 rth->fl.fl4_tos = tos;
47dcf0cb 1613 rth->fl.mark = skb->mark;
1614 rth->fl.fl4_src = saddr;
1615 rth->rt_src = saddr;
1616#ifdef CONFIG_NET_CLS_ROUTE
1617 rth->u.dst.tclassid = itag;
1618#endif
1619 rth->rt_iif =
1620 rth->fl.iif = dev->ifindex;
2774c7ab 1621 rth->u.dst.dev = init_net.loopback_dev;
1622 dev_hold(rth->u.dst.dev);
1623 rth->idev = in_dev_get(rth->u.dst.dev);
1624 rth->fl.oif = 0;
1625 rth->rt_gateway = daddr;
1626 rth->rt_spec_dst= spec_dst;
1627 rth->rt_type = RTN_MULTICAST;
1628 rth->rt_flags = RTCF_MULTICAST;
1629 if (our) {
1630 rth->u.dst.input= ip_local_deliver;
1631 rth->rt_flags |= RTCF_LOCAL;
1632 }
1633
1634#ifdef CONFIG_IP_MROUTE
1635 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1636 rth->u.dst.input = ip_mr_input;
1637#endif
1638 RT_CACHE_STAT_INC(in_slow_mc);
1639
1640 in_dev_put(in_dev);
8c7bc840 1641 hash = rt_hash(daddr, saddr, dev->ifindex);
1642 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1643
1644e_nobufs:
1645 in_dev_put(in_dev);
1646 return -ENOBUFS;
1647
1648e_inval:
1649 in_dev_put(in_dev);
1650 return -EINVAL;
1651}
1652
1653
1654static void ip_handle_martian_source(struct net_device *dev,
1655 struct in_device *in_dev,
1656 struct sk_buff *skb,
1657 __be32 daddr,
1658 __be32 saddr)
1659{
1660 RT_CACHE_STAT_INC(in_martian_src);
1661#ifdef CONFIG_IP_ROUTE_VERBOSE
1662 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1663 /*
1664 * RFC1812 recommendation, if source is martian,
1665 * the only hint is MAC header.
1666 */
1667 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1668 "%u.%u.%u.%u, on dev %s\n",
1669 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
98e399f8 1670 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1da177e4 1671 int i;
98e399f8 1672 const unsigned char *p = skb_mac_header(skb);
1673 printk(KERN_WARNING "ll header: ");
1674 for (i = 0; i < dev->hard_header_len; i++, p++) {
1675 printk("%02x", *p);
1676 if (i < (dev->hard_header_len - 1))
1677 printk(":");
1678 }
1679 printk("\n");
1680 }
1681 }
1682#endif
1683}
1684
1685static inline int __mkroute_input(struct sk_buff *skb,
1686 struct fib_result* res,
1687 struct in_device *in_dev,
9e12bb22 1688 __be32 daddr, __be32 saddr, u32 tos,
e905a9ed 1689 struct rtable **result)
1690{
1691
1692 struct rtable *rth;
1693 int err;
1694 struct in_device *out_dev;
1695 unsigned flags = 0;
1696 __be32 spec_dst;
1697 u32 itag;
1da177e4
LT
1698
1699 /* get a working reference to the output device */
1700 out_dev = in_dev_get(FIB_RES_DEV(*res));
1701 if (out_dev == NULL) {
1702 if (net_ratelimit())
1703 printk(KERN_CRIT "Bug in ip_route_input" \
1704 "_slow(). Please, report\n");
1705 return -EINVAL;
1706 }
1707
1708
e905a9ed 1709 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1710 in_dev->dev, &spec_dst, &itag);
1711 if (err < 0) {
e905a9ed 1712 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1da177e4 1713 saddr);
e905a9ed 1714
1715 err = -EINVAL;
1716 goto cleanup;
1717 }
1718
1719 if (err)
1720 flags |= RTCF_DIRECTSRC;
1721
1722 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1723 (IN_DEV_SHARED_MEDIA(out_dev) ||
1724 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1725 flags |= RTCF_DOREDIRECT;
1726
1727 if (skb->protocol != htons(ETH_P_IP)) {
1728 /* Not IP (i.e. ARP). Do not create route, if it is
1729 * invalid for proxy arp. DNAT routes are always valid.
1730 */
1731 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1732 err = -EINVAL;
1733 goto cleanup;
1734 }
1735 }
1736
1737
1738 rth = dst_alloc(&ipv4_dst_ops);
1739 if (!rth) {
1740 err = -ENOBUFS;
1741 goto cleanup;
1742 }
1743
ce723d8e 1744 atomic_set(&rth->u.dst.__refcnt, 1);
1da177e4 1745 rth->u.dst.flags= DST_HOST;
42f811b8 1746 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4 1747 rth->u.dst.flags |= DST_NOPOLICY;
42f811b8 1748 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1749 rth->u.dst.flags |= DST_NOXFRM;
1750 rth->fl.fl4_dst = daddr;
1751 rth->rt_dst = daddr;
1752 rth->fl.fl4_tos = tos;
47dcf0cb 1753 rth->fl.mark = skb->mark;
1754 rth->fl.fl4_src = saddr;
1755 rth->rt_src = saddr;
1756 rth->rt_gateway = daddr;
1757 rth->rt_iif =
1758 rth->fl.iif = in_dev->dev->ifindex;
1759 rth->u.dst.dev = (out_dev)->dev;
1760 dev_hold(rth->u.dst.dev);
1761 rth->idev = in_dev_get(rth->u.dst.dev);
1762 rth->fl.oif = 0;
1763 rth->rt_spec_dst= spec_dst;
1764
1765 rth->u.dst.input = ip_forward;
1766 rth->u.dst.output = ip_output;
1767
1768 rt_set_nexthop(rth, res, itag);
1769
1770 rth->rt_flags = flags;
1771
1772 *result = rth;
1773 err = 0;
1774 cleanup:
1775 /* release the working reference to the output device */
1776 in_dev_put(out_dev);
1777 return err;
e905a9ed 1778}
1da177e4 1779
1780static inline int ip_mkroute_input(struct sk_buff *skb,
1781 struct fib_result* res,
1782 const struct flowi *fl,
1783 struct in_device *in_dev,
1784 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1785{
7abaa27c 1786 struct rtable* rth = NULL;
1787 int err;
1788 unsigned hash;
1789
1790#ifdef CONFIG_IP_ROUTE_MULTIPATH
1791 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1792 fib_select_multipath(fl, res);
1793#endif
1794
1795 /* create a routing cache entry */
1796 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1797 if (err)
1798 return err;
1799
1800 /* put it into the cache */
8c7bc840 1801 hash = rt_hash(daddr, saddr, fl->iif);
e905a9ed 1802 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1803}
1804
1805/*
1806 * NOTE. We drop all packets that have a local source
1807 * address, because every properly looped-back packet
1808 * must already have the correct destination attached by the output routine.
1809 *
1810 * This approach solves two big problems:
1811 * 1. Non-simplex devices are handled properly.
1812 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1813 */
1814
9e12bb22 1815static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1816 u8 tos, struct net_device *dev)
1817{
1818 struct fib_result res;
1819 struct in_device *in_dev = in_dev_get(dev);
1820 struct flowi fl = { .nl_u = { .ip4_u =
1821 { .daddr = daddr,
1822 .saddr = saddr,
1823 .tos = tos,
1824 .scope = RT_SCOPE_UNIVERSE,
1da177e4 1825 } },
47dcf0cb 1826 .mark = skb->mark,
1827 .iif = dev->ifindex };
1828 unsigned flags = 0;
1829 u32 itag = 0;
1830 struct rtable * rth;
1831 unsigned hash;
9e12bb22 1832 __be32 spec_dst;
1833 int err = -EINVAL;
1834 int free_res = 0;
1835
1836 /* IP on this device is disabled. */
1837
1838 if (!in_dev)
1839 goto out;
1840
1841 /* Check for the most weird martians, which cannot be detected
1842 by fib_lookup.
1843 */
1844
1845 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1846 goto martian_source;
1847
e448515c 1848 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1849 goto brd_input;
1850
1851 /* Accept zero addresses only to limited broadcast;
1852 * I do not even know whether to fix it or not. Waiting for complaints :-)
1853 */
1854 if (ZERONET(saddr))
1855 goto martian_source;
1856
1857 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1858 goto martian_destination;
1859
1860 /*
1861 * Now we are ready to route packet.
1862 */
1863 if ((err = fib_lookup(&fl, &res)) != 0) {
1864 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 1865 goto e_hostunreach;
1866 goto no_route;
1867 }
1868 free_res = 1;
1869
1870 RT_CACHE_STAT_INC(in_slow_tot);
1871
1872 if (res.type == RTN_BROADCAST)
1873 goto brd_input;
1874
1875 if (res.type == RTN_LOCAL) {
1876 int result;
1877 result = fib_validate_source(saddr, daddr, tos,
2774c7ab 1878 init_net.loopback_dev->ifindex,
1879 dev, &spec_dst, &itag);
1880 if (result < 0)
1881 goto martian_source;
1882 if (result)
1883 flags |= RTCF_DIRECTSRC;
1884 spec_dst = daddr;
1885 goto local_input;
1886 }
1887
1888 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 1889 goto e_hostunreach;
1890 if (res.type != RTN_UNICAST)
1891 goto martian_destination;
1892
1893 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1894done:
1895 in_dev_put(in_dev);
1896 if (free_res)
1897 fib_res_put(&res);
1898out: return err;
1899
1900brd_input:
1901 if (skb->protocol != htons(ETH_P_IP))
1902 goto e_inval;
1903
1904 if (ZERONET(saddr))
1905 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1906 else {
1907 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1908 &itag);
1909 if (err < 0)
1910 goto martian_source;
1911 if (err)
1912 flags |= RTCF_DIRECTSRC;
1913 }
1914 flags |= RTCF_BROADCAST;
1915 res.type = RTN_BROADCAST;
1916 RT_CACHE_STAT_INC(in_brd);
1917
1918local_input:
1919 rth = dst_alloc(&ipv4_dst_ops);
1920 if (!rth)
1921 goto e_nobufs;
1922
1923 rth->u.dst.output= ip_rt_bug;
1924
1925 atomic_set(&rth->u.dst.__refcnt, 1);
1926 rth->u.dst.flags= DST_HOST;
42f811b8 1927 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4
LT
1928 rth->u.dst.flags |= DST_NOPOLICY;
1929 rth->fl.fl4_dst = daddr;
1930 rth->rt_dst = daddr;
1931 rth->fl.fl4_tos = tos;
47dcf0cb 1932 rth->fl.mark = skb->mark;
1da177e4
LT
1933 rth->fl.fl4_src = saddr;
1934 rth->rt_src = saddr;
1935#ifdef CONFIG_NET_CLS_ROUTE
1936 rth->u.dst.tclassid = itag;
1937#endif
1938 rth->rt_iif =
1939 rth->fl.iif = dev->ifindex;
2774c7ab 1940 rth->u.dst.dev = init_net.loopback_dev;
1da177e4
LT
1941 dev_hold(rth->u.dst.dev);
1942 rth->idev = in_dev_get(rth->u.dst.dev);
1943 rth->rt_gateway = daddr;
1944 rth->rt_spec_dst= spec_dst;
1945 rth->u.dst.input= ip_local_deliver;
1946 rth->rt_flags = flags|RTCF_LOCAL;
1947 if (res.type == RTN_UNREACHABLE) {
1948 rth->u.dst.input= ip_error;
1949 rth->u.dst.error= -err;
1950 rth->rt_flags &= ~RTCF_LOCAL;
1951 }
1952 rth->rt_type = res.type;
8c7bc840 1953 hash = rt_hash(daddr, saddr, fl.iif);
1da177e4
LT
1954 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1955 goto done;
1956
1957no_route:
1958 RT_CACHE_STAT_INC(in_no_route);
1959 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1960 res.type = RTN_UNREACHABLE;
7f53878d
MC
1961 if (err == -ESRCH)
1962 err = -ENETUNREACH;
1da177e4
LT
1963 goto local_input;
1964
1965 /*
1966 * Do not cache martian addresses: they should be logged (RFC1812)
1967 */
1968martian_destination:
1969 RT_CACHE_STAT_INC(in_martian_dst);
1970#ifdef CONFIG_IP_ROUTE_VERBOSE
1971 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1972 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1973 "%u.%u.%u.%u, dev %s\n",
1974 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1975#endif
2c2910a4
DE
1976
1977e_hostunreach:
e905a9ed
YH
1978 err = -EHOSTUNREACH;
1979 goto done;
2c2910a4 1980
1da177e4
LT
1981e_inval:
1982 err = -EINVAL;
1983 goto done;
1984
1985e_nobufs:
1986 err = -ENOBUFS;
1987 goto done;
1988
1989martian_source:
1990 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1991 goto e_inval;
1992}
1993
9e12bb22 1994int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1995 u8 tos, struct net_device *dev)
1996{
1997 struct rtable * rth;
1998 unsigned hash;
1999 int iif = dev->ifindex;
2000
2001 tos &= IPTOS_RT_MASK;
8c7bc840 2002 hash = rt_hash(daddr, saddr, iif);
1da177e4
LT
2003
2004 rcu_read_lock();
2005 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
093c2ca4 2006 rth = rcu_dereference(rth->u.dst.rt_next)) {
1da177e4
LT
2007 if (rth->fl.fl4_dst == daddr &&
2008 rth->fl.fl4_src == saddr &&
2009 rth->fl.iif == iif &&
2010 rth->fl.oif == 0 &&
47dcf0cb 2011 rth->fl.mark == skb->mark &&
1da177e4 2012 rth->fl.fl4_tos == tos) {
03f49f34 2013 dst_use(&rth->u.dst, jiffies);
1da177e4
LT
2014 RT_CACHE_STAT_INC(in_hit);
2015 rcu_read_unlock();
2016 skb->dst = (struct dst_entry*)rth;
2017 return 0;
2018 }
2019 RT_CACHE_STAT_INC(in_hlist_search);
2020 }
2021 rcu_read_unlock();
2022
2023 /* Multicast recognition logic has been moved from the route cache to here.
2024 The problem was that too many Ethernet cards have broken/missing
2025 hardware multicast filters :-( As a result, a host on a multicast
2026 network acquires a lot of useless route cache entries, e.g. for
2027 SDR messages from all over the world. Now we try to get rid of them.
2028 Really, provided the software IP multicast filter is organized
2029 reasonably (at least, hashed), it does not result in a slowdown
2030 compared with route cache reject entries.
2031 Note that multicast routers are not affected, because a
2032 route cache entry is created for them eventually.
2033 */
2034 if (MULTICAST(daddr)) {
2035 struct in_device *in_dev;
2036
2037 rcu_read_lock();
e5ed6399 2038 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
1da177e4 2039 int our = ip_check_mc(in_dev, daddr, saddr,
eddc9ec5 2040 ip_hdr(skb)->protocol);
1da177e4
LT
2041 if (our
2042#ifdef CONFIG_IP_MROUTE
2043 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2044#endif
2045 ) {
2046 rcu_read_unlock();
2047 return ip_route_input_mc(skb, daddr, saddr,
2048 tos, dev, our);
2049 }
2050 }
2051 rcu_read_unlock();
2052 return -EINVAL;
2053 }
2054 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2055}
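/*
 * Example (illustrative sketch only, not a verbatim copy of any caller):
 * the receive path resolves an input route for each packet roughly the way
 * ip_rcv_finish() does, which is the intended use of ip_route_input():
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *
 *	if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev))
 *		goto drop;
 *
 * On failure the caller frees the skb; on success skb->dst is set, and
 * dst_input(skb) dispatches through rth->u.dst.input to either
 * ip_local_deliver() or ip_forward().
 */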
2056
2057static inline int __mkroute_output(struct rtable **result,
e905a9ed 2058 struct fib_result* res,
1da177e4 2059 const struct flowi *fl,
e905a9ed
YH
2060 const struct flowi *oldflp,
2061 struct net_device *dev_out,
2062 unsigned flags)
1da177e4
LT
2063{
2064 struct rtable *rth;
2065 struct in_device *in_dev;
2066 u32 tos = RT_FL_TOS(oldflp);
2067 int err = 0;
2068
2069 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2070 return -EINVAL;
2071
e448515c 2072 if (fl->fl4_dst == htonl(0xFFFFFFFF))
1da177e4
LT
2073 res->type = RTN_BROADCAST;
2074 else if (MULTICAST(fl->fl4_dst))
2075 res->type = RTN_MULTICAST;
2076 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2077 return -EINVAL;
2078
2079 if (dev_out->flags & IFF_LOOPBACK)
2080 flags |= RTCF_LOCAL;
2081
2082 /* get a working reference to the inet device */
2083 in_dev = in_dev_get(dev_out);
2084 if (!in_dev)
2085 return -EINVAL;
2086
2087 if (res->type == RTN_BROADCAST) {
2088 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2089 if (res->fi) {
2090 fib_info_put(res->fi);
2091 res->fi = NULL;
2092 }
2093 } else if (res->type == RTN_MULTICAST) {
2094 flags |= RTCF_MULTICAST|RTCF_LOCAL;
e905a9ed 2095 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
1da177e4
LT
2096 oldflp->proto))
2097 flags &= ~RTCF_LOCAL;
2098 /* If a multicast route does not exist, use
2099 the default one, but do not gateway in this case.
2100 Yes, it is a hack.
2101 */
2102 if (res->fi && res->prefixlen < 4) {
2103 fib_info_put(res->fi);
2104 res->fi = NULL;
2105 }
2106 }
2107
2108
2109 rth = dst_alloc(&ipv4_dst_ops);
2110 if (!rth) {
2111 err = -ENOBUFS;
2112 goto cleanup;
e905a9ed 2113 }
1da177e4 2114
ce723d8e 2115 atomic_set(&rth->u.dst.__refcnt, 1);
1da177e4 2116 rth->u.dst.flags= DST_HOST;
42f811b8 2117 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
1da177e4 2118 rth->u.dst.flags |= DST_NOXFRM;
42f811b8 2119 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4
LT
2120 rth->u.dst.flags |= DST_NOPOLICY;
2121
2122 rth->fl.fl4_dst = oldflp->fl4_dst;
2123 rth->fl.fl4_tos = tos;
2124 rth->fl.fl4_src = oldflp->fl4_src;
2125 rth->fl.oif = oldflp->oif;
47dcf0cb 2126 rth->fl.mark = oldflp->mark;
1da177e4
LT
2127 rth->rt_dst = fl->fl4_dst;
2128 rth->rt_src = fl->fl4_src;
2129 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
e905a9ed 2130 /* get references to the devices that are to be held by the routing
1da177e4
LT
2131 cache entry */
2132 rth->u.dst.dev = dev_out;
2133 dev_hold(dev_out);
2134 rth->idev = in_dev_get(dev_out);
2135 rth->rt_gateway = fl->fl4_dst;
2136 rth->rt_spec_dst= fl->fl4_src;
2137
2138 rth->u.dst.output=ip_output;
2139
2140 RT_CACHE_STAT_INC(out_slow_tot);
2141
2142 if (flags & RTCF_LOCAL) {
2143 rth->u.dst.input = ip_local_deliver;
2144 rth->rt_spec_dst = fl->fl4_dst;
2145 }
2146 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2147 rth->rt_spec_dst = fl->fl4_src;
e905a9ed 2148 if (flags & RTCF_LOCAL &&
1da177e4
LT
2149 !(dev_out->flags & IFF_LOOPBACK)) {
2150 rth->u.dst.output = ip_mc_output;
2151 RT_CACHE_STAT_INC(out_slow_mc);
2152 }
2153#ifdef CONFIG_IP_MROUTE
2154 if (res->type == RTN_MULTICAST) {
2155 if (IN_DEV_MFORWARD(in_dev) &&
2156 !LOCAL_MCAST(oldflp->fl4_dst)) {
2157 rth->u.dst.input = ip_mr_input;
2158 rth->u.dst.output = ip_mc_output;
2159 }
2160 }
2161#endif
2162 }
2163
2164 rt_set_nexthop(rth, res, 0);
2165
2166 rth->rt_flags = flags;
2167
2168 *result = rth;
2169 cleanup:
2170 /* release the working reference to the inet device */
2171 in_dev_put(in_dev);
2172
2173 return err;
2174}
2175
e06e7c61
DM
2176static inline int ip_mkroute_output(struct rtable **rp,
2177 struct fib_result* res,
2178 const struct flowi *fl,
2179 const struct flowi *oldflp,
2180 struct net_device *dev_out,
2181 unsigned flags)
1da177e4 2182{
7abaa27c 2183 struct rtable *rth = NULL;
1da177e4
LT
2184 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2185 unsigned hash;
2186 if (err == 0) {
8c7bc840 2187 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
1da177e4
LT
2188 err = rt_intern_hash(hash, rth, rp);
2189 }
e905a9ed 2190
1da177e4
LT
2191 return err;
2192}
2193
1da177e4
LT
2194/*
2195 * Major route resolver routine.
2196 */
2197
2198static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2199{
2200 u32 tos = RT_FL_TOS(oldflp);
2201 struct flowi fl = { .nl_u = { .ip4_u =
2202 { .daddr = oldflp->fl4_dst,
2203 .saddr = oldflp->fl4_src,
2204 .tos = tos & IPTOS_RT_MASK,
2205 .scope = ((tos & RTO_ONLINK) ?
2206 RT_SCOPE_LINK :
2207 RT_SCOPE_UNIVERSE),
1da177e4 2208 } },
47dcf0cb 2209 .mark = oldflp->mark,
2774c7ab 2210 .iif = init_net.loopback_dev->ifindex,
1da177e4
LT
2211 .oif = oldflp->oif };
2212 struct fib_result res;
2213 unsigned flags = 0;
2214 struct net_device *dev_out = NULL;
2215 int free_res = 0;
2216 int err;
2217
2218
2219 res.fi = NULL;
2220#ifdef CONFIG_IP_MULTIPLE_TABLES
2221 res.r = NULL;
2222#endif
2223
2224 if (oldflp->fl4_src) {
2225 err = -EINVAL;
2226 if (MULTICAST(oldflp->fl4_src) ||
2227 BADCLASS(oldflp->fl4_src) ||
2228 ZERONET(oldflp->fl4_src))
2229 goto out;
2230
2231 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2232 dev_out = ip_dev_find(oldflp->fl4_src);
f6c5d736 2233 if (dev_out == NULL)
1da177e4
LT
2234 goto out;
2235
2236 /* I removed check for oif == dev_out->oif here.
2237 It was wrong for two reasons:
2238 1. ip_dev_find(saddr) can return the wrong iface if saddr is
2239 assigned to multiple interfaces.
2240 2. Moreover, we are allowed to send packets with the saddr
2241 of another iface. --ANK
2242 */
2243
f6c5d736 2244 if (oldflp->oif == 0
e448515c 2245 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
1da177e4
LT
2246 /* Special hack: the user can direct multicasts
2247 and limited broadcasts via the necessary interface
2248 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2249 This hack is not just for fun, it allows
2250 vic, vat and friends to work.
2251 They bind a socket to loopback, set the ttl to zero
2252 and expect that it will work.
2253 From the viewpoint of the routing cache they are broken,
2254 because we are not allowed to build a multicast path
2255 with a loopback source addr (look, the routing cache
2256 cannot know that the ttl is zero, so the packet
2257 will not leave this host and the route is valid).
2258 Luckily, this hack is a good workaround.
2259 */
2260
2261 fl.oif = dev_out->ifindex;
2262 goto make_route;
2263 }
2264 if (dev_out)
2265 dev_put(dev_out);
2266 dev_out = NULL;
2267 }
2268
2269
2270 if (oldflp->oif) {
881d966b 2271 dev_out = dev_get_by_index(&init_net, oldflp->oif);
1da177e4
LT
2272 err = -ENODEV;
2273 if (dev_out == NULL)
2274 goto out;
e5ed6399
HX
2275
2276 /* RACE: Check return value of inet_select_addr instead. */
2277 if (__in_dev_get_rtnl(dev_out) == NULL) {
1da177e4
LT
2278 dev_put(dev_out);
2279 goto out; /* Wrong error code */
2280 }
2281
e448515c 2282 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
1da177e4
LT
2283 if (!fl.fl4_src)
2284 fl.fl4_src = inet_select_addr(dev_out, 0,
2285 RT_SCOPE_LINK);
2286 goto make_route;
2287 }
2288 if (!fl.fl4_src) {
2289 if (MULTICAST(oldflp->fl4_dst))
2290 fl.fl4_src = inet_select_addr(dev_out, 0,
2291 fl.fl4_scope);
2292 else if (!oldflp->fl4_dst)
2293 fl.fl4_src = inet_select_addr(dev_out, 0,
2294 RT_SCOPE_HOST);
2295 }
2296 }
2297
2298 if (!fl.fl4_dst) {
2299 fl.fl4_dst = fl.fl4_src;
2300 if (!fl.fl4_dst)
2301 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2302 if (dev_out)
2303 dev_put(dev_out);
2774c7ab 2304 dev_out = init_net.loopback_dev;
1da177e4 2305 dev_hold(dev_out);
2774c7ab 2306 fl.oif = init_net.loopback_dev->ifindex;
1da177e4
LT
2307 res.type = RTN_LOCAL;
2308 flags |= RTCF_LOCAL;
2309 goto make_route;
2310 }
2311
2312 if (fib_lookup(&fl, &res)) {
2313 res.fi = NULL;
2314 if (oldflp->oif) {
2315 /* Apparently, the routing tables are wrong. Assume
2316 that the destination is on-link.
2317
2318 WHY? DW.
2319 Because we are allowed to send to an iface
2320 even if it has NO routes and NO assigned
2321 addresses. When oif is specified, the routing
2322 tables are looked up with only one purpose:
2323 to catch whether the destination is gatewayed, rather than
2324 direct. Moreover, if MSG_DONTROUTE is set,
2325 we send the packet, ignoring both routing tables
2326 and ifaddr state. --ANK
2327
2328
2329 We could do this even if oif is unknown
2330 (likely as IPv6 does), but we do not.
2331 */
2332
2333 if (fl.fl4_src == 0)
2334 fl.fl4_src = inet_select_addr(dev_out, 0,
2335 RT_SCOPE_LINK);
2336 res.type = RTN_UNICAST;
2337 goto make_route;
2338 }
2339 if (dev_out)
2340 dev_put(dev_out);
2341 err = -ENETUNREACH;
2342 goto out;
2343 }
2344 free_res = 1;
2345
2346 if (res.type == RTN_LOCAL) {
2347 if (!fl.fl4_src)
2348 fl.fl4_src = fl.fl4_dst;
2349 if (dev_out)
2350 dev_put(dev_out);
2774c7ab 2351 dev_out = init_net.loopback_dev;
1da177e4
LT
2352 dev_hold(dev_out);
2353 fl.oif = dev_out->ifindex;
2354 if (res.fi)
2355 fib_info_put(res.fi);
2356 res.fi = NULL;
2357 flags |= RTCF_LOCAL;
2358 goto make_route;
2359 }
2360
2361#ifdef CONFIG_IP_ROUTE_MULTIPATH
2362 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2363 fib_select_multipath(&fl, &res);
2364 else
2365#endif
2366 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2367 fib_select_default(&fl, &res);
2368
2369 if (!fl.fl4_src)
2370 fl.fl4_src = FIB_RES_PREFSRC(res);
2371
2372 if (dev_out)
2373 dev_put(dev_out);
2374 dev_out = FIB_RES_DEV(res);
2375 dev_hold(dev_out);
2376 fl.oif = dev_out->ifindex;
2377
2378
2379make_route:
2380 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2381
2382
2383 if (free_res)
2384 fib_res_put(&res);
2385 if (dev_out)
2386 dev_put(dev_out);
2387out: return err;
2388}
2389
2390int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2391{
2392 unsigned hash;
2393 struct rtable *rth;
2394
8c7bc840 2395 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
1da177e4
LT
2396
2397 rcu_read_lock_bh();
2398 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
093c2ca4 2399 rth = rcu_dereference(rth->u.dst.rt_next)) {
1da177e4
LT
2400 if (rth->fl.fl4_dst == flp->fl4_dst &&
2401 rth->fl.fl4_src == flp->fl4_src &&
2402 rth->fl.iif == 0 &&
2403 rth->fl.oif == flp->oif &&
47dcf0cb 2404 rth->fl.mark == flp->mark &&
1da177e4
LT
2405 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2406 (IPTOS_RT_MASK | RTO_ONLINK))) {
03f49f34 2407 dst_use(&rth->u.dst, jiffies);
1da177e4
LT
2408 RT_CACHE_STAT_INC(out_hit);
2409 rcu_read_unlock_bh();
2410 *rp = rth;
2411 return 0;
2412 }
2413 RT_CACHE_STAT_INC(out_hlist_search);
2414 }
2415 rcu_read_unlock_bh();
2416
2417 return ip_route_output_slow(rp, flp);
2418}
2419
d8c97a94
ACM
2420EXPORT_SYMBOL_GPL(__ip_route_output_key);
2421
14e50e57
DM
2422static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2423{
2424}
2425
2426static struct dst_ops ipv4_dst_blackhole_ops = {
2427 .family = AF_INET,
2428 .protocol = __constant_htons(ETH_P_IP),
2429 .destroy = ipv4_dst_destroy,
2430 .check = ipv4_dst_check,
2431 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2432 .entry_size = sizeof(struct rtable),
2433};
2434
2435
14e50e57
DM
2436static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2437{
2438 struct rtable *ort = *rp;
2439 struct rtable *rt = (struct rtable *)
2440 dst_alloc(&ipv4_dst_blackhole_ops);
2441
2442 if (rt) {
2443 struct dst_entry *new = &rt->u.dst;
2444
2445 atomic_set(&new->__refcnt, 1);
2446 new->__use = 1;
352e512c
HX
2447 new->input = dst_discard;
2448 new->output = dst_discard;
14e50e57
DM
2449 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2450
2451 new->dev = ort->u.dst.dev;
2452 if (new->dev)
2453 dev_hold(new->dev);
2454
2455 rt->fl = ort->fl;
2456
2457 rt->idev = ort->idev;
2458 if (rt->idev)
2459 in_dev_hold(rt->idev);
2460 rt->rt_flags = ort->rt_flags;
2461 rt->rt_type = ort->rt_type;
2462 rt->rt_dst = ort->rt_dst;
2463 rt->rt_src = ort->rt_src;
2464 rt->rt_iif = ort->rt_iif;
2465 rt->rt_gateway = ort->rt_gateway;
2466 rt->rt_spec_dst = ort->rt_spec_dst;
2467 rt->peer = ort->peer;
2468 if (rt->peer)
2469 atomic_inc(&rt->peer->refcnt);
2470
2471 dst_free(new);
2472 }
2473
2474 dst_release(&(*rp)->u.dst);
2475 *rp = rt;
2476 return (rt ? 0 : -ENOMEM);
2477}
2478
1da177e4
LT
2479int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2480{
2481 int err;
2482
2483 if ((err = __ip_route_output_key(rp, flp)) != 0)
2484 return err;
2485
2486 if (flp->proto) {
2487 if (!flp->fl4_src)
2488 flp->fl4_src = (*rp)->rt_src;
2489 if (!flp->fl4_dst)
2490 flp->fl4_dst = (*rp)->rt_dst;
14e50e57
DM
2491 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2492 if (err == -EREMOTE)
2493 err = ipv4_dst_blackhole(rp, flp, sk);
2494
2495 return err;
1da177e4
LT
2496 }
2497
2498 return 0;
2499}
2500
d8c97a94
ACM
2501EXPORT_SYMBOL_GPL(ip_route_output_flow);
2502
1da177e4
LT
2503int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2504{
2505 return ip_route_output_flow(rp, flp, NULL, 0);
2506}
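/*
 * Example (illustrative sketch only; dst, src and tos stand for whatever
 * values the caller has at hand): a typical user of the output routing API
 * fills a flow key, resolves it, and drops the reference when done:
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
 *						 .saddr = src,
 *						 .tos   = RT_TOS(tos) } },
 *			    .oif = 0 };
 *	struct rtable *rt;
 *	int err = ip_route_output_key(&rt, &fl);
 *
 *	if (err)
 *		return err;
 *	... use rt->rt_src, &rt->u.dst ...
 *	ip_rt_put(rt);
 */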
2507
2508static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
b6544c0b 2509 int nowait, unsigned int flags)
1da177e4
LT
2510{
2511 struct rtable *rt = (struct rtable*)skb->dst;
2512 struct rtmsg *r;
be403ea1 2513 struct nlmsghdr *nlh;
e3703b3d
TG
2514 long expires;
2515 u32 id = 0, ts = 0, tsage = 0, error;
be403ea1
TG
2516
2517 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2518 if (nlh == NULL)
26932566 2519 return -EMSGSIZE;
be403ea1
TG
2520
2521 r = nlmsg_data(nlh);
1da177e4
LT
2522 r->rtm_family = AF_INET;
2523 r->rtm_dst_len = 32;
2524 r->rtm_src_len = 0;
2525 r->rtm_tos = rt->fl.fl4_tos;
2526 r->rtm_table = RT_TABLE_MAIN;
be403ea1 2527 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
1da177e4
LT
2528 r->rtm_type = rt->rt_type;
2529 r->rtm_scope = RT_SCOPE_UNIVERSE;
2530 r->rtm_protocol = RTPROT_UNSPEC;
2531 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2532 if (rt->rt_flags & RTCF_NOTIFY)
2533 r->rtm_flags |= RTM_F_NOTIFY;
be403ea1 2534
17fb2c64 2535 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
be403ea1 2536
1da177e4
LT
2537 if (rt->fl.fl4_src) {
2538 r->rtm_src_len = 32;
17fb2c64 2539 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
1da177e4
LT
2540 }
2541 if (rt->u.dst.dev)
be403ea1 2542 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
1da177e4
LT
2543#ifdef CONFIG_NET_CLS_ROUTE
2544 if (rt->u.dst.tclassid)
be403ea1 2545 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
1da177e4
LT
2546#endif
2547 if (rt->fl.iif)
17fb2c64 2548 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
1da177e4 2549 else if (rt->rt_src != rt->fl.fl4_src)
17fb2c64 2550 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
be403ea1 2551
1da177e4 2552 if (rt->rt_dst != rt->rt_gateway)
17fb2c64 2553 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
be403ea1 2554
1da177e4 2555 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
be403ea1
TG
2556 goto nla_put_failure;
2557
e3703b3d
TG
2558 error = rt->u.dst.error;
2559 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
1da177e4 2560 if (rt->peer) {
e3703b3d 2561 id = rt->peer->ip_id_count;
1da177e4 2562 if (rt->peer->tcp_ts_stamp) {
e3703b3d 2563 ts = rt->peer->tcp_ts;
9d729f72 2564 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
1da177e4
LT
2565 }
2566 }
be403ea1 2567
1da177e4
LT
2568 if (rt->fl.iif) {
2569#ifdef CONFIG_IP_MROUTE
e448515c 2570 __be32 dst = rt->rt_dst;
1da177e4
LT
2571
2572 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
42f811b8 2573 IPV4_DEVCONF_ALL(MC_FORWARDING)) {
1da177e4
LT
2574 int err = ipmr_get_route(skb, r, nowait);
2575 if (err <= 0) {
2576 if (!nowait) {
2577 if (err == 0)
2578 return 0;
be403ea1 2579 goto nla_put_failure;
1da177e4
LT
2580 } else {
2581 if (err == -EMSGSIZE)
be403ea1 2582 goto nla_put_failure;
e3703b3d 2583 error = err;
1da177e4
LT
2584 }
2585 }
2586 } else
2587#endif
be403ea1 2588 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
1da177e4
LT
2589 }
2590
e3703b3d
TG
2591 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2592 expires, error) < 0)
2593 goto nla_put_failure;
be403ea1
TG
2594
2595 return nlmsg_end(skb, nlh);
1da177e4 2596
be403ea1 2597nla_put_failure:
26932566
PM
2598 nlmsg_cancel(skb, nlh);
2599 return -EMSGSIZE;
1da177e4
LT
2600}
2601
63f3444f 2602static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1da177e4 2603{
b854272b 2604 struct net *net = in_skb->sk->sk_net;
d889ce3b
TG
2605 struct rtmsg *rtm;
2606 struct nlattr *tb[RTA_MAX+1];
1da177e4 2607 struct rtable *rt = NULL;
9e12bb22
AV
2608 __be32 dst = 0;
2609 __be32 src = 0;
2610 u32 iif;
d889ce3b 2611 int err;
1da177e4
LT
2612 struct sk_buff *skb;
2613
b854272b
DL
2614 if (net != &init_net)
2615 return -EINVAL;
2616
d889ce3b
TG
2617 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2618 if (err < 0)
2619 goto errout;
2620
2621 rtm = nlmsg_data(nlh);
2622
1da177e4 2623 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
d889ce3b
TG
2624 if (skb == NULL) {
2625 err = -ENOBUFS;
2626 goto errout;
2627 }
1da177e4
LT
2628
2629 /* Reserve room for dummy headers; this skb can pass
2630 through a good chunk of the routing engine.
2631 */
459a98ed 2632 skb_reset_mac_header(skb);
c1d2bbe1 2633 skb_reset_network_header(skb);
d2c962b8
SH
2634
2635 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
eddc9ec5 2636 ip_hdr(skb)->protocol = IPPROTO_ICMP;
1da177e4
LT
2637 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2638
17fb2c64
AV
2639 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2640 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
d889ce3b 2641 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
1da177e4
LT
2642
2643 if (iif) {
d889ce3b
TG
2644 struct net_device *dev;
2645
881d966b 2646 dev = __dev_get_by_index(&init_net, iif);
d889ce3b
TG
2647 if (dev == NULL) {
2648 err = -ENODEV;
2649 goto errout_free;
2650 }
2651
1da177e4
LT
2652 skb->protocol = htons(ETH_P_IP);
2653 skb->dev = dev;
2654 local_bh_disable();
2655 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2656 local_bh_enable();
d889ce3b
TG
2657
2658 rt = (struct rtable*) skb->dst;
2659 if (err == 0 && rt->u.dst.error)
1da177e4
LT
2660 err = -rt->u.dst.error;
2661 } else {
d889ce3b
TG
2662 struct flowi fl = {
2663 .nl_u = {
2664 .ip4_u = {
2665 .daddr = dst,
2666 .saddr = src,
2667 .tos = rtm->rtm_tos,
2668 },
2669 },
2670 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2671 };
1da177e4
LT
2672 err = ip_route_output_key(&rt, &fl);
2673 }
d889ce3b 2674
1da177e4 2675 if (err)
d889ce3b 2676 goto errout_free;
1da177e4
LT
2677
2678 skb->dst = &rt->u.dst;
2679 if (rtm->rtm_flags & RTM_F_NOTIFY)
2680 rt->rt_flags |= RTCF_NOTIFY;
2681
1da177e4 2682 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
b6544c0b 2683 RTM_NEWROUTE, 0, 0);
d889ce3b
TG
2684 if (err <= 0)
2685 goto errout_free;
1da177e4 2686
97c53cac 2687 err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
d889ce3b 2688errout:
2942e900 2689 return err;
1da177e4 2690
d889ce3b 2691errout_free:
1da177e4 2692 kfree_skb(skb);
d889ce3b 2693 goto errout;
1da177e4
LT
2694}
2695
2696int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2697{
2698 struct rtable *rt;
2699 int h, s_h;
2700 int idx, s_idx;
2701
2702 s_h = cb->args[0];
d8c92830
ED
2703 if (s_h < 0)
2704 s_h = 0;
1da177e4 2705 s_idx = idx = cb->args[1];
d8c92830 2706 for (h = s_h; h <= rt_hash_mask; h++) {
1da177e4
LT
2707 rcu_read_lock_bh();
2708 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
093c2ca4 2709 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
1da177e4
LT
2710 if (idx < s_idx)
2711 continue;
2712 skb->dst = dst_clone(&rt->u.dst);
2713 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
e905a9ed 2714 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
b6544c0b 2715 1, NLM_F_MULTI) <= 0) {
1da177e4
LT
2716 dst_release(xchg(&skb->dst, NULL));
2717 rcu_read_unlock_bh();
2718 goto done;
2719 }
2720 dst_release(xchg(&skb->dst, NULL));
2721 }
2722 rcu_read_unlock_bh();
d8c92830 2723 s_idx = 0;
1da177e4
LT
2724 }
2725
2726done:
2727 cb->args[0] = h;
2728 cb->args[1] = idx;
2729 return skb->len;
2730}
2731
2732void ip_rt_multicast_event(struct in_device *in_dev)
2733{
2734 rt_cache_flush(0);
2735}
2736
2737#ifdef CONFIG_SYSCTL
2738static int flush_delay;
2739
2740static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2741 struct file *filp, void __user *buffer,
2742 size_t *lenp, loff_t *ppos)
2743{
2744 if (write) {
2745 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2746 rt_cache_flush(flush_delay);
2747 return 0;
e905a9ed 2748 }
1da177e4
LT
2749
2750 return -EINVAL;
2751}
2752
2753static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2754 int __user *name,
2755 int nlen,
2756 void __user *oldval,
2757 size_t __user *oldlenp,
2758 void __user *newval,
1f29bcd7 2759 size_t newlen)
1da177e4
LT
2760{
2761 int delay;
2762 if (newlen != sizeof(int))
2763 return -EINVAL;
2764 if (get_user(delay, (int __user *)newval))
e905a9ed
YH
2765 return -EFAULT;
2766 rt_cache_flush(delay);
1da177e4
LT
2767 return 0;
2768}
2769
2770ctl_table ipv4_route_table[] = {
e905a9ed 2771 {
1da177e4
LT
2772 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2773 .procname = "flush",
2774 .data = &flush_delay,
2775 .maxlen = sizeof(int),
7e3e0360 2776 .mode = 0200,
1da177e4
LT
2777 .proc_handler = &ipv4_sysctl_rtcache_flush,
2778 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2779 },
2780 {
2781 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2782 .procname = "min_delay",
2783 .data = &ip_rt_min_delay,
2784 .maxlen = sizeof(int),
2785 .mode = 0644,
2786 .proc_handler = &proc_dointvec_jiffies,
2787 .strategy = &sysctl_jiffies,
2788 },
2789 {
2790 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2791 .procname = "max_delay",
2792 .data = &ip_rt_max_delay,
2793 .maxlen = sizeof(int),
2794 .mode = 0644,
2795 .proc_handler = &proc_dointvec_jiffies,
2796 .strategy = &sysctl_jiffies,
2797 },
2798 {
2799 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2800 .procname = "gc_thresh",
2801 .data = &ipv4_dst_ops.gc_thresh,
2802 .maxlen = sizeof(int),
2803 .mode = 0644,
2804 .proc_handler = &proc_dointvec,
2805 },
2806 {
2807 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2808 .procname = "max_size",
2809 .data = &ip_rt_max_size,
2810 .maxlen = sizeof(int),
2811 .mode = 0644,
2812 .proc_handler = &proc_dointvec,
2813 },
2814 {
2815 /* Deprecated. Use gc_min_interval_ms */
e905a9ed 2816
1da177e4
LT
2817 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2818 .procname = "gc_min_interval",
2819 .data = &ip_rt_gc_min_interval,
2820 .maxlen = sizeof(int),
2821 .mode = 0644,
2822 .proc_handler = &proc_dointvec_jiffies,
2823 .strategy = &sysctl_jiffies,
2824 },
2825 {
2826 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2827 .procname = "gc_min_interval_ms",
2828 .data = &ip_rt_gc_min_interval,
2829 .maxlen = sizeof(int),
2830 .mode = 0644,
2831 .proc_handler = &proc_dointvec_ms_jiffies,
2832 .strategy = &sysctl_ms_jiffies,
2833 },
2834 {
2835 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2836 .procname = "gc_timeout",
2837 .data = &ip_rt_gc_timeout,
2838 .maxlen = sizeof(int),
2839 .mode = 0644,
2840 .proc_handler = &proc_dointvec_jiffies,
2841 .strategy = &sysctl_jiffies,
2842 },
2843 {
2844 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2845 .procname = "gc_interval",
2846 .data = &ip_rt_gc_interval,
2847 .maxlen = sizeof(int),
2848 .mode = 0644,
2849 .proc_handler = &proc_dointvec_jiffies,
2850 .strategy = &sysctl_jiffies,
2851 },
2852 {
2853 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2854 .procname = "redirect_load",
2855 .data = &ip_rt_redirect_load,
2856 .maxlen = sizeof(int),
2857 .mode = 0644,
2858 .proc_handler = &proc_dointvec,
2859 },
2860 {
2861 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2862 .procname = "redirect_number",
2863 .data = &ip_rt_redirect_number,
2864 .maxlen = sizeof(int),
2865 .mode = 0644,
2866 .proc_handler = &proc_dointvec,
2867 },
2868 {
2869 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2870 .procname = "redirect_silence",
2871 .data = &ip_rt_redirect_silence,
2872 .maxlen = sizeof(int),
2873 .mode = 0644,
2874 .proc_handler = &proc_dointvec,
2875 },
2876 {
2877 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2878 .procname = "error_cost",
2879 .data = &ip_rt_error_cost,
2880 .maxlen = sizeof(int),
2881 .mode = 0644,
2882 .proc_handler = &proc_dointvec,
2883 },
2884 {
2885 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2886 .procname = "error_burst",
2887 .data = &ip_rt_error_burst,
2888 .maxlen = sizeof(int),
2889 .mode = 0644,
2890 .proc_handler = &proc_dointvec,
2891 },
2892 {
2893 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2894 .procname = "gc_elasticity",
2895 .data = &ip_rt_gc_elasticity,
2896 .maxlen = sizeof(int),
2897 .mode = 0644,
2898 .proc_handler = &proc_dointvec,
2899 },
2900 {
2901 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2902 .procname = "mtu_expires",
2903 .data = &ip_rt_mtu_expires,
2904 .maxlen = sizeof(int),
2905 .mode = 0644,
2906 .proc_handler = &proc_dointvec_jiffies,
2907 .strategy = &sysctl_jiffies,
2908 },
2909 {
2910 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2911 .procname = "min_pmtu",
2912 .data = &ip_rt_min_pmtu,
2913 .maxlen = sizeof(int),
2914 .mode = 0644,
2915 .proc_handler = &proc_dointvec,
2916 },
2917 {
2918 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2919 .procname = "min_adv_mss",
2920 .data = &ip_rt_min_advmss,
2921 .maxlen = sizeof(int),
2922 .mode = 0644,
2923 .proc_handler = &proc_dointvec,
2924 },
2925 {
2926 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
2927 .procname = "secret_interval",
2928 .data = &ip_rt_secret_interval,
2929 .maxlen = sizeof(int),
2930 .mode = 0644,
2931 .proc_handler = &proc_dointvec_jiffies,
2932 .strategy = &sysctl_jiffies,
2933 },
2934 { .ctl_name = 0 }
2935};
2936#endif
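/*
 * Usage note (illustrative, not part of the original table above): the
 * "flush" entry is write-only (mode 0200); writing an integer stores it in
 * flush_delay via proc_dointvec and passes it to rt_cache_flush(), so e.g.
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * requests an immediate flush of the routing cache.
 */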
2937
2938#ifdef CONFIG_NET_CLS_ROUTE
8dbde28d 2939struct ip_rt_acct *ip_rt_acct __read_mostly;
1da177e4
LT
2940#endif /* CONFIG_NET_CLS_ROUTE */
2941
2942static __initdata unsigned long rhash_entries;
2943static int __init set_rhash_entries(char *str)
2944{
2945 if (!str)
2946 return 0;
2947 rhash_entries = simple_strtoul(str, &str, 0);
2948 return 1;
2949}
2950__setup("rhash_entries=", set_rhash_entries);
2951
2952int __init ip_rt_init(void)
2953{
424c4b70 2954 int rc = 0;
1da177e4
LT
2955
2956 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2957 (jiffies ^ (jiffies >> 7)));
2958
2959#ifdef CONFIG_NET_CLS_ROUTE
8dbde28d 2960 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
1da177e4
LT
2961 if (!ip_rt_acct)
2962 panic("IP: failed to allocate ip_rt_acct\n");
1da177e4
LT
2963#endif
2964
e5d679f3
AD
2965 ipv4_dst_ops.kmem_cachep =
2966 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 2967 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 2968
14e50e57
DM
2969 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2970
424c4b70
ED
2971 rt_hash_table = (struct rt_hash_bucket *)
2972 alloc_large_system_hash("IP route cache",
2973 sizeof(struct rt_hash_bucket),
2974 rhash_entries,
2975 (num_physpages >= 128 * 1024) ?
18955cfc 2976 15 : 17,
8d1502de 2977 0,
424c4b70
ED
2978 &rt_hash_log,
2979 &rt_hash_mask,
2980 0);
22c047cc
ED
2981 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
2982 rt_hash_lock_init();
1da177e4
LT
2983
2984 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2985 ip_rt_max_size = (rt_hash_mask + 1) * 16;
2986
1da177e4
LT
2987 devinet_init();
2988 ip_fib_init();
2989
b24b8a24
PE
2990 setup_timer(&rt_flush_timer, rt_run_flush, 0);
2991 setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
1da177e4
LT
2992
2993 /* All the timers started at system startup tend
2994 to synchronize. Perturb them a bit.
2995 */
39c90ece
ED
2996 schedule_delayed_work(&expires_work,
2997 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
1da177e4
LT
2998
2999 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3000 ip_rt_secret_interval;
3001 add_timer(&rt_secret_timer);
3002
3003#ifdef CONFIG_PROC_FS
3004 {
3005 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
457c4cbc 3006 if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
e905a9ed 3007 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
457c4cbc 3008 init_net.proc_net_stat))) {
1da177e4
LT
3009 return -ENOMEM;
3010 }
3011 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3012 }
3013#ifdef CONFIG_NET_CLS_ROUTE
457c4cbc 3014 create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
1da177e4
LT
3015#endif
3016#endif
3017#ifdef CONFIG_XFRM
3018 xfrm_init();
3019 xfrm4_init();
3020#endif
63f3444f
TG
3021 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3022
1da177e4
LT
3023 return rc;
3024}
3025
3026EXPORT_SYMBOL(__ip_select_ident);
3027EXPORT_SYMBOL(ip_route_input);
3028EXPORT_SYMBOL(ip_route_output_key);