]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/ipv4/route.c
[NETLINK]: Mark attribute construction exception unlikely
[net-next-2.6.git] / net / ipv4 / route.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9 *
02c30a84 10 * Authors: Ross Biro
1da177e4
LT
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 *
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
e905a9ed 23 * Alan Cox : Super /proc >4K
1da177e4
LT
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
e905a9ed 41 *
1da177e4
LT
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
58 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
60 *
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
65 */
66
1da177e4
LT
67#include <linux/module.h>
68#include <asm/uaccess.h>
69#include <asm/system.h>
70#include <linux/bitops.h>
71#include <linux/types.h>
72#include <linux/kernel.h>
1da177e4 73#include <linux/mm.h>
424c4b70 74#include <linux/bootmem.h>
1da177e4
LT
75#include <linux/string.h>
76#include <linux/socket.h>
77#include <linux/sockios.h>
78#include <linux/errno.h>
79#include <linux/in.h>
80#include <linux/inet.h>
81#include <linux/netdevice.h>
82#include <linux/proc_fs.h>
83#include <linux/init.h>
39c90ece 84#include <linux/workqueue.h>
1da177e4 85#include <linux/skbuff.h>
1da177e4
LT
86#include <linux/inetdevice.h>
87#include <linux/igmp.h>
88#include <linux/pkt_sched.h>
89#include <linux/mroute.h>
90#include <linux/netfilter_ipv4.h>
91#include <linux/random.h>
92#include <linux/jhash.h>
93#include <linux/rcupdate.h>
94#include <linux/times.h>
352e512c 95#include <net/dst.h>
457c4cbc 96#include <net/net_namespace.h>
1da177e4
LT
97#include <net/protocol.h>
98#include <net/ip.h>
99#include <net/route.h>
100#include <net/inetpeer.h>
101#include <net/sock.h>
102#include <net/ip_fib.h>
103#include <net/arp.h>
104#include <net/tcp.h>
105#include <net/icmp.h>
106#include <net/xfrm.h>
8d71740c 107#include <net/netevent.h>
63f3444f 108#include <net/rtnetlink.h>
1da177e4
LT
109#ifdef CONFIG_SYSCTL
110#include <linux/sysctl.h>
111#endif
112
113#define RT_FL_TOS(oldflp) \
114 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116#define IP_MAX_MTU 0xFFF0
117
118#define RT_GC_TIMEOUT (300*HZ)
119
120static int ip_rt_min_delay = 2 * HZ;
121static int ip_rt_max_delay = 10 * HZ;
122static int ip_rt_max_size;
123static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
124static int ip_rt_gc_interval = 60 * HZ;
125static int ip_rt_gc_min_interval = HZ / 2;
126static int ip_rt_redirect_number = 9;
127static int ip_rt_redirect_load = HZ / 50;
128static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
129static int ip_rt_error_cost = HZ;
130static int ip_rt_error_burst = 5 * HZ;
131static int ip_rt_gc_elasticity = 8;
132static int ip_rt_mtu_expires = 10 * 60 * HZ;
133static int ip_rt_min_pmtu = 512 + 20 + 20;
134static int ip_rt_min_advmss = 256;
135static int ip_rt_secret_interval = 10 * 60 * HZ;
beb659bd 136static int ip_rt_flush_expected;
1da177e4
LT
137static unsigned long rt_deadline;
138
139#define RTprint(a...) printk(KERN_DEBUG a)
140
141static struct timer_list rt_flush_timer;
beb659bd
ED
142static void rt_worker_func(struct work_struct *work);
143static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
1da177e4
LT
144static struct timer_list rt_secret_timer;
145
146/*
147 * Interface to generic destination cache.
148 */
149
150static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
151static void ipv4_dst_destroy(struct dst_entry *dst);
152static void ipv4_dst_ifdown(struct dst_entry *dst,
153 struct net_device *dev, int how);
154static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
155static void ipv4_link_failure(struct sk_buff *skb);
156static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
157static int rt_garbage_collect(void);
158
159
160static struct dst_ops ipv4_dst_ops = {
161 .family = AF_INET,
162 .protocol = __constant_htons(ETH_P_IP),
163 .gc = rt_garbage_collect,
164 .check = ipv4_dst_check,
165 .destroy = ipv4_dst_destroy,
166 .ifdown = ipv4_dst_ifdown,
167 .negative_advice = ipv4_negative_advice,
168 .link_failure = ipv4_link_failure,
169 .update_pmtu = ip_rt_update_pmtu,
862b82c6 170 .local_out = ip_local_out,
1da177e4
LT
171 .entry_size = sizeof(struct rtable),
172};
173
174#define ECN_OR_COST(class) TC_PRIO_##class
175
4839c52b 176const __u8 ip_tos2prio[16] = {
1da177e4
LT
177 TC_PRIO_BESTEFFORT,
178 ECN_OR_COST(FILLER),
179 TC_PRIO_BESTEFFORT,
180 ECN_OR_COST(BESTEFFORT),
181 TC_PRIO_BULK,
182 ECN_OR_COST(BULK),
183 TC_PRIO_BULK,
184 ECN_OR_COST(BULK),
185 TC_PRIO_INTERACTIVE,
186 ECN_OR_COST(INTERACTIVE),
187 TC_PRIO_INTERACTIVE,
188 ECN_OR_COST(INTERACTIVE),
189 TC_PRIO_INTERACTIVE_BULK,
190 ECN_OR_COST(INTERACTIVE_BULK),
191 TC_PRIO_INTERACTIVE_BULK,
192 ECN_OR_COST(INTERACTIVE_BULK)
193};
194
195
196/*
197 * Route cache.
198 */
199
200/* The locking scheme is rather straight forward:
201 *
202 * 1) Read-Copy Update protects the buckets of the central route hash.
203 * 2) Only writers remove entries, and they hold the lock
204 * as they look at rtable reference counts.
205 * 3) Only readers acquire references to rtable entries,
206 * they do so with atomic increments and with the
207 * lock held.
208 */
209
210struct rt_hash_bucket {
211 struct rtable *chain;
22c047cc 212};
8a25d5de
IM
213#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
214 defined(CONFIG_PROVE_LOCKING)
22c047cc
ED
215/*
216 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
217 * The size of this table is a power of two and depends on the number of CPUS.
62051200 218 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
22c047cc 219 */
62051200
IM
220#ifdef CONFIG_LOCKDEP
221# define RT_HASH_LOCK_SZ 256
22c047cc 222#else
62051200
IM
223# if NR_CPUS >= 32
224# define RT_HASH_LOCK_SZ 4096
225# elif NR_CPUS >= 16
226# define RT_HASH_LOCK_SZ 2048
227# elif NR_CPUS >= 8
228# define RT_HASH_LOCK_SZ 1024
229# elif NR_CPUS >= 4
230# define RT_HASH_LOCK_SZ 512
231# else
232# define RT_HASH_LOCK_SZ 256
233# endif
22c047cc
ED
234#endif
235
236static spinlock_t *rt_hash_locks;
237# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
238# define rt_hash_lock_init() { \
239 int i; \
240 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
241 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
242 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
243 spin_lock_init(&rt_hash_locks[i]); \
244 }
245#else
246# define rt_hash_lock_addr(slot) NULL
247# define rt_hash_lock_init()
248#endif
1da177e4
LT
249
250static struct rt_hash_bucket *rt_hash_table;
251static unsigned rt_hash_mask;
cfcabdcc 252static unsigned int rt_hash_log;
1da177e4
LT
253static unsigned int rt_hash_rnd;
254
2f970d83 255static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
dbd2915c 256#define RT_CACHE_STAT_INC(field) \
bfe5d834 257 (__raw_get_cpu_var(rt_cache_stat).field++)
1da177e4
LT
258
259static int rt_intern_hash(unsigned hash, struct rtable *rth,
260 struct rtable **res);
261
cef2685e 262static unsigned int rt_hash_code(u32 daddr, u32 saddr)
1da177e4 263{
cef2685e 264 return (jhash_2words(daddr, saddr, rt_hash_rnd)
1da177e4
LT
265 & rt_hash_mask);
266}
267
8c7bc840
AV
268#define rt_hash(daddr, saddr, idx) \
269 rt_hash_code((__force u32)(__be32)(daddr),\
270 (__force u32)(__be32)(saddr) ^ ((idx) << 5))
271
1da177e4
LT
272#ifdef CONFIG_PROC_FS
273struct rt_cache_iter_state {
274 int bucket;
275};
276
277static struct rtable *rt_cache_get_first(struct seq_file *seq)
278{
279 struct rtable *r = NULL;
280 struct rt_cache_iter_state *st = seq->private;
281
282 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
283 rcu_read_lock_bh();
284 r = rt_hash_table[st->bucket].chain;
285 if (r)
286 break;
287 rcu_read_unlock_bh();
288 }
0bcceadc 289 return rcu_dereference(r);
1da177e4
LT
290}
291
292static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
293{
0bcceadc 294 struct rt_cache_iter_state *st = seq->private;
1da177e4 295
093c2ca4 296 r = r->u.dst.rt_next;
1da177e4
LT
297 while (!r) {
298 rcu_read_unlock_bh();
299 if (--st->bucket < 0)
300 break;
301 rcu_read_lock_bh();
302 r = rt_hash_table[st->bucket].chain;
303 }
0bcceadc 304 return rcu_dereference(r);
1da177e4
LT
305}
306
307static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
308{
309 struct rtable *r = rt_cache_get_first(seq);
310
311 if (r)
312 while (pos && (r = rt_cache_get_next(seq, r)))
313 --pos;
314 return pos ? NULL : r;
315}
316
317static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
318{
319 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
320}
321
322static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
323{
324 struct rtable *r = NULL;
325
326 if (v == SEQ_START_TOKEN)
327 r = rt_cache_get_first(seq);
328 else
329 r = rt_cache_get_next(seq, v);
330 ++*pos;
331 return r;
332}
333
334static void rt_cache_seq_stop(struct seq_file *seq, void *v)
335{
336 if (v && v != SEQ_START_TOKEN)
337 rcu_read_unlock_bh();
338}
339
340static int rt_cache_seq_show(struct seq_file *seq, void *v)
341{
342 if (v == SEQ_START_TOKEN)
343 seq_printf(seq, "%-127s\n",
344 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
345 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
346 "HHUptod\tSpecDst");
347 else {
348 struct rtable *r = v;
349 char temp[256];
350
351 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
352 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
353 r->u.dst.dev ? r->u.dst.dev->name : "*",
354 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
355 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
356 r->u.dst.__use, 0, (unsigned long)r->rt_src,
357 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
358 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
359 dst_metric(&r->u.dst, RTAX_WINDOW),
360 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
361 dst_metric(&r->u.dst, RTAX_RTTVAR)),
362 r->fl.fl4_tos,
363 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
364 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
365 dev_queue_xmit) : 0,
366 r->rt_spec_dst);
367 seq_printf(seq, "%-127s\n", temp);
e905a9ed
YH
368 }
369 return 0;
1da177e4
LT
370}
371
f690808e 372static const struct seq_operations rt_cache_seq_ops = {
1da177e4
LT
373 .start = rt_cache_seq_start,
374 .next = rt_cache_seq_next,
375 .stop = rt_cache_seq_stop,
376 .show = rt_cache_seq_show,
377};
378
379static int rt_cache_seq_open(struct inode *inode, struct file *file)
380{
cf7732e4
PE
381 return seq_open_private(file, &rt_cache_seq_ops,
382 sizeof(struct rt_cache_iter_state));
1da177e4
LT
383}
384
9a32144e 385static const struct file_operations rt_cache_seq_fops = {
1da177e4
LT
386 .owner = THIS_MODULE,
387 .open = rt_cache_seq_open,
388 .read = seq_read,
389 .llseek = seq_lseek,
390 .release = seq_release_private,
391};
392
393
394static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
395{
396 int cpu;
397
398 if (*pos == 0)
399 return SEQ_START_TOKEN;
400
401 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
402 if (!cpu_possible(cpu))
403 continue;
404 *pos = cpu+1;
2f970d83 405 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
406 }
407 return NULL;
408}
409
410static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
411{
412 int cpu;
413
414 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
415 if (!cpu_possible(cpu))
416 continue;
417 *pos = cpu+1;
2f970d83 418 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
419 }
420 return NULL;
e905a9ed 421
1da177e4
LT
422}
423
424static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
425{
426
427}
428
429static int rt_cpu_seq_show(struct seq_file *seq, void *v)
430{
431 struct rt_cache_stat *st = v;
432
433 if (v == SEQ_START_TOKEN) {
5bec0039 434 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
1da177e4
LT
435 return 0;
436 }
e905a9ed 437
1da177e4
LT
438 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
439 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
440 atomic_read(&ipv4_dst_ops.entries),
441 st->in_hit,
442 st->in_slow_tot,
443 st->in_slow_mc,
444 st->in_no_route,
445 st->in_brd,
446 st->in_martian_dst,
447 st->in_martian_src,
448
449 st->out_hit,
450 st->out_slow_tot,
e905a9ed 451 st->out_slow_mc,
1da177e4
LT
452
453 st->gc_total,
454 st->gc_ignored,
455 st->gc_goal_miss,
456 st->gc_dst_overflow,
457 st->in_hlist_search,
458 st->out_hlist_search
459 );
460 return 0;
461}
462
f690808e 463static const struct seq_operations rt_cpu_seq_ops = {
1da177e4
LT
464 .start = rt_cpu_seq_start,
465 .next = rt_cpu_seq_next,
466 .stop = rt_cpu_seq_stop,
467 .show = rt_cpu_seq_show,
468};
469
470
471static int rt_cpu_seq_open(struct inode *inode, struct file *file)
472{
473 return seq_open(file, &rt_cpu_seq_ops);
474}
475
9a32144e 476static const struct file_operations rt_cpu_seq_fops = {
1da177e4
LT
477 .owner = THIS_MODULE,
478 .open = rt_cpu_seq_open,
479 .read = seq_read,
480 .llseek = seq_lseek,
481 .release = seq_release,
482};
483
484#endif /* CONFIG_PROC_FS */
e905a9ed 485
1da177e4
LT
486static __inline__ void rt_free(struct rtable *rt)
487{
1da177e4
LT
488 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
489}
490
491static __inline__ void rt_drop(struct rtable *rt)
492{
1da177e4
LT
493 ip_rt_put(rt);
494 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
495}
496
497static __inline__ int rt_fast_clean(struct rtable *rth)
498{
499 /* Kill broadcast/multicast entries very aggresively, if they
500 collide in hash table with more useful entries */
501 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
093c2ca4 502 rth->fl.iif && rth->u.dst.rt_next;
1da177e4
LT
503}
504
505static __inline__ int rt_valuable(struct rtable *rth)
506{
507 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
508 rth->u.dst.expires;
509}
510
511static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
512{
513 unsigned long age;
514 int ret = 0;
515
516 if (atomic_read(&rth->u.dst.__refcnt))
517 goto out;
518
519 ret = 1;
520 if (rth->u.dst.expires &&
521 time_after_eq(jiffies, rth->u.dst.expires))
522 goto out;
523
524 age = jiffies - rth->u.dst.lastuse;
525 ret = 0;
526 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
527 (age <= tmo2 && rt_valuable(rth)))
528 goto out;
529 ret = 1;
530out: return ret;
531}
532
533/* Bits of score are:
534 * 31: very valuable
535 * 30: not quite useless
536 * 29..0: usage counter
537 */
538static inline u32 rt_score(struct rtable *rt)
539{
540 u32 score = jiffies - rt->u.dst.lastuse;
541
542 score = ~score & ~(3<<30);
543
544 if (rt_valuable(rt))
545 score |= (1<<31);
546
547 if (!rt->fl.iif ||
548 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
549 score |= (1<<30);
550
551 return score;
552}
553
554static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
555{
714e85be
AV
556 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
557 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
47dcf0cb 558 (fl1->mark ^ fl2->mark) |
8238b218
DM
559 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
560 *(u16 *)&fl2->nl_u.ip4_u.tos) |
561 (fl1->oif ^ fl2->oif) |
562 (fl1->iif ^ fl2->iif)) == 0;
1da177e4
LT
563}
564
beb659bd
ED
565/*
566 * Perform a full scan of hash table and free all entries.
567 * Can be called by a softirq or a process.
568 * In the later case, we want to be reschedule if necessary
569 */
570static void rt_do_flush(int process_context)
571{
572 unsigned int i;
573 struct rtable *rth, *next;
574
575 for (i = 0; i <= rt_hash_mask; i++) {
576 if (process_context && need_resched())
577 cond_resched();
578 rth = rt_hash_table[i].chain;
579 if (!rth)
580 continue;
581
582 spin_lock_bh(rt_hash_lock_addr(i));
583 rth = rt_hash_table[i].chain;
584 rt_hash_table[i].chain = NULL;
585 spin_unlock_bh(rt_hash_lock_addr(i));
586
587 for (; rth; rth = next) {
588 next = rth->u.dst.rt_next;
589 rt_free(rth);
590 }
591 }
592}
593
594static void rt_check_expire(void)
1da177e4 595{
bb1d23b0
ED
596 static unsigned int rover;
597 unsigned int i = rover, goal;
1da177e4 598 struct rtable *rth, **rthp;
bb1d23b0
ED
599 u64 mult;
600
601 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
602 if (ip_rt_gc_timeout > 1)
603 do_div(mult, ip_rt_gc_timeout);
604 goal = (unsigned int)mult;
39c90ece
ED
605 if (goal > rt_hash_mask)
606 goal = rt_hash_mask + 1;
bb1d23b0 607 for (; goal > 0; goal--) {
1da177e4
LT
608 unsigned long tmo = ip_rt_gc_timeout;
609
610 i = (i + 1) & rt_hash_mask;
611 rthp = &rt_hash_table[i].chain;
612
d90bf5a9
ED
613 if (need_resched())
614 cond_resched();
615
cfcabdcc 616 if (*rthp == NULL)
bb1d23b0 617 continue;
39c90ece 618 spin_lock_bh(rt_hash_lock_addr(i));
1da177e4
LT
619 while ((rth = *rthp) != NULL) {
620 if (rth->u.dst.expires) {
621 /* Entry is expired even if it is in use */
39c90ece 622 if (time_before_eq(jiffies, rth->u.dst.expires)) {
1da177e4 623 tmo >>= 1;
093c2ca4 624 rthp = &rth->u.dst.rt_next;
1da177e4
LT
625 continue;
626 }
627 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
628 tmo >>= 1;
093c2ca4 629 rthp = &rth->u.dst.rt_next;
1da177e4
LT
630 continue;
631 }
632
633 /* Cleanup aged off entries. */
093c2ca4 634 *rthp = rth->u.dst.rt_next;
e905a9ed 635 rt_free(rth);
1da177e4 636 }
39c90ece 637 spin_unlock_bh(rt_hash_lock_addr(i));
1da177e4
LT
638 }
639 rover = i;
beb659bd
ED
640}
641
642/*
643 * rt_worker_func() is run in process context.
644 * If a whole flush was scheduled, it is done.
645 * Else, we call rt_check_expire() to scan part of the hash table
646 */
647static void rt_worker_func(struct work_struct *work)
648{
649 if (ip_rt_flush_expected) {
650 ip_rt_flush_expected = 0;
651 rt_do_flush(1);
652 } else
653 rt_check_expire();
39c90ece 654 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
1da177e4
LT
655}
656
657/* This can run from both BH and non-BH contexts, the latter
658 * in the case of a forced flush event.
659 */
beb659bd 660static void rt_run_flush(unsigned long process_context)
1da177e4 661{
1da177e4
LT
662 rt_deadline = 0;
663
664 get_random_bytes(&rt_hash_rnd, 4);
665
beb659bd 666 rt_do_flush(process_context);
1da177e4
LT
667}
668
669static DEFINE_SPINLOCK(rt_flush_lock);
670
671void rt_cache_flush(int delay)
672{
673 unsigned long now = jiffies;
674 int user_mode = !in_softirq();
675
676 if (delay < 0)
677 delay = ip_rt_min_delay;
678
1da177e4
LT
679 spin_lock_bh(&rt_flush_lock);
680
681 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
682 long tmo = (long)(rt_deadline - now);
683
684 /* If flush timer is already running
685 and flush request is not immediate (delay > 0):
686
687 if deadline is not achieved, prolongate timer to "delay",
688 otherwise fire it at deadline time.
689 */
690
691 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
692 tmo = 0;
e905a9ed 693
1da177e4
LT
694 if (delay > tmo)
695 delay = tmo;
696 }
697
698 if (delay <= 0) {
699 spin_unlock_bh(&rt_flush_lock);
beb659bd 700 rt_run_flush(user_mode);
1da177e4
LT
701 return;
702 }
703
704 if (rt_deadline == 0)
705 rt_deadline = now + ip_rt_max_delay;
706
707 mod_timer(&rt_flush_timer, now+delay);
708 spin_unlock_bh(&rt_flush_lock);
709}
710
beb659bd
ED
711/*
712 * We change rt_hash_rnd and ask next rt_worker_func() invocation
713 * to perform a flush in process context
714 */
1da177e4
LT
715static void rt_secret_rebuild(unsigned long dummy)
716{
beb659bd
ED
717 get_random_bytes(&rt_hash_rnd, 4);
718 ip_rt_flush_expected = 1;
719 cancel_delayed_work(&expires_work);
720 schedule_delayed_work(&expires_work, HZ/10);
721 mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
1da177e4
LT
722}
723
724/*
725 Short description of GC goals.
726
727 We want to build algorithm, which will keep routing cache
728 at some equilibrium point, when number of aged off entries
729 is kept approximately equal to newly generated ones.
730
731 Current expiration strength is variable "expire".
732 We try to adjust it dynamically, so that if networking
733 is idle expires is large enough to keep enough of warm entries,
734 and when load increases it reduces to limit cache size.
735 */
736
737static int rt_garbage_collect(void)
738{
739 static unsigned long expire = RT_GC_TIMEOUT;
740 static unsigned long last_gc;
741 static int rover;
742 static int equilibrium;
743 struct rtable *rth, **rthp;
744 unsigned long now = jiffies;
745 int goal;
746
747 /*
748 * Garbage collection is pretty expensive,
749 * do not make it too frequently.
750 */
751
752 RT_CACHE_STAT_INC(gc_total);
753
754 if (now - last_gc < ip_rt_gc_min_interval &&
755 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
756 RT_CACHE_STAT_INC(gc_ignored);
757 goto out;
758 }
759
760 /* Calculate number of entries, which we want to expire now. */
761 goal = atomic_read(&ipv4_dst_ops.entries) -
762 (ip_rt_gc_elasticity << rt_hash_log);
763 if (goal <= 0) {
764 if (equilibrium < ipv4_dst_ops.gc_thresh)
765 equilibrium = ipv4_dst_ops.gc_thresh;
766 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
767 if (goal > 0) {
768 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
769 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
770 }
771 } else {
772 /* We are in dangerous area. Try to reduce cache really
773 * aggressively.
774 */
775 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
776 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
777 }
778
779 if (now - last_gc >= ip_rt_gc_min_interval)
780 last_gc = now;
781
782 if (goal <= 0) {
783 equilibrium += goal;
784 goto work_done;
785 }
786
787 do {
788 int i, k;
789
790 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
791 unsigned long tmo = expire;
792
793 k = (k + 1) & rt_hash_mask;
794 rthp = &rt_hash_table[k].chain;
22c047cc 795 spin_lock_bh(rt_hash_lock_addr(k));
1da177e4
LT
796 while ((rth = *rthp) != NULL) {
797 if (!rt_may_expire(rth, tmo, expire)) {
798 tmo >>= 1;
093c2ca4 799 rthp = &rth->u.dst.rt_next;
1da177e4
LT
800 continue;
801 }
093c2ca4 802 *rthp = rth->u.dst.rt_next;
1da177e4
LT
803 rt_free(rth);
804 goal--;
1da177e4 805 }
22c047cc 806 spin_unlock_bh(rt_hash_lock_addr(k));
1da177e4
LT
807 if (goal <= 0)
808 break;
809 }
810 rover = k;
811
812 if (goal <= 0)
813 goto work_done;
814
815 /* Goal is not achieved. We stop process if:
816
817 - if expire reduced to zero. Otherwise, expire is halfed.
818 - if table is not full.
819 - if we are called from interrupt.
820 - jiffies check is just fallback/debug loop breaker.
821 We will not spin here for long time in any case.
822 */
823
824 RT_CACHE_STAT_INC(gc_goal_miss);
825
826 if (expire == 0)
827 break;
828
829 expire >>= 1;
830#if RT_CACHE_DEBUG >= 2
831 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
832 atomic_read(&ipv4_dst_ops.entries), goal, i);
833#endif
834
835 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
836 goto out;
837 } while (!in_softirq() && time_before_eq(jiffies, now));
838
839 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
840 goto out;
841 if (net_ratelimit())
842 printk(KERN_WARNING "dst cache overflow\n");
843 RT_CACHE_STAT_INC(gc_dst_overflow);
844 return 1;
845
846work_done:
847 expire += ip_rt_gc_min_interval;
848 if (expire > ip_rt_gc_timeout ||
849 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
850 expire = ip_rt_gc_timeout;
851#if RT_CACHE_DEBUG >= 2
852 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
853 atomic_read(&ipv4_dst_ops.entries), goal, rover);
854#endif
855out: return 0;
856}
857
858static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
859{
860 struct rtable *rth, **rthp;
861 unsigned long now;
862 struct rtable *cand, **candp;
863 u32 min_score;
864 int chain_length;
865 int attempts = !in_softirq();
866
867restart:
868 chain_length = 0;
869 min_score = ~(u32)0;
870 cand = NULL;
871 candp = NULL;
872 now = jiffies;
873
874 rthp = &rt_hash_table[hash].chain;
875
22c047cc 876 spin_lock_bh(rt_hash_lock_addr(hash));
1da177e4 877 while ((rth = *rthp) != NULL) {
1da177e4 878 if (compare_keys(&rth->fl, &rt->fl)) {
1da177e4 879 /* Put it first */
093c2ca4 880 *rthp = rth->u.dst.rt_next;
1da177e4
LT
881 /*
882 * Since lookup is lockfree, the deletion
883 * must be visible to another weakly ordered CPU before
884 * the insertion at the start of the hash chain.
885 */
093c2ca4 886 rcu_assign_pointer(rth->u.dst.rt_next,
1da177e4
LT
887 rt_hash_table[hash].chain);
888 /*
889 * Since lookup is lockfree, the update writes
890 * must be ordered for consistency on SMP.
891 */
892 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
893
03f49f34 894 dst_use(&rth->u.dst, now);
22c047cc 895 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
896
897 rt_drop(rt);
898 *rp = rth;
899 return 0;
900 }
901
902 if (!atomic_read(&rth->u.dst.__refcnt)) {
903 u32 score = rt_score(rth);
904
905 if (score <= min_score) {
906 cand = rth;
907 candp = rthp;
908 min_score = score;
909 }
910 }
911
912 chain_length++;
913
093c2ca4 914 rthp = &rth->u.dst.rt_next;
1da177e4
LT
915 }
916
917 if (cand) {
918 /* ip_rt_gc_elasticity used to be average length of chain
919 * length, when exceeded gc becomes really aggressive.
920 *
921 * The second limit is less certain. At the moment it allows
922 * only 2 entries per bucket. We will see.
923 */
924 if (chain_length > ip_rt_gc_elasticity) {
093c2ca4 925 *candp = cand->u.dst.rt_next;
1da177e4
LT
926 rt_free(cand);
927 }
928 }
929
930 /* Try to bind route to arp only if it is output
931 route or unicast forwarding path.
932 */
933 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
934 int err = arp_bind_neighbour(&rt->u.dst);
935 if (err) {
22c047cc 936 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
937
938 if (err != -ENOBUFS) {
939 rt_drop(rt);
940 return err;
941 }
942
943 /* Neighbour tables are full and nothing
944 can be released. Try to shrink route cache,
945 it is most likely it holds some neighbour records.
946 */
947 if (attempts-- > 0) {
948 int saved_elasticity = ip_rt_gc_elasticity;
949 int saved_int = ip_rt_gc_min_interval;
950 ip_rt_gc_elasticity = 1;
951 ip_rt_gc_min_interval = 0;
952 rt_garbage_collect();
953 ip_rt_gc_min_interval = saved_int;
954 ip_rt_gc_elasticity = saved_elasticity;
955 goto restart;
956 }
957
958 if (net_ratelimit())
959 printk(KERN_WARNING "Neighbour table overflow.\n");
960 rt_drop(rt);
961 return -ENOBUFS;
962 }
963 }
964
093c2ca4 965 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1da177e4 966#if RT_CACHE_DEBUG >= 2
093c2ca4 967 if (rt->u.dst.rt_next) {
1da177e4
LT
968 struct rtable *trt;
969 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
970 NIPQUAD(rt->rt_dst));
093c2ca4 971 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1da177e4
LT
972 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
973 printk("\n");
974 }
975#endif
976 rt_hash_table[hash].chain = rt;
22c047cc 977 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
978 *rp = rt;
979 return 0;
980}
981
982void rt_bind_peer(struct rtable *rt, int create)
983{
984 static DEFINE_SPINLOCK(rt_peer_lock);
985 struct inet_peer *peer;
986
987 peer = inet_getpeer(rt->rt_dst, create);
988
989 spin_lock_bh(&rt_peer_lock);
990 if (rt->peer == NULL) {
991 rt->peer = peer;
992 peer = NULL;
993 }
994 spin_unlock_bh(&rt_peer_lock);
995 if (peer)
996 inet_putpeer(peer);
997}
998
999/*
1000 * Peer allocation may fail only in serious out-of-memory conditions. However
1001 * we still can generate some output.
1002 * Random ID selection looks a bit dangerous because we have no chances to
1003 * select ID being unique in a reasonable period of time.
1004 * But broken packet identifier may be better than no packet at all.
1005 */
1006static void ip_select_fb_ident(struct iphdr *iph)
1007{
1008 static DEFINE_SPINLOCK(ip_fb_id_lock);
1009 static u32 ip_fallback_id;
1010 u32 salt;
1011
1012 spin_lock_bh(&ip_fb_id_lock);
e448515c 1013 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1da177e4
LT
1014 iph->id = htons(salt & 0xFFFF);
1015 ip_fallback_id = salt;
1016 spin_unlock_bh(&ip_fb_id_lock);
1017}
1018
1019void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1020{
1021 struct rtable *rt = (struct rtable *) dst;
1022
1023 if (rt) {
1024 if (rt->peer == NULL)
1025 rt_bind_peer(rt, 1);
1026
1027 /* If peer is attached to destination, it is never detached,
1028 so that we need not to grab a lock to dereference it.
1029 */
1030 if (rt->peer) {
1031 iph->id = htons(inet_getid(rt->peer, more));
1032 return;
1033 }
1034 } else
e905a9ed 1035 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
9c2b3328 1036 __builtin_return_address(0));
1da177e4
LT
1037
1038 ip_select_fb_ident(iph);
1039}
1040
1041static void rt_del(unsigned hash, struct rtable *rt)
1042{
1043 struct rtable **rthp;
1044
22c047cc 1045 spin_lock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1046 ip_rt_put(rt);
1047 for (rthp = &rt_hash_table[hash].chain; *rthp;
093c2ca4 1048 rthp = &(*rthp)->u.dst.rt_next)
1da177e4 1049 if (*rthp == rt) {
093c2ca4 1050 *rthp = rt->u.dst.rt_next;
1da177e4
LT
1051 rt_free(rt);
1052 break;
1053 }
22c047cc 1054 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1055}
1056
f7655229
AV
1057void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1058 __be32 saddr, struct net_device *dev)
1da177e4
LT
1059{
1060 int i, k;
1061 struct in_device *in_dev = in_dev_get(dev);
1062 struct rtable *rth, **rthp;
f7655229 1063 __be32 skeys[2] = { saddr, 0 };
1da177e4 1064 int ikeys[2] = { dev->ifindex, 0 };
8d71740c 1065 struct netevent_redirect netevent;
1da177e4 1066
1da177e4
LT
1067 if (!in_dev)
1068 return;
1069
1070 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1071 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1072 goto reject_redirect;
1073
1074 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1075 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1076 goto reject_redirect;
1077 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1078 goto reject_redirect;
1079 } else {
1080 if (inet_addr_type(new_gw) != RTN_UNICAST)
1081 goto reject_redirect;
1082 }
1083
1084 for (i = 0; i < 2; i++) {
1085 for (k = 0; k < 2; k++) {
8c7bc840 1086 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1da177e4
LT
1087
1088 rthp=&rt_hash_table[hash].chain;
1089
1090 rcu_read_lock();
1091 while ((rth = rcu_dereference(*rthp)) != NULL) {
1092 struct rtable *rt;
1093
1094 if (rth->fl.fl4_dst != daddr ||
1095 rth->fl.fl4_src != skeys[i] ||
1da177e4
LT
1096 rth->fl.oif != ikeys[k] ||
1097 rth->fl.iif != 0) {
093c2ca4 1098 rthp = &rth->u.dst.rt_next;
1da177e4
LT
1099 continue;
1100 }
1101
1102 if (rth->rt_dst != daddr ||
1103 rth->rt_src != saddr ||
1104 rth->u.dst.error ||
1105 rth->rt_gateway != old_gw ||
1106 rth->u.dst.dev != dev)
1107 break;
1108
1109 dst_hold(&rth->u.dst);
1110 rcu_read_unlock();
1111
1112 rt = dst_alloc(&ipv4_dst_ops);
1113 if (rt == NULL) {
1114 ip_rt_put(rth);
1115 in_dev_put(in_dev);
1116 return;
1117 }
1118
1119 /* Copy all the information. */
1120 *rt = *rth;
e905a9ed 1121 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1da177e4
LT
1122 rt->u.dst.__use = 1;
1123 atomic_set(&rt->u.dst.__refcnt, 1);
1124 rt->u.dst.child = NULL;
1125 if (rt->u.dst.dev)
1126 dev_hold(rt->u.dst.dev);
1127 if (rt->idev)
1128 in_dev_hold(rt->idev);
1129 rt->u.dst.obsolete = 0;
1130 rt->u.dst.lastuse = jiffies;
1131 rt->u.dst.path = &rt->u.dst;
1132 rt->u.dst.neighbour = NULL;
1133 rt->u.dst.hh = NULL;
1134 rt->u.dst.xfrm = NULL;
1135
1136 rt->rt_flags |= RTCF_REDIRECTED;
1137
1138 /* Gateway is different ... */
1139 rt->rt_gateway = new_gw;
1140
1141 /* Redirect received -> path was valid */
1142 dst_confirm(&rth->u.dst);
1143
1144 if (rt->peer)
1145 atomic_inc(&rt->peer->refcnt);
1146
1147 if (arp_bind_neighbour(&rt->u.dst) ||
1148 !(rt->u.dst.neighbour->nud_state &
1149 NUD_VALID)) {
1150 if (rt->u.dst.neighbour)
1151 neigh_event_send(rt->u.dst.neighbour, NULL);
1152 ip_rt_put(rth);
1153 rt_drop(rt);
1154 goto do_next;
1155 }
e905a9ed 1156
8d71740c
TT
1157 netevent.old = &rth->u.dst;
1158 netevent.new = &rt->u.dst;
e905a9ed
YH
1159 call_netevent_notifiers(NETEVENT_REDIRECT,
1160 &netevent);
1da177e4
LT
1161
1162 rt_del(hash, rth);
1163 if (!rt_intern_hash(hash, rt, &rt))
1164 ip_rt_put(rt);
1165 goto do_next;
1166 }
1167 rcu_read_unlock();
1168 do_next:
1169 ;
1170 }
1171 }
1172 in_dev_put(in_dev);
1173 return;
1174
1175reject_redirect:
1176#ifdef CONFIG_IP_ROUTE_VERBOSE
1177 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1178 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1179 "%u.%u.%u.%u ignored.\n"
cef2685e 1180 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1da177e4 1181 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
cef2685e 1182 NIPQUAD(saddr), NIPQUAD(daddr));
1da177e4
LT
1183#endif
1184 in_dev_put(in_dev);
1185}
1186
1187static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1188{
1189 struct rtable *rt = (struct rtable*)dst;
1190 struct dst_entry *ret = dst;
1191
1192 if (rt) {
1193 if (dst->obsolete) {
1194 ip_rt_put(rt);
1195 ret = NULL;
1196 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1197 rt->u.dst.expires) {
8c7bc840
AV
1198 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1199 rt->fl.oif);
1da177e4 1200#if RT_CACHE_DEBUG >= 1
56c99d04 1201 printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1da177e4
LT
1202 "%u.%u.%u.%u/%02x dropped\n",
1203 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1204#endif
1205 rt_del(hash, rt);
1206 ret = NULL;
1207 }
1208 }
1209 return ret;
1210}
1211
1212/*
1213 * Algorithm:
1214 * 1. The first ip_rt_redirect_number redirects are sent
1215 * with exponential backoff, then we stop sending them at all,
1216 * assuming that the host ignores our redirects.
1217 * 2. If we did not see packets requiring redirects
1218 * during ip_rt_redirect_silence, we assume that the host
1219 * forgot redirected route and start to send redirects again.
1220 *
1221 * This algorithm is much cheaper and more intelligent than dumb load limiting
1222 * in icmp.c.
1223 *
1224 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1225 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1226 */
1227
1228void ip_rt_send_redirect(struct sk_buff *skb)
1229{
1230 struct rtable *rt = (struct rtable*)skb->dst;
1231 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1232
1233 if (!in_dev)
1234 return;
1235
1236 if (!IN_DEV_TX_REDIRECTS(in_dev))
1237 goto out;
1238
1239 /* No redirected packets during ip_rt_redirect_silence;
1240 * reset the algorithm.
1241 */
1242 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1243 rt->u.dst.rate_tokens = 0;
1244
1245 /* Too many ignored redirects; do not send anything
1246 * set u.dst.rate_last to the last seen redirected packet.
1247 */
1248 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1249 rt->u.dst.rate_last = jiffies;
1250 goto out;
1251 }
1252
1253 /* Check for load limit; set rate_last to the latest sent
1254 * redirect.
1255 */
14fb8a76
LY
1256 if (rt->u.dst.rate_tokens == 0 ||
1257 time_after(jiffies,
1da177e4
LT
1258 (rt->u.dst.rate_last +
1259 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1260 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1261 rt->u.dst.rate_last = jiffies;
1262 ++rt->u.dst.rate_tokens;
1263#ifdef CONFIG_IP_ROUTE_VERBOSE
1264 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1265 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1266 net_ratelimit())
1267 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1268 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1269 NIPQUAD(rt->rt_src), rt->rt_iif,
1270 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1271#endif
1272 }
1273out:
e905a9ed 1274 in_dev_put(in_dev);
1da177e4
LT
1275}
1276
1277static int ip_error(struct sk_buff *skb)
1278{
1279 struct rtable *rt = (struct rtable*)skb->dst;
1280 unsigned long now;
1281 int code;
1282
1283 switch (rt->u.dst.error) {
1284 case EINVAL:
1285 default:
1286 goto out;
1287 case EHOSTUNREACH:
1288 code = ICMP_HOST_UNREACH;
1289 break;
1290 case ENETUNREACH:
1291 code = ICMP_NET_UNREACH;
7f53878d 1292 IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1da177e4
LT
1293 break;
1294 case EACCES:
1295 code = ICMP_PKT_FILTERED;
1296 break;
1297 }
1298
1299 now = jiffies;
1300 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1301 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1302 rt->u.dst.rate_tokens = ip_rt_error_burst;
1303 rt->u.dst.rate_last = now;
1304 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1305 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1306 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1307 }
1308
1309out: kfree_skb(skb);
1310 return 0;
e905a9ed 1311}
1da177e4
LT
1312
1313/*
1314 * The last two values are not from the RFC but
1315 * are needed for AMPRnet AX.25 paths.
1316 */
1317
9b5b5cff 1318static const unsigned short mtu_plateau[] =
1da177e4
LT
1319{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1320
1321static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1322{
1323 int i;
e905a9ed 1324
1da177e4
LT
1325 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1326 if (old_mtu > mtu_plateau[i])
1327 return mtu_plateau[i];
1328 return 68;
1329}
1330
1331unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1332{
1333 int i;
1334 unsigned short old_mtu = ntohs(iph->tot_len);
1335 struct rtable *rth;
e448515c
AV
1336 __be32 skeys[2] = { iph->saddr, 0, };
1337 __be32 daddr = iph->daddr;
1da177e4
LT
1338 unsigned short est_mtu = 0;
1339
1340 if (ipv4_config.no_pmtu_disc)
1341 return 0;
1342
1343 for (i = 0; i < 2; i++) {
8c7bc840 1344 unsigned hash = rt_hash(daddr, skeys[i], 0);
1da177e4
LT
1345
1346 rcu_read_lock();
1347 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
093c2ca4 1348 rth = rcu_dereference(rth->u.dst.rt_next)) {
1da177e4
LT
1349 if (rth->fl.fl4_dst == daddr &&
1350 rth->fl.fl4_src == skeys[i] &&
1351 rth->rt_dst == daddr &&
1352 rth->rt_src == iph->saddr &&
1da177e4
LT
1353 rth->fl.iif == 0 &&
1354 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1355 unsigned short mtu = new_mtu;
1356
1357 if (new_mtu < 68 || new_mtu >= old_mtu) {
1358
1359 /* BSD 4.2 compatibility hack :-( */
1360 if (mtu == 0 &&
1361 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1362 old_mtu >= 68 + (iph->ihl << 2))
1363 old_mtu -= iph->ihl << 2;
1364
1365 mtu = guess_mtu(old_mtu);
1366 }
1367 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
e905a9ed 1368 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1da177e4
LT
1369 dst_confirm(&rth->u.dst);
1370 if (mtu < ip_rt_min_pmtu) {
1371 mtu = ip_rt_min_pmtu;
1372 rth->u.dst.metrics[RTAX_LOCK-1] |=
1373 (1 << RTAX_MTU);
1374 }
1375 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1376 dst_set_expires(&rth->u.dst,
1377 ip_rt_mtu_expires);
1378 }
1379 est_mtu = mtu;
1380 }
1381 }
1382 }
1383 rcu_read_unlock();
1384 }
1385 return est_mtu ? : new_mtu;
1386}
1387
1388static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1389{
1390 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1391 !(dst_metric_locked(dst, RTAX_MTU))) {
1392 if (mtu < ip_rt_min_pmtu) {
1393 mtu = ip_rt_min_pmtu;
1394 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1395 }
1396 dst->metrics[RTAX_MTU-1] = mtu;
1397 dst_set_expires(dst, ip_rt_mtu_expires);
8d71740c 1398 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1da177e4
LT
1399 }
1400}
1401
1402static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1403{
1404 return NULL;
1405}
1406
1407static void ipv4_dst_destroy(struct dst_entry *dst)
1408{
1409 struct rtable *rt = (struct rtable *) dst;
1410 struct inet_peer *peer = rt->peer;
1411 struct in_device *idev = rt->idev;
1412
1413 if (peer) {
1414 rt->peer = NULL;
1415 inet_putpeer(peer);
1416 }
1417
1418 if (idev) {
1419 rt->idev = NULL;
1420 in_dev_put(idev);
1421 }
1422}
1423
1424static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1425 int how)
1426{
1427 struct rtable *rt = (struct rtable *) dst;
1428 struct in_device *idev = rt->idev;
2774c7ab
EB
1429 if (dev != init_net.loopback_dev && idev && idev->dev == dev) {
1430 struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
1da177e4
LT
1431 if (loopback_idev) {
1432 rt->idev = loopback_idev;
1433 in_dev_put(idev);
1434 }
1435 }
1436}
1437
1438static void ipv4_link_failure(struct sk_buff *skb)
1439{
1440 struct rtable *rt;
1441
1442 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1443
1444 rt = (struct rtable *) skb->dst;
1445 if (rt)
1446 dst_set_expires(&rt->u.dst, 0);
1447}
1448
1449static int ip_rt_bug(struct sk_buff *skb)
1450{
1451 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
eddc9ec5 1452 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1da177e4
LT
1453 skb->dev ? skb->dev->name : "?");
1454 kfree_skb(skb);
1455 return 0;
1456}
1457
1458/*
1459 We do not cache source address of outgoing interface,
1460 because it is used only by IP RR, TS and SRR options,
1461 so that it out of fast path.
1462
1463 BTW remember: "addr" is allowed to be not aligned
1464 in IP options!
1465 */
1466
1467void ip_rt_get_source(u8 *addr, struct rtable *rt)
1468{
a61ced5d 1469 __be32 src;
1da177e4
LT
1470 struct fib_result res;
1471
1472 if (rt->fl.iif == 0)
1473 src = rt->rt_src;
1474 else if (fib_lookup(&rt->fl, &res) == 0) {
1475 src = FIB_RES_PREFSRC(res);
1476 fib_res_put(&res);
1477 } else
1478 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1479 RT_SCOPE_UNIVERSE);
1480 memcpy(addr, &src, 4);
1481}
1482
1483#ifdef CONFIG_NET_CLS_ROUTE
1484static void set_class_tag(struct rtable *rt, u32 tag)
1485{
1486 if (!(rt->u.dst.tclassid & 0xFFFF))
1487 rt->u.dst.tclassid |= tag & 0xFFFF;
1488 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1489 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1490}
1491#endif
1492
1493static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1494{
1495 struct fib_info *fi = res->fi;
1496
1497 if (fi) {
1498 if (FIB_RES_GW(*res) &&
1499 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1500 rt->rt_gateway = FIB_RES_GW(*res);
1501 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1502 sizeof(rt->u.dst.metrics));
1503 if (fi->fib_mtu == 0) {
1504 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1505 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1506 rt->rt_gateway != rt->rt_dst &&
1507 rt->u.dst.dev->mtu > 576)
1508 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1509 }
1510#ifdef CONFIG_NET_CLS_ROUTE
1511 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1512#endif
1513 } else
1514 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1515
1516 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1517 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1518 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1519 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1520 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1521 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1522 ip_rt_min_advmss);
1523 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1524 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1525
1526#ifdef CONFIG_NET_CLS_ROUTE
1527#ifdef CONFIG_IP_MULTIPLE_TABLES
1528 set_class_tag(rt, fib_rules_tclass(res));
1529#endif
1530 set_class_tag(rt, itag);
1531#endif
e905a9ed 1532 rt->rt_type = res->type;
1da177e4
LT
1533}
1534
9e12bb22 1535static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1536 u8 tos, struct net_device *dev, int our)
1537{
1538 unsigned hash;
1539 struct rtable *rth;
a61ced5d 1540 __be32 spec_dst;
1da177e4
LT
1541 struct in_device *in_dev = in_dev_get(dev);
1542 u32 itag = 0;
1543
1544 /* Primary sanity checks. */
1545
1546 if (in_dev == NULL)
1547 return -EINVAL;
1548
1549 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1550 skb->protocol != htons(ETH_P_IP))
1551 goto e_inval;
1552
1553 if (ZERONET(saddr)) {
1554 if (!LOCAL_MCAST(daddr))
1555 goto e_inval;
1556 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1557 } else if (fib_validate_source(saddr, 0, tos, 0,
1558 dev, &spec_dst, &itag) < 0)
1559 goto e_inval;
1560
1561 rth = dst_alloc(&ipv4_dst_ops);
1562 if (!rth)
1563 goto e_nobufs;
1564
1565 rth->u.dst.output= ip_rt_bug;
1566
1567 atomic_set(&rth->u.dst.__refcnt, 1);
1568 rth->u.dst.flags= DST_HOST;
42f811b8 1569 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4
LT
1570 rth->u.dst.flags |= DST_NOPOLICY;
1571 rth->fl.fl4_dst = daddr;
1572 rth->rt_dst = daddr;
1573 rth->fl.fl4_tos = tos;
47dcf0cb 1574 rth->fl.mark = skb->mark;
1da177e4
LT
1575 rth->fl.fl4_src = saddr;
1576 rth->rt_src = saddr;
1577#ifdef CONFIG_NET_CLS_ROUTE
1578 rth->u.dst.tclassid = itag;
1579#endif
1580 rth->rt_iif =
1581 rth->fl.iif = dev->ifindex;
2774c7ab 1582 rth->u.dst.dev = init_net.loopback_dev;
1da177e4
LT
1583 dev_hold(rth->u.dst.dev);
1584 rth->idev = in_dev_get(rth->u.dst.dev);
1585 rth->fl.oif = 0;
1586 rth->rt_gateway = daddr;
1587 rth->rt_spec_dst= spec_dst;
1588 rth->rt_type = RTN_MULTICAST;
1589 rth->rt_flags = RTCF_MULTICAST;
1590 if (our) {
1591 rth->u.dst.input= ip_local_deliver;
1592 rth->rt_flags |= RTCF_LOCAL;
1593 }
1594
1595#ifdef CONFIG_IP_MROUTE
1596 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1597 rth->u.dst.input = ip_mr_input;
1598#endif
1599 RT_CACHE_STAT_INC(in_slow_mc);
1600
1601 in_dev_put(in_dev);
8c7bc840 1602 hash = rt_hash(daddr, saddr, dev->ifindex);
1da177e4
LT
1603 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1604
1605e_nobufs:
1606 in_dev_put(in_dev);
1607 return -ENOBUFS;
1608
1609e_inval:
1610 in_dev_put(in_dev);
1611 return -EINVAL;
1612}
1613
1614
1615static void ip_handle_martian_source(struct net_device *dev,
1616 struct in_device *in_dev,
1617 struct sk_buff *skb,
9e12bb22
AV
1618 __be32 daddr,
1619 __be32 saddr)
1da177e4
LT
1620{
1621 RT_CACHE_STAT_INC(in_martian_src);
1622#ifdef CONFIG_IP_ROUTE_VERBOSE
1623 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1624 /*
1625 * RFC1812 recommendation, if source is martian,
1626 * the only hint is MAC header.
1627 */
1628 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1629 "%u.%u.%u.%u, on dev %s\n",
1630 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
98e399f8 1631 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1da177e4 1632 int i;
98e399f8 1633 const unsigned char *p = skb_mac_header(skb);
1da177e4
LT
1634 printk(KERN_WARNING "ll header: ");
1635 for (i = 0; i < dev->hard_header_len; i++, p++) {
1636 printk("%02x", *p);
1637 if (i < (dev->hard_header_len - 1))
1638 printk(":");
1639 }
1640 printk("\n");
1641 }
1642 }
1643#endif
1644}
1645
e905a9ed
YH
1646static inline int __mkroute_input(struct sk_buff *skb,
1647 struct fib_result* res,
1648 struct in_device *in_dev,
9e12bb22 1649 __be32 daddr, __be32 saddr, u32 tos,
e905a9ed 1650 struct rtable **result)
1da177e4
LT
1651{
1652
1653 struct rtable *rth;
1654 int err;
1655 struct in_device *out_dev;
1656 unsigned flags = 0;
d9c9df8c
AV
1657 __be32 spec_dst;
1658 u32 itag;
1da177e4
LT
1659
1660 /* get a working reference to the output device */
1661 out_dev = in_dev_get(FIB_RES_DEV(*res));
1662 if (out_dev == NULL) {
1663 if (net_ratelimit())
1664 printk(KERN_CRIT "Bug in ip_route_input" \
1665 "_slow(). Please, report\n");
1666 return -EINVAL;
1667 }
1668
1669
e905a9ed 1670 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1da177e4
LT
1671 in_dev->dev, &spec_dst, &itag);
1672 if (err < 0) {
e905a9ed 1673 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1da177e4 1674 saddr);
e905a9ed 1675
1da177e4
LT
1676 err = -EINVAL;
1677 goto cleanup;
1678 }
1679
1680 if (err)
1681 flags |= RTCF_DIRECTSRC;
1682
1683 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1684 (IN_DEV_SHARED_MEDIA(out_dev) ||
1685 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1686 flags |= RTCF_DOREDIRECT;
1687
1688 if (skb->protocol != htons(ETH_P_IP)) {
1689 /* Not IP (i.e. ARP). Do not create route, if it is
1690 * invalid for proxy arp. DNAT routes are always valid.
1691 */
1692 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1693 err = -EINVAL;
1694 goto cleanup;
1695 }
1696 }
1697
1698
1699 rth = dst_alloc(&ipv4_dst_ops);
1700 if (!rth) {
1701 err = -ENOBUFS;
1702 goto cleanup;
1703 }
1704
ce723d8e 1705 atomic_set(&rth->u.dst.__refcnt, 1);
1da177e4 1706 rth->u.dst.flags= DST_HOST;
42f811b8 1707 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4 1708 rth->u.dst.flags |= DST_NOPOLICY;
42f811b8 1709 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1da177e4
LT
1710 rth->u.dst.flags |= DST_NOXFRM;
1711 rth->fl.fl4_dst = daddr;
1712 rth->rt_dst = daddr;
1713 rth->fl.fl4_tos = tos;
47dcf0cb 1714 rth->fl.mark = skb->mark;
1da177e4
LT
1715 rth->fl.fl4_src = saddr;
1716 rth->rt_src = saddr;
1717 rth->rt_gateway = daddr;
1718 rth->rt_iif =
1719 rth->fl.iif = in_dev->dev->ifindex;
1720 rth->u.dst.dev = (out_dev)->dev;
1721 dev_hold(rth->u.dst.dev);
1722 rth->idev = in_dev_get(rth->u.dst.dev);
1723 rth->fl.oif = 0;
1724 rth->rt_spec_dst= spec_dst;
1725
1726 rth->u.dst.input = ip_forward;
1727 rth->u.dst.output = ip_output;
1728
1729 rt_set_nexthop(rth, res, itag);
1730
1731 rth->rt_flags = flags;
1732
1733 *result = rth;
1734 err = 0;
1735 cleanup:
1736 /* release the working reference to the output device */
1737 in_dev_put(out_dev);
1738 return err;
e905a9ed 1739}
1da177e4 1740
e06e7c61
DM
1741static inline int ip_mkroute_input(struct sk_buff *skb,
1742 struct fib_result* res,
1743 const struct flowi *fl,
1744 struct in_device *in_dev,
1745 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1746{
7abaa27c 1747 struct rtable* rth = NULL;
1da177e4
LT
1748 int err;
1749 unsigned hash;
1750
1751#ifdef CONFIG_IP_ROUTE_MULTIPATH
1752 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1753 fib_select_multipath(fl, res);
1754#endif
1755
1756 /* create a routing cache entry */
1757 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1758 if (err)
1759 return err;
1da177e4
LT
1760
1761 /* put it into the cache */
8c7bc840 1762 hash = rt_hash(daddr, saddr, fl->iif);
e905a9ed 1763 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1da177e4
LT
1764}
1765
1da177e4
LT
1766/*
1767 * NOTE. We drop all the packets that has local source
1768 * addresses, because every properly looped back packet
1769 * must have correct destination already attached by output routine.
1770 *
1771 * Such approach solves two big problems:
1772 * 1. Not simplex devices are handled properly.
1773 * 2. IP spoofing attempts are filtered with 100% of guarantee.
1774 */
1775
9e12bb22 1776static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1777 u8 tos, struct net_device *dev)
1778{
1779 struct fib_result res;
1780 struct in_device *in_dev = in_dev_get(dev);
1781 struct flowi fl = { .nl_u = { .ip4_u =
1782 { .daddr = daddr,
1783 .saddr = saddr,
1784 .tos = tos,
1785 .scope = RT_SCOPE_UNIVERSE,
1da177e4 1786 } },
47dcf0cb 1787 .mark = skb->mark,
1da177e4
LT
1788 .iif = dev->ifindex };
1789 unsigned flags = 0;
1790 u32 itag = 0;
1791 struct rtable * rth;
1792 unsigned hash;
9e12bb22 1793 __be32 spec_dst;
1da177e4
LT
1794 int err = -EINVAL;
1795 int free_res = 0;
1796
1797 /* IP on this device is disabled. */
1798
1799 if (!in_dev)
1800 goto out;
1801
1802 /* Check for the most weird martians, which can be not detected
1803 by fib_lookup.
1804 */
1805
1806 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1807 goto martian_source;
1808
e448515c 1809 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1da177e4
LT
1810 goto brd_input;
1811
1812 /* Accept zero addresses only to limited broadcast;
1813 * I even do not know to fix it or not. Waiting for complains :-)
1814 */
1815 if (ZERONET(saddr))
1816 goto martian_source;
1817
1818 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1819 goto martian_destination;
1820
1821 /*
1822 * Now we are ready to route packet.
1823 */
1824 if ((err = fib_lookup(&fl, &res)) != 0) {
1825 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 1826 goto e_hostunreach;
1da177e4
LT
1827 goto no_route;
1828 }
1829 free_res = 1;
1830
1831 RT_CACHE_STAT_INC(in_slow_tot);
1832
1833 if (res.type == RTN_BROADCAST)
1834 goto brd_input;
1835
1836 if (res.type == RTN_LOCAL) {
1837 int result;
1838 result = fib_validate_source(saddr, daddr, tos,
2774c7ab 1839 init_net.loopback_dev->ifindex,
1da177e4
LT
1840 dev, &spec_dst, &itag);
1841 if (result < 0)
1842 goto martian_source;
1843 if (result)
1844 flags |= RTCF_DIRECTSRC;
1845 spec_dst = daddr;
1846 goto local_input;
1847 }
1848
1849 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 1850 goto e_hostunreach;
1da177e4
LT
1851 if (res.type != RTN_UNICAST)
1852 goto martian_destination;
1853
1854 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1da177e4
LT
1855done:
1856 in_dev_put(in_dev);
1857 if (free_res)
1858 fib_res_put(&res);
1859out: return err;
1860
1861brd_input:
1862 if (skb->protocol != htons(ETH_P_IP))
1863 goto e_inval;
1864
1865 if (ZERONET(saddr))
1866 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1867 else {
1868 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1869 &itag);
1870 if (err < 0)
1871 goto martian_source;
1872 if (err)
1873 flags |= RTCF_DIRECTSRC;
1874 }
1875 flags |= RTCF_BROADCAST;
1876 res.type = RTN_BROADCAST;
1877 RT_CACHE_STAT_INC(in_brd);
1878
1879local_input:
1880 rth = dst_alloc(&ipv4_dst_ops);
1881 if (!rth)
1882 goto e_nobufs;
1883
1884 rth->u.dst.output= ip_rt_bug;
1885
1886 atomic_set(&rth->u.dst.__refcnt, 1);
1887 rth->u.dst.flags= DST_HOST;
42f811b8 1888 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4
LT
1889 rth->u.dst.flags |= DST_NOPOLICY;
1890 rth->fl.fl4_dst = daddr;
1891 rth->rt_dst = daddr;
1892 rth->fl.fl4_tos = tos;
47dcf0cb 1893 rth->fl.mark = skb->mark;
1da177e4
LT
1894 rth->fl.fl4_src = saddr;
1895 rth->rt_src = saddr;
1896#ifdef CONFIG_NET_CLS_ROUTE
1897 rth->u.dst.tclassid = itag;
1898#endif
1899 rth->rt_iif =
1900 rth->fl.iif = dev->ifindex;
2774c7ab 1901 rth->u.dst.dev = init_net.loopback_dev;
1da177e4
LT
1902 dev_hold(rth->u.dst.dev);
1903 rth->idev = in_dev_get(rth->u.dst.dev);
1904 rth->rt_gateway = daddr;
1905 rth->rt_spec_dst= spec_dst;
1906 rth->u.dst.input= ip_local_deliver;
1907 rth->rt_flags = flags|RTCF_LOCAL;
1908 if (res.type == RTN_UNREACHABLE) {
1909 rth->u.dst.input= ip_error;
1910 rth->u.dst.error= -err;
1911 rth->rt_flags &= ~RTCF_LOCAL;
1912 }
1913 rth->rt_type = res.type;
8c7bc840 1914 hash = rt_hash(daddr, saddr, fl.iif);
1915 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1916 goto done;
1917
1918no_route:
1919 RT_CACHE_STAT_INC(in_no_route);
1920 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1921 res.type = RTN_UNREACHABLE;
1922 if (err == -ESRCH)
1923 err = -ENETUNREACH;
1924 goto local_input;
1925
1926 /*
1927 * Do not cache martian addresses: they should be logged (RFC1812)
1928 */
1929martian_destination:
1930 RT_CACHE_STAT_INC(in_martian_dst);
1931#ifdef CONFIG_IP_ROUTE_VERBOSE
1932 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1933 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1934 "%u.%u.%u.%u, dev %s\n",
1935 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1936#endif
1937
1938e_hostunreach:
1939 err = -EHOSTUNREACH;
1940 goto done;
2c2910a4 1941
1942e_inval:
1943 err = -EINVAL;
1944 goto done;
1945
1946e_nobufs:
1947 err = -ENOBUFS;
1948 goto done;
1949
1950martian_source:
1951 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1952 goto e_inval;
1953}
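
For orientation only, here is a minimal standalone restatement (not part of route.c) of the source-address sanity test applied at the top of ip_route_input_slow(), using the same MULTICAST/BADCLASS/LOOPBACK helpers this file already relies on:

static inline int example_is_martian_source(__be32 saddr)
{
	/* Mirrors the check above: multicast, reserved class E and
	 * loopback sources are never acceptable.  ZERONET() sources are
	 * handled separately, since they are tolerated for limited
	 * broadcast only. */
	return MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr);
}
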
1954
9e12bb22 1955int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1956 u8 tos, struct net_device *dev)
1957{
1958 struct rtable * rth;
1959 unsigned hash;
1960 int iif = dev->ifindex;
1961
1962 tos &= IPTOS_RT_MASK;
8c7bc840 1963 hash = rt_hash(daddr, saddr, iif);
1964
1965 rcu_read_lock();
1966 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
093c2ca4 1967 rth = rcu_dereference(rth->u.dst.rt_next)) {
1968 if (rth->fl.fl4_dst == daddr &&
1969 rth->fl.fl4_src == saddr &&
1970 rth->fl.iif == iif &&
1971 rth->fl.oif == 0 &&
47dcf0cb 1972 rth->fl.mark == skb->mark &&
1da177e4 1973 rth->fl.fl4_tos == tos) {
03f49f34 1974 dst_use(&rth->u.dst, jiffies);
1975 RT_CACHE_STAT_INC(in_hit);
1976 rcu_read_unlock();
1977 skb->dst = (struct dst_entry*)rth;
1978 return 0;
1979 }
1980 RT_CACHE_STAT_INC(in_hlist_search);
1981 }
1982 rcu_read_unlock();
1983
1984	/* Multicast recognition logic is moved from the route cache to here.
1985	   The problem was that too many Ethernet cards have broken/missing
1986	   hardware multicast filters :-( As a result the host on a multicasting
1987	   network acquires a lot of useless route cache entries, sort of
1988	   SDR messages from all over the world. Now we try to get rid of them.
1989	   Really, provided the software IP multicast filter is organized
1990	   reasonably (at least, hashed), it does not result in a slowdown
1991	   compared with route cache reject entries.
1992	   Note that multicast routers are not affected, because a
1993	   route cache entry is created eventually.
1994	 */
1995 if (MULTICAST(daddr)) {
1996 struct in_device *in_dev;
1997
1998 rcu_read_lock();
e5ed6399 1999 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
1da177e4 2000 int our = ip_check_mc(in_dev, daddr, saddr,
eddc9ec5 2001 ip_hdr(skb)->protocol);
2002 if (our
2003#ifdef CONFIG_IP_MROUTE
2004 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2005#endif
2006 ) {
2007 rcu_read_unlock();
2008 return ip_route_input_mc(skb, daddr, saddr,
2009 tos, dev, our);
2010 }
2011 }
2012 rcu_read_unlock();
2013 return -EINVAL;
2014 }
2015 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2016}
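
The following is an illustrative sketch (not part of route.c) of how a caller of this era typically resolves the input route for a received packet; it mirrors the ip_route_input() pattern used by inet_rtm_getroute() later in this file. The function name is hypothetical.

static int example_route_incoming_skb(struct sk_buff *skb,
				      struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

	err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev);
	if (err)
		return err;	/* e.g. -EINVAL for a martian source */

	/* On success skb->dst points at the cached or freshly built rtable;
	 * report whether the packet is destined for this host. */
	return ((struct rtable *)skb->dst)->rt_type == RTN_LOCAL;
}
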
2017
2018static inline int __mkroute_output(struct rtable **result,
e905a9ed 2019 struct fib_result* res,
1da177e4 2020 const struct flowi *fl,
2021 const struct flowi *oldflp,
2022 struct net_device *dev_out,
2023 unsigned flags)
2024{
2025 struct rtable *rth;
2026 struct in_device *in_dev;
2027 u32 tos = RT_FL_TOS(oldflp);
2028 int err = 0;
2029
2030 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2031 return -EINVAL;
2032
e448515c 2033 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2034 res->type = RTN_BROADCAST;
2035 else if (MULTICAST(fl->fl4_dst))
2036 res->type = RTN_MULTICAST;
2037 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2038 return -EINVAL;
2039
2040 if (dev_out->flags & IFF_LOOPBACK)
2041 flags |= RTCF_LOCAL;
2042
2043 /* get work reference to inet device */
2044 in_dev = in_dev_get(dev_out);
2045 if (!in_dev)
2046 return -EINVAL;
2047
2048 if (res->type == RTN_BROADCAST) {
2049 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2050 if (res->fi) {
2051 fib_info_put(res->fi);
2052 res->fi = NULL;
2053 }
2054 } else if (res->type == RTN_MULTICAST) {
2055 flags |= RTCF_MULTICAST|RTCF_LOCAL;
e905a9ed 2056 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2057 oldflp->proto))
2058 flags &= ~RTCF_LOCAL;
2059		/* If a multicast route does not exist, use
2060		   the default one, but do not gateway in this case.
2061		   Yes, it is a hack.
2062		 */
2063 if (res->fi && res->prefixlen < 4) {
2064 fib_info_put(res->fi);
2065 res->fi = NULL;
2066 }
2067 }
2068
2069
2070 rth = dst_alloc(&ipv4_dst_ops);
2071 if (!rth) {
2072 err = -ENOBUFS;
2073 goto cleanup;
e905a9ed 2074 }
1da177e4 2075
ce723d8e 2076 atomic_set(&rth->u.dst.__refcnt, 1);
1da177e4 2077 rth->u.dst.flags= DST_HOST;
42f811b8 2078 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
1da177e4 2079 rth->u.dst.flags |= DST_NOXFRM;
42f811b8 2080 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2081 rth->u.dst.flags |= DST_NOPOLICY;
2082
2083 rth->fl.fl4_dst = oldflp->fl4_dst;
2084 rth->fl.fl4_tos = tos;
2085 rth->fl.fl4_src = oldflp->fl4_src;
2086 rth->fl.oif = oldflp->oif;
47dcf0cb 2087 rth->fl.mark = oldflp->mark;
2088 rth->rt_dst = fl->fl4_dst;
2089 rth->rt_src = fl->fl4_src;
2090 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
e905a9ed 2091	/* get references to the devices that are to be held by the routing
2092	   cache entry */
2093 rth->u.dst.dev = dev_out;
2094 dev_hold(dev_out);
2095 rth->idev = in_dev_get(dev_out);
2096 rth->rt_gateway = fl->fl4_dst;
2097 rth->rt_spec_dst= fl->fl4_src;
2098
2099 rth->u.dst.output=ip_output;
2100
2101 RT_CACHE_STAT_INC(out_slow_tot);
2102
2103 if (flags & RTCF_LOCAL) {
2104 rth->u.dst.input = ip_local_deliver;
2105 rth->rt_spec_dst = fl->fl4_dst;
2106 }
2107 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2108 rth->rt_spec_dst = fl->fl4_src;
e905a9ed 2109 if (flags & RTCF_LOCAL &&
2110 !(dev_out->flags & IFF_LOOPBACK)) {
2111 rth->u.dst.output = ip_mc_output;
2112 RT_CACHE_STAT_INC(out_slow_mc);
2113 }
2114#ifdef CONFIG_IP_MROUTE
2115 if (res->type == RTN_MULTICAST) {
2116 if (IN_DEV_MFORWARD(in_dev) &&
2117 !LOCAL_MCAST(oldflp->fl4_dst)) {
2118 rth->u.dst.input = ip_mr_input;
2119 rth->u.dst.output = ip_mc_output;
2120 }
2121 }
2122#endif
2123 }
2124
2125 rt_set_nexthop(rth, res, 0);
2126
2127 rth->rt_flags = flags;
2128
2129 *result = rth;
2130 cleanup:
2131 /* release work reference to inet device */
2132 in_dev_put(in_dev);
2133
2134 return err;
2135}
2136
2137static inline int ip_mkroute_output(struct rtable **rp,
2138 struct fib_result* res,
2139 const struct flowi *fl,
2140 const struct flowi *oldflp,
2141 struct net_device *dev_out,
2142 unsigned flags)
1da177e4 2143{
7abaa27c 2144 struct rtable *rth = NULL;
2145 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2146 unsigned hash;
2147 if (err == 0) {
8c7bc840 2148 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2149 err = rt_intern_hash(hash, rth, rp);
2150 }
e905a9ed 2151
2152 return err;
2153}
2154
2155/*
2156 * Major route resolver routine.
2157 */
2158
2159static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2160{
2161 u32 tos = RT_FL_TOS(oldflp);
2162 struct flowi fl = { .nl_u = { .ip4_u =
2163 { .daddr = oldflp->fl4_dst,
2164 .saddr = oldflp->fl4_src,
2165 .tos = tos & IPTOS_RT_MASK,
2166 .scope = ((tos & RTO_ONLINK) ?
2167 RT_SCOPE_LINK :
2168 RT_SCOPE_UNIVERSE),
1da177e4 2169 } },
47dcf0cb 2170 .mark = oldflp->mark,
2774c7ab 2171 .iif = init_net.loopback_dev->ifindex,
2172 .oif = oldflp->oif };
2173 struct fib_result res;
2174 unsigned flags = 0;
2175 struct net_device *dev_out = NULL;
2176 int free_res = 0;
2177 int err;
2178
2179
2180 res.fi = NULL;
2181#ifdef CONFIG_IP_MULTIPLE_TABLES
2182 res.r = NULL;
2183#endif
2184
2185 if (oldflp->fl4_src) {
2186 err = -EINVAL;
2187 if (MULTICAST(oldflp->fl4_src) ||
2188 BADCLASS(oldflp->fl4_src) ||
2189 ZERONET(oldflp->fl4_src))
2190 goto out;
2191
2192 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2193 dev_out = ip_dev_find(oldflp->fl4_src);
f6c5d736 2194 if (dev_out == NULL)
2195 goto out;
2196
2197 /* I removed check for oif == dev_out->oif here.
2198 It was wrong for two reasons:
2199 1. ip_dev_find(saddr) can return wrong iface, if saddr is
2200 assigned to multiple interfaces.
2201 2. Moreover, we are allowed to send packets with saddr
2202 of another iface. --ANK
2203 */
2204
f6c5d736 2205 if (oldflp->oif == 0
e448515c 2206 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2207			/* Special hack: the user can direct multicasts
2208			   and limited broadcast via the necessary interface
2209			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2210			   This hack is not just for fun, it allows
2211			   vic, vat and friends to work.
2212			   They bind a socket to loopback, set ttl to zero
2213			   and expect that it will work.
2214			   From the viewpoint of the routing cache they are broken,
2215			   because we are not allowed to build a multicast path
2216			   with a loopback source addr (look, the routing cache
2217			   cannot know that ttl is zero, so the packet
2218			   will not leave this host and the route is valid).
2219			   Luckily, this hack is a good workaround.
2220			 */
2221
2222 fl.oif = dev_out->ifindex;
2223 goto make_route;
2224 }
2225 if (dev_out)
2226 dev_put(dev_out);
2227 dev_out = NULL;
2228 }
2229
2230
2231 if (oldflp->oif) {
881d966b 2232 dev_out = dev_get_by_index(&init_net, oldflp->oif);
2233 err = -ENODEV;
2234 if (dev_out == NULL)
2235 goto out;
2236
2237 /* RACE: Check return value of inet_select_addr instead. */
2238 if (__in_dev_get_rtnl(dev_out) == NULL) {
2239 dev_put(dev_out);
2240 goto out; /* Wrong error code */
2241 }
2242
e448515c 2243 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2244 if (!fl.fl4_src)
2245 fl.fl4_src = inet_select_addr(dev_out, 0,
2246 RT_SCOPE_LINK);
2247 goto make_route;
2248 }
2249 if (!fl.fl4_src) {
2250 if (MULTICAST(oldflp->fl4_dst))
2251 fl.fl4_src = inet_select_addr(dev_out, 0,
2252 fl.fl4_scope);
2253 else if (!oldflp->fl4_dst)
2254 fl.fl4_src = inet_select_addr(dev_out, 0,
2255 RT_SCOPE_HOST);
2256 }
2257 }
2258
2259 if (!fl.fl4_dst) {
2260 fl.fl4_dst = fl.fl4_src;
2261 if (!fl.fl4_dst)
2262 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2263 if (dev_out)
2264 dev_put(dev_out);
2774c7ab 2265 dev_out = init_net.loopback_dev;
1da177e4 2266 dev_hold(dev_out);
2774c7ab 2267 fl.oif = init_net.loopback_dev->ifindex;
2268 res.type = RTN_LOCAL;
2269 flags |= RTCF_LOCAL;
2270 goto make_route;
2271 }
2272
2273 if (fib_lookup(&fl, &res)) {
2274 res.fi = NULL;
2275 if (oldflp->oif) {
2276 /* Apparently, routing tables are wrong. Assume,
2277 that the destination is on link.
2278
2279 WHY? DW.
2280 Because we are allowed to send to iface
2281 even if it has NO routes and NO assigned
2282 addresses. When oif is specified, routing
2283 tables are looked up with only one purpose:
2284 to catch if destination is gatewayed, rather than
2285 direct. Moreover, if MSG_DONTROUTE is set,
2286 we send packet, ignoring both routing tables
2287 and ifaddr state. --ANK
2288
2289
2290 We could make it even if oif is unknown,
2291 likely IPv6, but we do not.
2292 */
2293
2294 if (fl.fl4_src == 0)
2295 fl.fl4_src = inet_select_addr(dev_out, 0,
2296 RT_SCOPE_LINK);
2297 res.type = RTN_UNICAST;
2298 goto make_route;
2299 }
2300 if (dev_out)
2301 dev_put(dev_out);
2302 err = -ENETUNREACH;
2303 goto out;
2304 }
2305 free_res = 1;
2306
2307 if (res.type == RTN_LOCAL) {
2308 if (!fl.fl4_src)
2309 fl.fl4_src = fl.fl4_dst;
2310 if (dev_out)
2311 dev_put(dev_out);
2774c7ab 2312 dev_out = init_net.loopback_dev;
2313 dev_hold(dev_out);
2314 fl.oif = dev_out->ifindex;
2315 if (res.fi)
2316 fib_info_put(res.fi);
2317 res.fi = NULL;
2318 flags |= RTCF_LOCAL;
2319 goto make_route;
2320 }
2321
2322#ifdef CONFIG_IP_ROUTE_MULTIPATH
2323 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2324 fib_select_multipath(&fl, &res);
2325 else
2326#endif
2327 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2328 fib_select_default(&fl, &res);
2329
2330 if (!fl.fl4_src)
2331 fl.fl4_src = FIB_RES_PREFSRC(res);
2332
2333 if (dev_out)
2334 dev_put(dev_out);
2335 dev_out = FIB_RES_DEV(res);
2336 dev_hold(dev_out);
2337 fl.oif = dev_out->ifindex;
2338
2339
2340make_route:
2341 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2342
2343
2344 if (free_res)
2345 fib_res_put(&res);
2346 if (dev_out)
2347 dev_put(dev_out);
2348out: return err;
2349}
2350
2351int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2352{
2353 unsigned hash;
2354 struct rtable *rth;
2355
8c7bc840 2356 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2357
2358 rcu_read_lock_bh();
2359 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
093c2ca4 2360 rth = rcu_dereference(rth->u.dst.rt_next)) {
2361 if (rth->fl.fl4_dst == flp->fl4_dst &&
2362 rth->fl.fl4_src == flp->fl4_src &&
2363 rth->fl.iif == 0 &&
2364 rth->fl.oif == flp->oif &&
47dcf0cb 2365 rth->fl.mark == flp->mark &&
2366 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2367 (IPTOS_RT_MASK | RTO_ONLINK))) {
03f49f34 2368 dst_use(&rth->u.dst, jiffies);
2369 RT_CACHE_STAT_INC(out_hit);
2370 rcu_read_unlock_bh();
2371 *rp = rth;
2372 return 0;
2373 }
2374 RT_CACHE_STAT_INC(out_hlist_search);
2375 }
2376 rcu_read_unlock_bh();
2377
2378 return ip_route_output_slow(rp, flp);
2379}
2380
2381EXPORT_SYMBOL_GPL(__ip_route_output_key);
2382
2383static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2384{
2385}
2386
2387static struct dst_ops ipv4_dst_blackhole_ops = {
2388 .family = AF_INET,
2389 .protocol = __constant_htons(ETH_P_IP),
2390 .destroy = ipv4_dst_destroy,
2391 .check = ipv4_dst_check,
2392 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2393 .entry_size = sizeof(struct rtable),
2394};
2395
2396
2397static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2398{
2399 struct rtable *ort = *rp;
2400 struct rtable *rt = (struct rtable *)
2401 dst_alloc(&ipv4_dst_blackhole_ops);
2402
2403 if (rt) {
2404 struct dst_entry *new = &rt->u.dst;
2405
2406 atomic_set(&new->__refcnt, 1);
2407 new->__use = 1;
2408 new->input = dst_discard;
2409 new->output = dst_discard;
2410 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2411
2412 new->dev = ort->u.dst.dev;
2413 if (new->dev)
2414 dev_hold(new->dev);
2415
2416 rt->fl = ort->fl;
2417
2418 rt->idev = ort->idev;
2419 if (rt->idev)
2420 in_dev_hold(rt->idev);
2421 rt->rt_flags = ort->rt_flags;
2422 rt->rt_type = ort->rt_type;
2423 rt->rt_dst = ort->rt_dst;
2424 rt->rt_src = ort->rt_src;
2425 rt->rt_iif = ort->rt_iif;
2426 rt->rt_gateway = ort->rt_gateway;
2427 rt->rt_spec_dst = ort->rt_spec_dst;
2428 rt->peer = ort->peer;
2429 if (rt->peer)
2430 atomic_inc(&rt->peer->refcnt);
2431
2432 dst_free(new);
2433 }
2434
2435 dst_release(&(*rp)->u.dst);
2436 *rp = rt;
2437 return (rt ? 0 : -ENOMEM);
2438}
2439
2440int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2441{
2442 int err;
2443
2444 if ((err = __ip_route_output_key(rp, flp)) != 0)
2445 return err;
2446
2447 if (flp->proto) {
2448 if (!flp->fl4_src)
2449 flp->fl4_src = (*rp)->rt_src;
2450 if (!flp->fl4_dst)
2451 flp->fl4_dst = (*rp)->rt_dst;
2452 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2453 if (err == -EREMOTE)
2454 err = ipv4_dst_blackhole(rp, flp, sk);
2455
2456 return err;
2457 }
2458
2459 return 0;
2460}
2461
2462EXPORT_SYMBOL_GPL(ip_route_output_flow);
2463
2464int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2465{
2466 return ip_route_output_flow(rp, flp, NULL, 0);
2467}
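
As an illustration (not part of route.c), resolving an output route goes through the same flow-key pattern that inet_rtm_getroute() below uses: fill a struct flowi and hand it to ip_route_output_key(). The helper name is hypothetical; addresses are assumed to be in network byte order.

static int example_route_to(__be32 daddr, __be32 saddr, int oif,
			    struct rtable **rtp)
{
	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr,
						 .saddr = saddr } },
			    .oif = oif };

	/* On success *rtp holds a reference the caller must drop
	 * with ip_rt_put() when the route is no longer needed. */
	return ip_route_output_key(rtp, &fl);
}
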
2468
2469static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
b6544c0b 2470 int nowait, unsigned int flags)
2471{
2472 struct rtable *rt = (struct rtable*)skb->dst;
2473 struct rtmsg *r;
be403ea1 2474 struct nlmsghdr *nlh;
2475 long expires;
2476 u32 id = 0, ts = 0, tsage = 0, error;
2477
2478 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2479 if (nlh == NULL)
26932566 2480 return -EMSGSIZE;
2481
2482 r = nlmsg_data(nlh);
2483 r->rtm_family = AF_INET;
2484 r->rtm_dst_len = 32;
2485 r->rtm_src_len = 0;
2486 r->rtm_tos = rt->fl.fl4_tos;
2487 r->rtm_table = RT_TABLE_MAIN;
be403ea1 2488 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2489 r->rtm_type = rt->rt_type;
2490 r->rtm_scope = RT_SCOPE_UNIVERSE;
2491 r->rtm_protocol = RTPROT_UNSPEC;
2492 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2493 if (rt->rt_flags & RTCF_NOTIFY)
2494 r->rtm_flags |= RTM_F_NOTIFY;
be403ea1 2495
17fb2c64 2496 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
be403ea1 2497
2498 if (rt->fl.fl4_src) {
2499 r->rtm_src_len = 32;
17fb2c64 2500 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2501 }
2502 if (rt->u.dst.dev)
be403ea1 2503 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2504#ifdef CONFIG_NET_CLS_ROUTE
2505 if (rt->u.dst.tclassid)
be403ea1 2506 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2507#endif
2508 if (rt->fl.iif)
17fb2c64 2509 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
1da177e4 2510 else if (rt->rt_src != rt->fl.fl4_src)
17fb2c64 2511 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
be403ea1 2512
1da177e4 2513 if (rt->rt_dst != rt->rt_gateway)
17fb2c64 2514 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
be403ea1 2515
1da177e4 2516 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2517 goto nla_put_failure;
2518
2519 error = rt->u.dst.error;
2520 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
1da177e4 2521 if (rt->peer) {
e3703b3d 2522 id = rt->peer->ip_id_count;
1da177e4 2523 if (rt->peer->tcp_ts_stamp) {
e3703b3d 2524 ts = rt->peer->tcp_ts;
9d729f72 2525 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2526 }
2527 }
be403ea1 2528
2529 if (rt->fl.iif) {
2530#ifdef CONFIG_IP_MROUTE
e448515c 2531 __be32 dst = rt->rt_dst;
2532
2533 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
42f811b8 2534 IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2535 int err = ipmr_get_route(skb, r, nowait);
2536 if (err <= 0) {
2537 if (!nowait) {
2538 if (err == 0)
2539 return 0;
be403ea1 2540 goto nla_put_failure;
2541 } else {
2542 if (err == -EMSGSIZE)
be403ea1 2543 goto nla_put_failure;
e3703b3d 2544 error = err;
2545 }
2546 }
2547 } else
2548#endif
be403ea1 2549 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2550 }
2551
2552 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2553 expires, error) < 0)
2554 goto nla_put_failure;
2555
2556 return nlmsg_end(skb, nlh);
1da177e4 2557
be403ea1 2558nla_put_failure:
2559 nlmsg_cancel(skb, nlh);
2560 return -EMSGSIZE;
2561}
2562
63f3444f 2563static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1da177e4 2564{
b854272b 2565 struct net *net = in_skb->sk->sk_net;
2566 struct rtmsg *rtm;
2567 struct nlattr *tb[RTA_MAX+1];
1da177e4 2568 struct rtable *rt = NULL;
2569 __be32 dst = 0;
2570 __be32 src = 0;
2571 u32 iif;
d889ce3b 2572 int err;
2573 struct sk_buff *skb;
2574
2575 if (net != &init_net)
2576 return -EINVAL;
2577
2578 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2579 if (err < 0)
2580 goto errout;
2581
2582 rtm = nlmsg_data(nlh);
2583
1da177e4 2584 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2585 if (skb == NULL) {
2586 err = -ENOBUFS;
2587 goto errout;
2588 }
2589
2590	/* Reserve room for dummy headers; this skb can pass
2591	   through a good chunk of the routing engine.
2592	 */
459a98ed 2593 skb_reset_mac_header(skb);
c1d2bbe1 2594 skb_reset_network_header(skb);
2595
2596 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
eddc9ec5 2597 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2598 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2599
2600 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2601 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
d889ce3b 2602 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2603
2604 if (iif) {
2605 struct net_device *dev;
2606
881d966b 2607 dev = __dev_get_by_index(&init_net, iif);
2608 if (dev == NULL) {
2609 err = -ENODEV;
2610 goto errout_free;
2611 }
2612
2613 skb->protocol = htons(ETH_P_IP);
2614 skb->dev = dev;
2615 local_bh_disable();
2616 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2617 local_bh_enable();
2618
2619 rt = (struct rtable*) skb->dst;
2620 if (err == 0 && rt->u.dst.error)
2621 err = -rt->u.dst.error;
2622 } else {
2623 struct flowi fl = {
2624 .nl_u = {
2625 .ip4_u = {
2626 .daddr = dst,
2627 .saddr = src,
2628 .tos = rtm->rtm_tos,
2629 },
2630 },
2631 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2632 };
2633 err = ip_route_output_key(&rt, &fl);
2634 }
d889ce3b 2635
1da177e4 2636 if (err)
d889ce3b 2637 goto errout_free;
2638
2639 skb->dst = &rt->u.dst;
2640 if (rtm->rtm_flags & RTM_F_NOTIFY)
2641 rt->rt_flags |= RTCF_NOTIFY;
2642
1da177e4 2643 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
b6544c0b 2644 RTM_NEWROUTE, 0, 0);
2645 if (err <= 0)
2646 goto errout_free;
1da177e4 2647
97c53cac 2648 err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
d889ce3b 2649errout:
2942e900 2650 return err;
1da177e4 2651
d889ce3b 2652errout_free:
1da177e4 2653 kfree_skb(skb);
d889ce3b 2654 goto errout;
2655}
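
For context, a userspace sketch (purely illustrative, not kernel code) of the RTM_GETROUTE request that inet_rtm_getroute() above services. It uses only the standard rtnetlink macros; error handling and reply parsing are trimmed, and the function name is hypothetical.

#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

static int example_rtm_getroute(__be32 dst)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg	rtm;
		char		buf[256];
	} req;
	struct sockaddr_nl sa;
	struct rtattr *rta;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return -1;

	memset(&sa, 0, sizeof(sa));
	sa.nl_family = AF_NETLINK;	/* destination: the kernel */

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family = AF_INET;

	/* Append an RTA_DST attribute carrying the destination address. */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(sizeof(dst));
	memcpy(RTA_DATA(rta), &dst, sizeof(dst));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_LENGTH(sizeof(dst));

	if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
		   (struct sockaddr *)&sa, sizeof(sa)) < 0)
		return -1;

	/* The RTM_NEWROUTE reply built by rt_fill_info() can now be recv()ed. */
	return fd;
}
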
2656
2657int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2658{
2659 struct rtable *rt;
2660 int h, s_h;
2661 int idx, s_idx;
2662
2663 s_h = cb->args[0];
2664 if (s_h < 0)
2665 s_h = 0;
1da177e4 2666 s_idx = idx = cb->args[1];
d8c92830 2667 for (h = s_h; h <= rt_hash_mask; h++) {
2668 rcu_read_lock_bh();
2669 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
093c2ca4 2670 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2671 if (idx < s_idx)
2672 continue;
2673 skb->dst = dst_clone(&rt->u.dst);
2674 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
e905a9ed 2675 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
b6544c0b 2676 1, NLM_F_MULTI) <= 0) {
2677 dst_release(xchg(&skb->dst, NULL));
2678 rcu_read_unlock_bh();
2679 goto done;
2680 }
2681 dst_release(xchg(&skb->dst, NULL));
2682 }
2683 rcu_read_unlock_bh();
d8c92830 2684 s_idx = 0;
2685 }
2686
2687done:
2688 cb->args[0] = h;
2689 cb->args[1] = idx;
2690 return skb->len;
2691}
2692
2693void ip_rt_multicast_event(struct in_device *in_dev)
2694{
2695 rt_cache_flush(0);
2696}
2697
2698#ifdef CONFIG_SYSCTL
2699static int flush_delay;
2700
2701static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2702 struct file *filp, void __user *buffer,
2703 size_t *lenp, loff_t *ppos)
2704{
2705 if (write) {
2706 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2707 rt_cache_flush(flush_delay);
2708 return 0;
e905a9ed 2709 }
2710
2711 return -EINVAL;
2712}
2713
2714static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2715 int __user *name,
2716 int nlen,
2717 void __user *oldval,
2718 size_t __user *oldlenp,
2719 void __user *newval,
1f29bcd7 2720 size_t newlen)
2721{
2722 int delay;
2723 if (newlen != sizeof(int))
2724 return -EINVAL;
2725 if (get_user(delay, (int __user *)newval))
2726 return -EFAULT;
2727 rt_cache_flush(delay);
2728 return 0;
2729}
2730
2731ctl_table ipv4_route_table[] = {
e905a9ed 2732 {
2733 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2734 .procname = "flush",
2735 .data = &flush_delay,
2736 .maxlen = sizeof(int),
7e3e0360 2737 .mode = 0200,
2738 .proc_handler = &ipv4_sysctl_rtcache_flush,
2739 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2740 },
2741 {
2742 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2743 .procname = "min_delay",
2744 .data = &ip_rt_min_delay,
2745 .maxlen = sizeof(int),
2746 .mode = 0644,
2747 .proc_handler = &proc_dointvec_jiffies,
2748 .strategy = &sysctl_jiffies,
2749 },
2750 {
2751 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2752 .procname = "max_delay",
2753 .data = &ip_rt_max_delay,
2754 .maxlen = sizeof(int),
2755 .mode = 0644,
2756 .proc_handler = &proc_dointvec_jiffies,
2757 .strategy = &sysctl_jiffies,
2758 },
2759 {
2760 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2761 .procname = "gc_thresh",
2762 .data = &ipv4_dst_ops.gc_thresh,
2763 .maxlen = sizeof(int),
2764 .mode = 0644,
2765 .proc_handler = &proc_dointvec,
2766 },
2767 {
2768 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2769 .procname = "max_size",
2770 .data = &ip_rt_max_size,
2771 .maxlen = sizeof(int),
2772 .mode = 0644,
2773 .proc_handler = &proc_dointvec,
2774 },
2775 {
2776 /* Deprecated. Use gc_min_interval_ms */
e905a9ed 2777
2778 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2779 .procname = "gc_min_interval",
2780 .data = &ip_rt_gc_min_interval,
2781 .maxlen = sizeof(int),
2782 .mode = 0644,
2783 .proc_handler = &proc_dointvec_jiffies,
2784 .strategy = &sysctl_jiffies,
2785 },
2786 {
2787 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2788 .procname = "gc_min_interval_ms",
2789 .data = &ip_rt_gc_min_interval,
2790 .maxlen = sizeof(int),
2791 .mode = 0644,
2792 .proc_handler = &proc_dointvec_ms_jiffies,
2793 .strategy = &sysctl_ms_jiffies,
2794 },
2795 {
2796 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2797 .procname = "gc_timeout",
2798 .data = &ip_rt_gc_timeout,
2799 .maxlen = sizeof(int),
2800 .mode = 0644,
2801 .proc_handler = &proc_dointvec_jiffies,
2802 .strategy = &sysctl_jiffies,
2803 },
2804 {
2805 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2806 .procname = "gc_interval",
2807 .data = &ip_rt_gc_interval,
2808 .maxlen = sizeof(int),
2809 .mode = 0644,
2810 .proc_handler = &proc_dointvec_jiffies,
2811 .strategy = &sysctl_jiffies,
2812 },
2813 {
2814 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2815 .procname = "redirect_load",
2816 .data = &ip_rt_redirect_load,
2817 .maxlen = sizeof(int),
2818 .mode = 0644,
2819 .proc_handler = &proc_dointvec,
2820 },
2821 {
2822 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2823 .procname = "redirect_number",
2824 .data = &ip_rt_redirect_number,
2825 .maxlen = sizeof(int),
2826 .mode = 0644,
2827 .proc_handler = &proc_dointvec,
2828 },
2829 {
2830 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2831 .procname = "redirect_silence",
2832 .data = &ip_rt_redirect_silence,
2833 .maxlen = sizeof(int),
2834 .mode = 0644,
2835 .proc_handler = &proc_dointvec,
2836 },
2837 {
2838 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2839 .procname = "error_cost",
2840 .data = &ip_rt_error_cost,
2841 .maxlen = sizeof(int),
2842 .mode = 0644,
2843 .proc_handler = &proc_dointvec,
2844 },
2845 {
2846 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2847 .procname = "error_burst",
2848 .data = &ip_rt_error_burst,
2849 .maxlen = sizeof(int),
2850 .mode = 0644,
2851 .proc_handler = &proc_dointvec,
2852 },
2853 {
2854 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2855 .procname = "gc_elasticity",
2856 .data = &ip_rt_gc_elasticity,
2857 .maxlen = sizeof(int),
2858 .mode = 0644,
2859 .proc_handler = &proc_dointvec,
2860 },
2861 {
2862 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2863 .procname = "mtu_expires",
2864 .data = &ip_rt_mtu_expires,
2865 .maxlen = sizeof(int),
2866 .mode = 0644,
2867 .proc_handler = &proc_dointvec_jiffies,
2868 .strategy = &sysctl_jiffies,
2869 },
2870 {
2871 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2872 .procname = "min_pmtu",
2873 .data = &ip_rt_min_pmtu,
2874 .maxlen = sizeof(int),
2875 .mode = 0644,
2876 .proc_handler = &proc_dointvec,
2877 },
2878 {
2879 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2880 .procname = "min_adv_mss",
2881 .data = &ip_rt_min_advmss,
2882 .maxlen = sizeof(int),
2883 .mode = 0644,
2884 .proc_handler = &proc_dointvec,
2885 },
2886 {
2887 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
2888 .procname = "secret_interval",
2889 .data = &ip_rt_secret_interval,
2890 .maxlen = sizeof(int),
2891 .mode = 0644,
2892 .proc_handler = &proc_dointvec_jiffies,
2893 .strategy = &sysctl_jiffies,
2894 },
2895 { .ctl_name = 0 }
2896};
2897#endif
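
For illustration only (not part of route.c): the "flush" entry registered in ipv4_route_table above is what makes the cache flush reachable from userspace, assuming the table is exposed at the usual /proc/sys/net/ipv4/route/ location. A minimal userspace sketch of triggering a flush with a given delay (in seconds), which ends up in ipv4_sysctl_rtcache_flush():

#include <stdio.h>

static int example_flush_route_cache(int delay)
{
	FILE *f = fopen("/proc/sys/net/ipv4/route/flush", "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", delay);	/* handler calls rt_cache_flush(delay) */
	fclose(f);
	return 0;
}
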
2898
2899#ifdef CONFIG_NET_CLS_ROUTE
8dbde28d 2900struct ip_rt_acct *ip_rt_acct __read_mostly;
2901
2902/* IP route accounting ptr for this logical cpu number. */
8dbde28d 2903#define IP_RT_ACCT_CPU(cpu) (per_cpu_ptr(ip_rt_acct, cpu))
2904
2905#ifdef CONFIG_PROC_FS
2906static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2907 int length, int *eof, void *data)
2908{
2909 unsigned int i;
2910
2911 if ((offset & 3) || (length & 3))
2912 return -EIO;
2913
2914 if (offset >= sizeof(struct ip_rt_acct) * 256) {
2915 *eof = 1;
2916 return 0;
2917 }
2918
2919 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2920 length = sizeof(struct ip_rt_acct) * 256 - offset;
2921 *eof = 1;
2922 }
2923
2924 offset /= sizeof(u32);
2925
2926 if (length > 0) {
2927 u32 *dst = (u32 *) buffer;
2928
1da177e4 2929 *start = buffer;
483b23ff 2930 memset(dst, 0, length);
1da177e4 2931
6f912042 2932 for_each_possible_cpu(i) {
1da177e4 2933 unsigned int j;
483b23ff 2934 u32 *src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2935
2936 for (j = 0; j < length/4; j++)
2937 dst[j] += src[j];
2938 }
2939 }
2940 return length;
2941}
2942#endif /* CONFIG_PROC_FS */
2943#endif /* CONFIG_NET_CLS_ROUTE */
2944
2945static __initdata unsigned long rhash_entries;
2946static int __init set_rhash_entries(char *str)
2947{
2948 if (!str)
2949 return 0;
2950 rhash_entries = simple_strtoul(str, &str, 0);
2951 return 1;
2952}
2953__setup("rhash_entries=", set_rhash_entries);
2954
2955int __init ip_rt_init(void)
2956{
424c4b70 2957 int rc = 0;
2958
2959 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2960 (jiffies ^ (jiffies >> 7)));
2961
2962#ifdef CONFIG_NET_CLS_ROUTE
8dbde28d 2963 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
2964 if (!ip_rt_acct)
2965 panic("IP: failed to allocate ip_rt_acct\n");
2966#endif
2967
2968 ipv4_dst_ops.kmem_cachep =
2969 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 2970 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 2971
2972 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2973
2974 rt_hash_table = (struct rt_hash_bucket *)
2975 alloc_large_system_hash("IP route cache",
2976 sizeof(struct rt_hash_bucket),
2977 rhash_entries,
2978 (num_physpages >= 128 * 1024) ?
18955cfc 2979 15 : 17,
8d1502de 2980 0,
2981 &rt_hash_log,
2982 &rt_hash_mask,
2983 0);
2984 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
2985 rt_hash_lock_init();
2986
2987 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2988 ip_rt_max_size = (rt_hash_mask + 1) * 16;
2989
2990 devinet_init();
2991 ip_fib_init();
2992
2993 setup_timer(&rt_flush_timer, rt_run_flush, 0);
2994 setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
2995
2996	/* All the timers started at system startup tend
2997	   to synchronize. Perturb them a bit.
2998	 */
2999 schedule_delayed_work(&expires_work,
3000 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3001
3002 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3003 ip_rt_secret_interval;
3004 add_timer(&rt_secret_timer);
3005
3006#ifdef CONFIG_PROC_FS
3007 {
3008 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
457c4cbc 3009 if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
e905a9ed 3010 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
457c4cbc 3011 init_net.proc_net_stat))) {
3012 return -ENOMEM;
3013 }
3014 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3015 }
3016#ifdef CONFIG_NET_CLS_ROUTE
457c4cbc 3017 create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
3018#endif
3019#endif
3020#ifdef CONFIG_XFRM
3021 xfrm_init();
3022 xfrm4_init();
3023#endif
3024 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3025
3026 return rc;
3027}
3028
3029EXPORT_SYMBOL(__ip_select_ident);
3030EXPORT_SYMBOL(ip_route_input);
3031EXPORT_SYMBOL(ip_route_output_key);