/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_min_delay		= 2 * HZ;
static int ip_rt_max_delay		= 10 * HZ;
static int ip_rt_max_size;
static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
static int ip_rt_gc_interval		= 60 * HZ;
static int ip_rt_gc_min_interval	= HZ / 2;
static int ip_rt_redirect_number	= 9;
static int ip_rt_redirect_load		= HZ / 50;
static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost		= HZ;
static int ip_rt_error_burst		= 5 * HZ;
static int ip_rt_gc_elasticity		= 8;
static int ip_rt_mtu_expires		= 10 * 60 * HZ;
static int ip_rt_min_pmtu		= 512 + 20 + 20;
static int ip_rt_min_advmss		= 256;
static int ip_rt_secret_interval	= 10 * 60 * HZ;
static int ip_rt_flush_expected;
static unsigned long rt_deadline;

#define RTprint(a...)	printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static void rt_worker_func(struct work_struct *work);
static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
static struct timer_list rt_secret_timer;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);


static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		__constant_htons(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		ip_local_out,
	.entry_size =		sizeof(struct rtable),
	.entries =		ATOMIC_INIT(0),
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};


/*
 * Route cache.
 */

/* The locking scheme is rather straight forward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
	struct rtable	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket	*rt_hash_table;
static unsigned			rt_hash_mask;
static unsigned int		rt_hash_log;
static unsigned int		rt_hash_rnd;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)

static int rt_intern_hash(unsigned hash, struct rtable *rth,
				struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr)
{
	return (jhash_2words(daddr, saddr, rt_hash_rnd)
		& rt_hash_mask);
}
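
/*
 * The bucket for a cache entry is jhash_2words(daddr, saddr ^ (ifindex << 5))
 * keyed by rt_hash_rnd and masked with rt_hash_mask.  rt_hash_rnd is
 * re-randomized periodically (see rt_secret_rebuild()), after which the
 * cache is flushed, since existing entries would no longer be found.
 */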
#define rt_hash(daddr, saddr, idx) \
	rt_hash_code((__force u32)(__be32)(daddr),\
		     (__force u32)(__be32)(saddr) ^ ((idx) << 5))

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rtable *r = NULL;
	struct rt_cache_iter_state *st = seq->private;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
		if (r)
			break;
		rcu_read_unlock_bh();
	}
	return rcu_dereference(r);
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = r->u.dst.rt_next;
	while (!r) {
		rcu_read_unlock_bh();
		if (--st->bucket < 0)
			break;
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return rcu_dereference(r);
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r = NULL;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		char temp[256];

		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst);
		seq_printf(seq, "%-127s\n", temp);
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_private(file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_private,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_NET_CLS_ROUTE
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	unsigned int i;

	if ((offset & 3) || (length & 3))
		return -EIO;

	if (offset >= sizeof(struct ip_rt_acct) * 256) {
		*eof = 1;
		return 0;
	}

	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
		length = sizeof(struct ip_rt_acct) * 256 - offset;
		*eof = 1;
	}

	offset /= sizeof(u32);

	if (length > 0) {
		u32 *dst = (u32 *) buffer;

		*start = buffer;
		memset(dst, 0, length);

		for_each_possible_cpu(i) {
			unsigned int j;
			u32 *src;

			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
			for (j = 0; j < length/4; j++)
				dst[j] += src[j];
		}
	}
	return length;
}
#endif

static __init int ip_rt_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = create_proc_entry("rt_cache", S_IRUGO, net->proc_net_stat);
	if (!pde)
		goto err2;

	pde->proc_fops = &rt_cpu_seq_fops;

#ifdef CONFIG_NET_CLS_ROUTE
	pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
			ip_rt_acct_read, NULL);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_NET_CLS_ROUTE
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
#else
static inline int ip_rt_proc_init(struct net *net)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static __inline__ void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.dst.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

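/*
 * rt_may_expire() - an unreferenced entry may be reclaimed once its hard
 * expiry has passed, or once its age exceeds the supplied timeouts (tmo1
 * for ordinary entries, tmo2 for "valuable" ones).  rt_fast_clean()
 * candidates forfeit the tmo1 grace, so colliding broadcast/multicast
 * entries are evicted first.
 */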
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}

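/*
 * compare_keys() ORs together the XOR of the relevant flow key fields, so a
 * single comparison against zero tells whether daddr, saddr, mark, tos, oif
 * and iif of the lookup key all match the cached route.
 */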
static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return rt1->u.dst.dev->nd_net == rt2->u.dst.dev->nd_net;
}

/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		if (process_context && need_resched())
			cond_resched();
		rth = rt_hash_table[i].chain;
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));
		rth = rt_hash_table[i].chain;
		rt_hash_table[i].chain = NULL;
		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; rth; rth = next) {
			next = rth->u.dst.rt_next;
			rt_free(rth);
		}
	}
}

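/*
 * rt_check_expire() scans a slice of the hash table per invocation; the
 * number of buckets visited is derived from ip_rt_gc_interval and
 * ip_rt_gc_timeout, and entries that have aged out are unlinked and freed.
 */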
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, **rthp;
	u64 mult;

	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		if (*rthp == NULL)
			continue;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->u.dst.expires)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.dst.rt_next;
				continue;
			}

			/* Cleanup aged off entries. */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
	}
	rover = i;
}

/*
 * rt_worker_func() is run in process context.
 * If a whole flush was scheduled, it is done.
 * Else, we call rt_check_expire() to scan part of the hash table
 */
static void rt_worker_func(struct work_struct *work)
{
	if (ip_rt_flush_expected) {
		ip_rt_flush_expected = 0;
		rt_do_flush(1);
	} else
		rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long process_context)
{
	rt_deadline = 0;

	get_random_bytes(&rt_hash_rnd, 4);

	rt_do_flush(process_context);
}

static DEFINE_SPINLOCK(rt_flush_lock);

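/*
 * rt_cache_flush() schedules a cache flush after at most "delay" jiffies.
 * A negative delay means "use ip_rt_min_delay", a zero delay flushes
 * synchronously, and a pending flush is never postponed beyond its
 * rt_deadline (bounded by ip_rt_max_delay).
 */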
void rt_cache_flush(int delay)
{
	unsigned long now = jiffies;
	int user_mode = !in_softirq();

	if (delay < 0)
		delay = ip_rt_min_delay;

	spin_lock_bh(&rt_flush_lock);

	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
		long tmo = (long)(rt_deadline - now);

		/* If flush timer is already running
		   and flush request is not immediate (delay > 0):

		   if deadline is not achieved, prolong timer to "delay",
		   otherwise fire it at deadline time.
		 */

		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
			tmo = 0;

		if (delay > tmo)
			delay = tmo;
	}

	if (delay <= 0) {
		spin_unlock_bh(&rt_flush_lock);
		rt_run_flush(user_mode);
		return;
	}

	if (rt_deadline == 0)
		rt_deadline = now + ip_rt_max_delay;

	mod_timer(&rt_flush_timer, now+delay);
	spin_unlock_bh(&rt_flush_lock);
}

/*
 * We change rt_hash_rnd and ask next rt_worker_func() invocation
 * to perform a flush in process context
 */
static void rt_secret_rebuild(unsigned long dummy)
{
	get_random_bytes(&rt_hash_rnd, 4);
	ip_rt_flush_expected = 1;
	cancel_delayed_work(&expires_work);
	schedule_delayed_work(&expires_work, HZ/10);
	mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want an algorithm that keeps the routing cache at some equilibrium
   point, where the number of aged-off entries stays approximately equal
   to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle
   "expire" is large enough to keep plenty of warm entries, and when load
   increases it shrinks to limit the cache size.
 */

static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate number of entries, which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - if expire reduced to zero. Otherwise, expire is halved.
		   - if table is not full.
		   - if we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		     We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}

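/*
 * rt_intern_hash() inserts "rt" into bucket "hash".  If an equivalent entry
 * is already chained, it is promoted to the chain head and reused instead.
 * While scanning, the unreferenced entry with the lowest rt_score() is
 * remembered and evicted once the chain exceeds ip_rt_gc_elasticity.  For
 * output and unicast forwarding routes the dst is also bound to an ARP
 * neighbour; on neighbour table overflow one aggressive GC pass is attempted
 * (process context only) before failing with -ENOBUFS.
 */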
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->u.dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->u.dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * when exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.dst.rt_next;
			rt_free(cand);
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
	if (rt->u.dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
		       NIPQUAD(rt->rt_dst));
		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
		printk("\n");
	}
#endif
	rt_hash_table[hash].chain = rt;
	spin_unlock_bh(rt_hash_lock_addr(hash));
	*rp = rt;
	return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
	static DEFINE_SPINLOCK(rt_peer_lock);
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that stays unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If peer is attached to destination, it is never detached,
		   so that we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp;

	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	for (rthp = &rt_hash_table[hash].chain; *rthp;
	     rthp = &(*rthp)->u.dst.rt_next)
		if (*rthp == rt) {
			*rthp = rt->u.dst.rt_next;
			rt_free(rt);
			break;
		}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

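/*
 * ICMP redirect handling: every matching cache entry is cloned with the new
 * gateway, the old entry is removed, and the clone is kept only if its
 * neighbour entry is (or becomes) valid.
 */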
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	__be32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;

	if (!in_dev)
		return;

	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
	    || ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(&init_net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);

			rthp=&rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0) {
					rthp = &rth->u.dst.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= 0;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
				rt->u.dst.xfrm		= NULL;

				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->u.dst;
				netevent.new = &rt->u.dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
			"%u.%u.%u.%u ignored.\n"
			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
		       NIPQUAD(saddr), NIPQUAD(daddr));
#endif
	in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable*)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif);
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
					  "%u.%u.%u.%u/%02x dropped\n",
				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

	if (!in_dev)
		return;

	if (!IN_DEV_TX_REDIRECTS(in_dev))
		goto out;

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		goto out;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (rt->u.dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
				NIPQUAD(rt->rt_src), rt->rt_iif,
				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
	}
out:
	in_dev_put(in_dev);
}

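/*
 * ip_error() converts the error code stored in the dst into the matching
 * ICMP destination unreachable, rate-limited by a token bucket
 * (ip_rt_error_cost / ip_rt_error_burst) kept in the dst itself, and then
 * frees the skb.
 */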
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
		case EINVAL:
		default:
			goto out;
		case EHOSTUNREACH:
			code = ICMP_HOST_UNREACH;
			break;
		case ENETUNREACH:
			code = ICMP_NET_UNREACH;
			IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
			break;
		case EACCES:
			code = ICMP_PKT_FILTERED;
			break;
	}

	now = jiffies;
	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}

/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}

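/*
 * ip_rt_frag_needed() handles an ICMP FRAG_NEEDED report: matching cached
 * routes towards iph->daddr get their MTU metric lowered (never below
 * ip_rt_min_pmtu, in which case the metric is locked) with an expiry of
 * ip_rt_mtu_expires.  An out-of-range new_mtu falls back to guess_mtu().
 */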
unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
				 unsigned short new_mtu)
{
	int i;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	__be32  skeys[2] = { iph->saddr, 0, };
	__be32  daddr = iph->daddr;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (i = 0; i < 2; i++) {
		unsigned hash = rt_hash(daddr, skeys[i], 0);

		rcu_read_lock();
		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		     rth = rcu_dereference(rth->u.dst.rt_next)) {
			if (rth->fl.fl4_dst == daddr &&
			    rth->fl.fl4_src == skeys[i] &&
			    rth->rt_dst == daddr &&
			    rth->rt_src == iph->saddr &&
			    rth->fl.iif == 0 &&
			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
			    rth->u.dst.dev->nd_net == net) {
				unsigned short mtu = new_mtu;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
						dst_confirm(&rth->u.dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
		}
		rcu_read_unlock();
	}
	return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
		}
		dst->metrics[RTAX_MTU-1] = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
	}
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;
	struct in_device *idev = rt->idev;

	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}

	if (idev) {
		rt->idev = NULL;
		in_dev_put(idev);
	}
}

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	struct rtable *rt = (struct rtable *) dst;
	struct in_device *idev = rt->idev;
	if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) {
		struct in_device *loopback_idev =
			in_dev_get(dev->nd_net->loopback_dev);
		if (loopback_idev) {
			rt->idev = loopback_idev;
			in_dev_put(idev);
		}
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = (struct rtable *) skb->dst;
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
		NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}

/*
   We do not cache source address of outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so that it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	__be32 src;
	struct fib_result res;

	if (rt->fl.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) {
		src = FIB_RES_PREFSRC(res);
		fib_res_put(&res);
	} else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif

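/*
 * rt_set_nexthop() copies the FIB metrics into the cache entry, picks the
 * FIB gateway for link-scope nexthops, and fills in defaults and clamps for
 * the MTU, hoplimit and advmss metrics (plus the routing class tag when
 * CONFIG_NET_CLS_ROUTE is enabled).
 */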
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;

	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
				       ip_rt_min_advmss);
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}

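/*
 * Input path for multicast destinations: validate the source address, build
 * a cache entry that is delivered locally (and/or handed to ip_mr_input()
 * when multicast forwarding is enabled) and hash it on the input interface.
 */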
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = in_dev_get(dev);
	u32 itag = 0;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
					dev, &spec_dst, &itag) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= init_net.loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_flags	= RTCF_MULTICAST;
	if (our) {
		rth->u.dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	in_dev_put(in_dev);
	hash = rt_hash(daddr, saddr, dev->ifindex);
	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);

e_nobufs:
	in_dev_put(in_dev);
	return -ENOBUFS;

e_inval:
	in_dev_put(in_dev);
	return -EINVAL;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
			"%u.%u.%u.%u, on dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}

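/*
 * __mkroute_input() builds the cache entry for a forwarded packet: the
 * source is validated against the FIB, redirect conditions are detected,
 * and the dst is wired to ip_forward()/ip_output().
 */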
static inline int __mkroute_input(struct sk_buff *skb,
				  struct fib_result* res,
				  struct in_device *in_dev,
				  __be32 daddr, __be32 saddr, u32 tos,
				  struct rtable **result)
{

	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}


	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		err = -EINVAL;
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 */
		if (out_dev == in_dev) {
			err = -EINVAL;
			goto cleanup;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
	rth->rt_iif	=
		rth->fl.iif	= in_dev->dev->ifindex;
	rth->u.dst.dev	= (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_spec_dst= spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}

static inline int ip_mkroute_input(struct sk_buff *skb,
				   struct fib_result* res,
				   const struct flowi *fl,
				   struct in_device *in_dev,
				   __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable* rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl->iif);
	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
}

/*
 *	NOTE. We drop all the packets that have local source
 *	addresses, because every properly looped back packet
 *	must have correct destination already attached by output routine.
 *
 *	Such approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with 100% guarantee.
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
				      } },
			    .mark = skb->mark,
			    .iif = dev->ifindex };
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	int		free_res = 0;
	struct net    * net = dev->nd_net;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
	    ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	if ((err = fib_lookup(net, &fl, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos,
					     net->loopback_dev->ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
done:
	in_dev_put(in_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= net->loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input= ip_local_deliver;
	rth->rt_flags	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input= ip_error;
		rth->u.dst.error= -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash(daddr, saddr, fl.iif);
	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
	goto done;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2033 "%u.%u.%u.%u, dev %s\n",
2034 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2035#endif
2c2910a4
DE
2036
2037e_hostunreach:
e905a9ed
YH
2038 err = -EHOSTUNREACH;
2039 goto done;
2c2910a4 2040
1da177e4
LT
2041e_inval:
2042 err = -EINVAL;
2043 goto done;
2044
2045e_nobufs:
2046 err = -ENOBUFS;
2047 goto done;
2048
2049martian_source:
2050 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2051 goto e_inval;
2052}
2053
9e12bb22 2054int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
2055 u8 tos, struct net_device *dev)
2056{
2057 struct rtable * rth;
2058 unsigned hash;
2059 int iif = dev->ifindex;
b5921910 2060 struct net *net;
1da177e4 2061
b5921910 2062 net = skb->dev->nd_net;
1da177e4 2063 tos &= IPTOS_RT_MASK;
8c7bc840 2064 hash = rt_hash(daddr, saddr, iif);
1da177e4
LT
2065
2066 rcu_read_lock();
2067 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
093c2ca4 2068 rth = rcu_dereference(rth->u.dst.rt_next)) {
1da177e4
LT
2069 if (rth->fl.fl4_dst == daddr &&
2070 rth->fl.fl4_src == saddr &&
2071 rth->fl.iif == iif &&
2072 rth->fl.oif == 0 &&
47dcf0cb 2073 rth->fl.mark == skb->mark &&
b5921910
DL
2074 rth->fl.fl4_tos == tos &&
2075 rth->u.dst.dev->nd_net == net) {
03f49f34 2076 dst_use(&rth->u.dst, jiffies);
1da177e4
LT
2077 RT_CACHE_STAT_INC(in_hit);
2078 rcu_read_unlock();
2079 skb->dst = (struct dst_entry*)rth;
2080 return 0;
2081 }
2082 RT_CACHE_STAT_INC(in_hlist_search);
2083 }
2084 rcu_read_unlock();
2085
2086 /* Multicast recognition logic has moved from the route cache to here.
2087 The problem was that too many Ethernet cards have broken/missing
2088 hardware multicast filters :-( As a result, a host on a multicast
2089 network acquires a lot of useless route cache entries, e.g. from
2090 SDR messages from all over the world. Now we try to get rid of them.
2091 Really, provided the software IP multicast filter is organized
2092 reasonably (at least, hashed), it does not result in a slowdown
2093 compared with route cache reject entries.
2094 Note that multicast routers are not affected, because a
2095 route cache entry is created for them eventually (see the
2096 sketch after this function). */
f97c1e0c 2097 if (ipv4_is_multicast(daddr)) {
1da177e4
LT
2098 struct in_device *in_dev;
2099
2100 rcu_read_lock();
e5ed6399 2101 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
1da177e4 2102 int our = ip_check_mc(in_dev, daddr, saddr,
eddc9ec5 2103 ip_hdr(skb)->protocol);
1da177e4
LT
2104 if (our
2105#ifdef CONFIG_IP_MROUTE
f97c1e0c
JP
2106 || (!ipv4_is_local_multicast(daddr) &&
2107 IN_DEV_MFORWARD(in_dev))
1da177e4
LT
2108#endif
2109 ) {
2110 rcu_read_unlock();
2111 return ip_route_input_mc(skb, daddr, saddr,
2112 tos, dev, our);
2113 }
2114 }
2115 rcu_read_unlock();
2116 return -EINVAL;
2117 }
2118 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2119}
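/*
 * [Editor's sketch, not part of route.c] For context on how the input
 * resolvers above are driven, this is a condensed, hedged sketch of the
 * typical caller on the receive path (ip_rcv_finish() in
 * net/ipv4/ip_input.c); it is a paraphrase from memory, not a verbatim
 * copy of that function.
 */
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/ip.h>
#include <net/dst.h>
#include <net/route.h>

static int ip_rcv_finish_sketch(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	/* Route the packet if nothing has attached a dst yet; the slow
	 * path ends up in ip_route_input_slow() above, and multicast is
	 * recognized directly in ip_route_input() as the comment there
	 * explains. */
	if (skb->dst == NULL &&
	    ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev)) {
		kfree_skb(skb);
		return NET_RX_DROP;
	}

	/* skb->dst->input is now ip_local_deliver, ip_forward or
	 * ip_mr_input, as set up by the mkroute helpers above. */
	return dst_input(skb);
}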
2120
2121static inline int __mkroute_output(struct rtable **result,
e905a9ed 2122 struct fib_result* res,
1da177e4 2123 const struct flowi *fl,
e905a9ed
YH
2124 const struct flowi *oldflp,
2125 struct net_device *dev_out,
2126 unsigned flags)
1da177e4
LT
2127{
2128 struct rtable *rth;
2129 struct in_device *in_dev;
2130 u32 tos = RT_FL_TOS(oldflp);
2131 int err = 0;
2132
f97c1e0c 2133 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
1da177e4
LT
2134 return -EINVAL;
2135
e448515c 2136 if (fl->fl4_dst == htonl(0xFFFFFFFF))
1da177e4 2137 res->type = RTN_BROADCAST;
f97c1e0c 2138 else if (ipv4_is_multicast(fl->fl4_dst))
1da177e4 2139 res->type = RTN_MULTICAST;
1e637c74 2140 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
1da177e4
LT
2141 return -EINVAL;
2142
2143 if (dev_out->flags & IFF_LOOPBACK)
2144 flags |= RTCF_LOCAL;
2145
2146 /* get a working reference to the inet device */
2147 in_dev = in_dev_get(dev_out);
2148 if (!in_dev)
2149 return -EINVAL;
2150
2151 if (res->type == RTN_BROADCAST) {
2152 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2153 if (res->fi) {
2154 fib_info_put(res->fi);
2155 res->fi = NULL;
2156 }
2157 } else if (res->type == RTN_MULTICAST) {
2158 flags |= RTCF_MULTICAST|RTCF_LOCAL;
e905a9ed 2159 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
1da177e4
LT
2160 oldflp->proto))
2161 flags &= ~RTCF_LOCAL;
2162 /* If a multicast route does not exist, use the
2163 default one, but do not gateway in this case.
2164 Yes, it is a hack.
2165 */
2166 if (res->fi && res->prefixlen < 4) {
2167 fib_info_put(res->fi);
2168 res->fi = NULL;
2169 }
2170 }
2171
2172
2173 rth = dst_alloc(&ipv4_dst_ops);
2174 if (!rth) {
2175 err = -ENOBUFS;
2176 goto cleanup;
e905a9ed 2177 }
1da177e4 2178
ce723d8e 2179 atomic_set(&rth->u.dst.__refcnt, 1);
1da177e4 2180 rth->u.dst.flags= DST_HOST;
42f811b8 2181 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
1da177e4 2182 rth->u.dst.flags |= DST_NOXFRM;
42f811b8 2183 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4
LT
2184 rth->u.dst.flags |= DST_NOPOLICY;
2185
2186 rth->fl.fl4_dst = oldflp->fl4_dst;
2187 rth->fl.fl4_tos = tos;
2188 rth->fl.fl4_src = oldflp->fl4_src;
2189 rth->fl.oif = oldflp->oif;
47dcf0cb 2190 rth->fl.mark = oldflp->mark;
1da177e4
LT
2191 rth->rt_dst = fl->fl4_dst;
2192 rth->rt_src = fl->fl4_src;
2193 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
e905a9ed 2194 /* get references to the devices that are to be held by the routing
1da177e4
LT
2195 cache entry */
2196 rth->u.dst.dev = dev_out;
2197 dev_hold(dev_out);
2198 rth->idev = in_dev_get(dev_out);
2199 rth->rt_gateway = fl->fl4_dst;
2200 rth->rt_spec_dst= fl->fl4_src;
2201
2202 rth->u.dst.output=ip_output;
2203
2204 RT_CACHE_STAT_INC(out_slow_tot);
2205
2206 if (flags & RTCF_LOCAL) {
2207 rth->u.dst.input = ip_local_deliver;
2208 rth->rt_spec_dst = fl->fl4_dst;
2209 }
2210 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2211 rth->rt_spec_dst = fl->fl4_src;
e905a9ed 2212 if (flags & RTCF_LOCAL &&
1da177e4
LT
2213 !(dev_out->flags & IFF_LOOPBACK)) {
2214 rth->u.dst.output = ip_mc_output;
2215 RT_CACHE_STAT_INC(out_slow_mc);
2216 }
2217#ifdef CONFIG_IP_MROUTE
2218 if (res->type == RTN_MULTICAST) {
2219 if (IN_DEV_MFORWARD(in_dev) &&
f97c1e0c 2220 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
1da177e4
LT
2221 rth->u.dst.input = ip_mr_input;
2222 rth->u.dst.output = ip_mc_output;
2223 }
2224 }
2225#endif
2226 }
2227
2228 rt_set_nexthop(rth, res, 0);
2229
2230 rth->rt_flags = flags;
2231
2232 *result = rth;
2233 cleanup:
2234 /* release the working reference to the inet device */
2235 in_dev_put(in_dev);
2236
2237 return err;
2238}
2239
e06e7c61
DM
2240static inline int ip_mkroute_output(struct rtable **rp,
2241 struct fib_result* res,
2242 const struct flowi *fl,
2243 const struct flowi *oldflp,
2244 struct net_device *dev_out,
2245 unsigned flags)
1da177e4 2246{
7abaa27c 2247 struct rtable *rth = NULL;
1da177e4
LT
2248 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2249 unsigned hash;
2250 if (err == 0) {
8c7bc840 2251 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
1da177e4
LT
2252 err = rt_intern_hash(hash, rth, rp);
2253 }
e905a9ed 2254
1da177e4
LT
2255 return err;
2256}
2257
1da177e4
LT
2258/*
2259 * Major route resolver routine.
2260 */
2261
b40afd0e
DL
2262static int ip_route_output_slow(struct net *net, struct rtable **rp,
2263 const struct flowi *oldflp)
1da177e4
LT
2264{
2265 u32 tos = RT_FL_TOS(oldflp);
2266 struct flowi fl = { .nl_u = { .ip4_u =
2267 { .daddr = oldflp->fl4_dst,
2268 .saddr = oldflp->fl4_src,
2269 .tos = tos & IPTOS_RT_MASK,
2270 .scope = ((tos & RTO_ONLINK) ?
2271 RT_SCOPE_LINK :
2272 RT_SCOPE_UNIVERSE),
1da177e4 2273 } },
47dcf0cb 2274 .mark = oldflp->mark,
b40afd0e 2275 .iif = net->loopback_dev->ifindex,
1da177e4
LT
2276 .oif = oldflp->oif };
2277 struct fib_result res;
2278 unsigned flags = 0;
2279 struct net_device *dev_out = NULL;
2280 int free_res = 0;
2281 int err;
2282
2283
2284 res.fi = NULL;
2285#ifdef CONFIG_IP_MULTIPLE_TABLES
2286 res.r = NULL;
2287#endif
2288
2289 if (oldflp->fl4_src) {
2290 err = -EINVAL;
f97c1e0c 2291 if (ipv4_is_multicast(oldflp->fl4_src) ||
1e637c74 2292 ipv4_is_lbcast(oldflp->fl4_src) ||
f97c1e0c 2293 ipv4_is_zeronet(oldflp->fl4_src))
1da177e4
LT
2294 goto out;
2295
2296 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
b40afd0e 2297 dev_out = ip_dev_find(net, oldflp->fl4_src);
f6c5d736 2298 if (dev_out == NULL)
1da177e4
LT
2299 goto out;
2300
2301 /* I removed the check for oif == dev_out->oif here.
2302 It was wrong for two reasons:
1ab35276
DL
2303 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2304 is assigned to multiple interfaces.
1da177e4
LT
2305 2. Moreover, we are allowed to send packets with saddr
2306 of another iface. --ANK
2307 */
2308
f6c5d736 2309 if (oldflp->oif == 0
f97c1e0c
JP
2310 && (ipv4_is_multicast(oldflp->fl4_dst) ||
2311 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
1da177e4
LT
2312 /* Special hack: the user can direct multicasts
2313 and limited broadcast via the necessary interface
2314 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2315 This hack is not just for fun; it allows
2316 vic, vat and friends to work.
2317 They bind the socket to loopback, set the ttl to zero
2318 and expect that it will work.
2319 From the viewpoint of the routing cache they are broken,
2320 because we are not allowed to build a multicast path
2321 with a loopback source addr (look, the routing cache
2322 cannot know that the ttl is zero, so the packet
2323 will not leave this host and the route is valid).
2324 Luckily, this hack is a good workaround (see the
2325 userspace sketch after this function). */
2326
2327 fl.oif = dev_out->ifindex;
2328 goto make_route;
2329 }
2330 if (dev_out)
2331 dev_put(dev_out);
2332 dev_out = NULL;
2333 }
2334
2335
2336 if (oldflp->oif) {
b40afd0e 2337 dev_out = dev_get_by_index(net, oldflp->oif);
1da177e4
LT
2338 err = -ENODEV;
2339 if (dev_out == NULL)
2340 goto out;
e5ed6399
HX
2341
2342 /* RACE: Check return value of inet_select_addr instead. */
2343 if (__in_dev_get_rtnl(dev_out) == NULL) {
1da177e4
LT
2344 dev_put(dev_out);
2345 goto out; /* Wrong error code */
2346 }
2347
f97c1e0c
JP
2348 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2349 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
1da177e4
LT
2350 if (!fl.fl4_src)
2351 fl.fl4_src = inet_select_addr(dev_out, 0,
2352 RT_SCOPE_LINK);
2353 goto make_route;
2354 }
2355 if (!fl.fl4_src) {
f97c1e0c 2356 if (ipv4_is_multicast(oldflp->fl4_dst))
1da177e4
LT
2357 fl.fl4_src = inet_select_addr(dev_out, 0,
2358 fl.fl4_scope);
2359 else if (!oldflp->fl4_dst)
2360 fl.fl4_src = inet_select_addr(dev_out, 0,
2361 RT_SCOPE_HOST);
2362 }
2363 }
2364
2365 if (!fl.fl4_dst) {
2366 fl.fl4_dst = fl.fl4_src;
2367 if (!fl.fl4_dst)
2368 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2369 if (dev_out)
2370 dev_put(dev_out);
b40afd0e 2371 dev_out = net->loopback_dev;
1da177e4 2372 dev_hold(dev_out);
b40afd0e 2373 fl.oif = net->loopback_dev->ifindex;
1da177e4
LT
2374 res.type = RTN_LOCAL;
2375 flags |= RTCF_LOCAL;
2376 goto make_route;
2377 }
2378
b40afd0e 2379 if (fib_lookup(net, &fl, &res)) {
1da177e4
LT
2380 res.fi = NULL;
2381 if (oldflp->oif) {
2382 /* Apparently, the routing tables are wrong. Assume
2383 that the destination is on-link.
2384
2385 WHY? DW.
2386 Because we are allowed to send to an iface
2387 even if it has NO routes and NO assigned
2388 addresses. When oif is specified, the routing
2389 tables are looked up with only one purpose:
2390 to catch whether the destination is gatewayed rather than
2391 direct. Moreover, if MSG_DONTROUTE is set,
2392 we send the packet, ignoring both routing tables
2393 and ifaddr state. --ANK
2394
2395
2396 We could do this even if oif is unknown
2397 (IPv6 likely does), but we do not.
2398 */
2399
2400 if (fl.fl4_src == 0)
2401 fl.fl4_src = inet_select_addr(dev_out, 0,
2402 RT_SCOPE_LINK);
2403 res.type = RTN_UNICAST;
2404 goto make_route;
2405 }
2406 if (dev_out)
2407 dev_put(dev_out);
2408 err = -ENETUNREACH;
2409 goto out;
2410 }
2411 free_res = 1;
2412
2413 if (res.type == RTN_LOCAL) {
2414 if (!fl.fl4_src)
2415 fl.fl4_src = fl.fl4_dst;
2416 if (dev_out)
2417 dev_put(dev_out);
b40afd0e 2418 dev_out = net->loopback_dev;
1da177e4
LT
2419 dev_hold(dev_out);
2420 fl.oif = dev_out->ifindex;
2421 if (res.fi)
2422 fib_info_put(res.fi);
2423 res.fi = NULL;
2424 flags |= RTCF_LOCAL;
2425 goto make_route;
2426 }
2427
2428#ifdef CONFIG_IP_ROUTE_MULTIPATH
2429 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2430 fib_select_multipath(&fl, &res);
2431 else
2432#endif
2433 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
b40afd0e 2434 fib_select_default(net, &fl, &res);
1da177e4
LT
2435
2436 if (!fl.fl4_src)
2437 fl.fl4_src = FIB_RES_PREFSRC(res);
2438
2439 if (dev_out)
2440 dev_put(dev_out);
2441 dev_out = FIB_RES_DEV(res);
2442 dev_hold(dev_out);
2443 fl.oif = dev_out->ifindex;
2444
2445
2446make_route:
2447 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2448
2449
2450 if (free_res)
2451 fib_res_put(&res);
2452 if (dev_out)
2453 dev_put(dev_out);
2454out: return err;
2455}
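/*
 * [Editor's sketch, not part of route.c] The "special hack" comment in
 * ip_route_output_slow() above describes applications that pick the
 * outgoing interface implicitly through the bound source address.  A
 * minimal userspace illustration, assuming a Linux host: bind a UDP
 * socket to a local address and send to a multicast group without
 * setting IP_MULTICAST_IF, so the resolver chooses the device from the
 * source address (oif == 0 with a multicast daddr).  The group address
 * and port are illustrative only.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

static int send_mcast_bound_to_saddr(const char *local_ip)
{
	struct sockaddr_in src = { .sin_family = AF_INET };
	struct sockaddr_in dst = { .sin_family = AF_INET,
				   .sin_port = htons(5004) };
	const char payload[] = "hello";
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;
	inet_pton(AF_INET, local_ip, &src.sin_addr);
	inet_pton(AF_INET, "224.2.127.254", &dst.sin_addr); /* SAP/sdr group */

	if (bind(fd, (struct sockaddr *)&src, sizeof(src)) < 0 ||
	    sendto(fd, payload, sizeof(payload), 0,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}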
2456
611c183e
DL
2457int __ip_route_output_key(struct net *net, struct rtable **rp,
2458 const struct flowi *flp)
1da177e4
LT
2459{
2460 unsigned hash;
2461 struct rtable *rth;
2462
8c7bc840 2463 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
1da177e4
LT
2464
2465 rcu_read_lock_bh();
2466 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
093c2ca4 2467 rth = rcu_dereference(rth->u.dst.rt_next)) {
1da177e4
LT
2468 if (rth->fl.fl4_dst == flp->fl4_dst &&
2469 rth->fl.fl4_src == flp->fl4_src &&
2470 rth->fl.iif == 0 &&
2471 rth->fl.oif == flp->oif &&
47dcf0cb 2472 rth->fl.mark == flp->mark &&
1da177e4 2473 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
b5921910
DL
2474 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2475 rth->u.dst.dev->nd_net == net) {
03f49f34 2476 dst_use(&rth->u.dst, jiffies);
1da177e4
LT
2477 RT_CACHE_STAT_INC(out_hit);
2478 rcu_read_unlock_bh();
2479 *rp = rth;
2480 return 0;
2481 }
2482 RT_CACHE_STAT_INC(out_hlist_search);
2483 }
2484 rcu_read_unlock_bh();
2485
611c183e 2486 return ip_route_output_slow(net, rp, flp);
1da177e4
LT
2487}
2488
d8c97a94
ACM
2489EXPORT_SYMBOL_GPL(__ip_route_output_key);
2490
14e50e57
DM
2491static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2492{
2493}
2494
2495static struct dst_ops ipv4_dst_blackhole_ops = {
2496 .family = AF_INET,
2497 .protocol = __constant_htons(ETH_P_IP),
2498 .destroy = ipv4_dst_destroy,
2499 .check = ipv4_dst_check,
2500 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2501 .entry_size = sizeof(struct rtable),
e2422970 2502 .entries = ATOMIC_INIT(0),
14e50e57
DM
2503};
2504
2505
14e50e57
DM
2506static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2507{
2508 struct rtable *ort = *rp;
2509 struct rtable *rt = (struct rtable *)
2510 dst_alloc(&ipv4_dst_blackhole_ops);
2511
2512 if (rt) {
2513 struct dst_entry *new = &rt->u.dst;
2514
2515 atomic_set(&new->__refcnt, 1);
2516 new->__use = 1;
352e512c
HX
2517 new->input = dst_discard;
2518 new->output = dst_discard;
14e50e57
DM
2519 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2520
2521 new->dev = ort->u.dst.dev;
2522 if (new->dev)
2523 dev_hold(new->dev);
2524
2525 rt->fl = ort->fl;
2526
2527 rt->idev = ort->idev;
2528 if (rt->idev)
2529 in_dev_hold(rt->idev);
2530 rt->rt_flags = ort->rt_flags;
2531 rt->rt_type = ort->rt_type;
2532 rt->rt_dst = ort->rt_dst;
2533 rt->rt_src = ort->rt_src;
2534 rt->rt_iif = ort->rt_iif;
2535 rt->rt_gateway = ort->rt_gateway;
2536 rt->rt_spec_dst = ort->rt_spec_dst;
2537 rt->peer = ort->peer;
2538 if (rt->peer)
2539 atomic_inc(&rt->peer->refcnt);
2540
2541 dst_free(new);
2542 }
2543
2544 dst_release(&(*rp)->u.dst);
2545 *rp = rt;
2546 return (rt ? 0 : -ENOMEM);
2547}
2548
f1b050bf
DL
2549int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2550 struct sock *sk, int flags)
1da177e4
LT
2551{
2552 int err;
2553
f1b050bf 2554 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
1da177e4
LT
2555 return err;
2556
2557 if (flp->proto) {
2558 if (!flp->fl4_src)
2559 flp->fl4_src = (*rp)->rt_src;
2560 if (!flp->fl4_dst)
2561 flp->fl4_dst = (*rp)->rt_dst;
bb72845e
HX
2562 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2563 flags ? XFRM_LOOKUP_WAIT : 0);
14e50e57
DM
2564 if (err == -EREMOTE)
2565 err = ipv4_dst_blackhole(rp, flp, sk);
2566
2567 return err;
1da177e4
LT
2568 }
2569
2570 return 0;
2571}
2572
d8c97a94
ACM
2573EXPORT_SYMBOL_GPL(ip_route_output_flow);
2574
f206351a 2575int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
1da177e4 2576{
f206351a 2577 return ip_route_output_flow(net, rp, flp, NULL, 0);
1da177e4
LT
2578}
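/*
 * [Editor's sketch, not part of route.c] A minimal kernel-side sketch of
 * driving the output resolver defined above: build a flowi by hand and
 * let ip_route_output_key() either hit the cache in
 * __ip_route_output_key() or fall back to ip_route_output_slow().  The
 * destination address and the helper name are illustrative.
 */
#include <linux/in.h>
#include <net/flow.h>
#include <net/route.h>

static int output_route_sketch(struct net *net)
{
	struct rtable *rt;
	struct flowi fl = {
		.oif  = 0,			/* let the resolver pick the device */
		.nl_u = { .ip4_u = {
			.daddr = htonl(0x08080808),	/* 8.8.8.8, example only */
			.saddr = 0,			/* filled from FIB_RES_PREFSRC */
			.tos   = 0,
		} },
	};
	int err = ip_route_output_key(net, &rt, &fl);

	if (err)
		return err;
	/* rt->rt_gateway and rt->u.dst.dev now describe the next hop */
	ip_rt_put(rt);
	return 0;
}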
2579
2580static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
b6544c0b 2581 int nowait, unsigned int flags)
1da177e4
LT
2582{
2583 struct rtable *rt = (struct rtable*)skb->dst;
2584 struct rtmsg *r;
be403ea1 2585 struct nlmsghdr *nlh;
e3703b3d
TG
2586 long expires;
2587 u32 id = 0, ts = 0, tsage = 0, error;
be403ea1
TG
2588
2589 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2590 if (nlh == NULL)
26932566 2591 return -EMSGSIZE;
be403ea1
TG
2592
2593 r = nlmsg_data(nlh);
1da177e4
LT
2594 r->rtm_family = AF_INET;
2595 r->rtm_dst_len = 32;
2596 r->rtm_src_len = 0;
2597 r->rtm_tos = rt->fl.fl4_tos;
2598 r->rtm_table = RT_TABLE_MAIN;
be403ea1 2599 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
1da177e4
LT
2600 r->rtm_type = rt->rt_type;
2601 r->rtm_scope = RT_SCOPE_UNIVERSE;
2602 r->rtm_protocol = RTPROT_UNSPEC;
2603 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2604 if (rt->rt_flags & RTCF_NOTIFY)
2605 r->rtm_flags |= RTM_F_NOTIFY;
be403ea1 2606
17fb2c64 2607 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
be403ea1 2608
1da177e4
LT
2609 if (rt->fl.fl4_src) {
2610 r->rtm_src_len = 32;
17fb2c64 2611 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
1da177e4
LT
2612 }
2613 if (rt->u.dst.dev)
be403ea1 2614 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
1da177e4
LT
2615#ifdef CONFIG_NET_CLS_ROUTE
2616 if (rt->u.dst.tclassid)
be403ea1 2617 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
1da177e4
LT
2618#endif
2619 if (rt->fl.iif)
17fb2c64 2620 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
1da177e4 2621 else if (rt->rt_src != rt->fl.fl4_src)
17fb2c64 2622 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
be403ea1 2623
1da177e4 2624 if (rt->rt_dst != rt->rt_gateway)
17fb2c64 2625 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
be403ea1 2626
1da177e4 2627 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
be403ea1
TG
2628 goto nla_put_failure;
2629
e3703b3d
TG
2630 error = rt->u.dst.error;
2631 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
1da177e4 2632 if (rt->peer) {
e3703b3d 2633 id = rt->peer->ip_id_count;
1da177e4 2634 if (rt->peer->tcp_ts_stamp) {
e3703b3d 2635 ts = rt->peer->tcp_ts;
9d729f72 2636 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
1da177e4
LT
2637 }
2638 }
be403ea1 2639
1da177e4
LT
2640 if (rt->fl.iif) {
2641#ifdef CONFIG_IP_MROUTE
e448515c 2642 __be32 dst = rt->rt_dst;
1da177e4 2643
f97c1e0c 2644 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
586f1211 2645 IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
1da177e4
LT
2646 int err = ipmr_get_route(skb, r, nowait);
2647 if (err <= 0) {
2648 if (!nowait) {
2649 if (err == 0)
2650 return 0;
be403ea1 2651 goto nla_put_failure;
1da177e4
LT
2652 } else {
2653 if (err == -EMSGSIZE)
be403ea1 2654 goto nla_put_failure;
e3703b3d 2655 error = err;
1da177e4
LT
2656 }
2657 }
2658 } else
2659#endif
be403ea1 2660 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
1da177e4
LT
2661 }
2662
e3703b3d
TG
2663 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2664 expires, error) < 0)
2665 goto nla_put_failure;
be403ea1
TG
2666
2667 return nlmsg_end(skb, nlh);
1da177e4 2668
be403ea1 2669nla_put_failure:
26932566
PM
2670 nlmsg_cancel(skb, nlh);
2671 return -EMSGSIZE;
1da177e4
LT
2672}
2673
63f3444f 2674static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1da177e4 2675{
b854272b 2676 struct net *net = in_skb->sk->sk_net;
d889ce3b
TG
2677 struct rtmsg *rtm;
2678 struct nlattr *tb[RTA_MAX+1];
1da177e4 2679 struct rtable *rt = NULL;
9e12bb22
AV
2680 __be32 dst = 0;
2681 __be32 src = 0;
2682 u32 iif;
d889ce3b 2683 int err;
1da177e4
LT
2684 struct sk_buff *skb;
2685
b854272b
DL
2686 if (net != &init_net)
2687 return -EINVAL;
2688
d889ce3b
TG
2689 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2690 if (err < 0)
2691 goto errout;
2692
2693 rtm = nlmsg_data(nlh);
2694
1da177e4 2695 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
d889ce3b
TG
2696 if (skb == NULL) {
2697 err = -ENOBUFS;
2698 goto errout;
2699 }
1da177e4
LT
2700
2701 /* Reserve room for dummy headers; this skb can pass
2702 through a good chunk of the routing engine.
2703 */
459a98ed 2704 skb_reset_mac_header(skb);
c1d2bbe1 2705 skb_reset_network_header(skb);
d2c962b8
SH
2706
2707 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
eddc9ec5 2708 ip_hdr(skb)->protocol = IPPROTO_ICMP;
1da177e4
LT
2709 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2710
17fb2c64
AV
2711 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2712 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
d889ce3b 2713 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
1da177e4
LT
2714
2715 if (iif) {
d889ce3b
TG
2716 struct net_device *dev;
2717
881d966b 2718 dev = __dev_get_by_index(&init_net, iif);
d889ce3b
TG
2719 if (dev == NULL) {
2720 err = -ENODEV;
2721 goto errout_free;
2722 }
2723
1da177e4
LT
2724 skb->protocol = htons(ETH_P_IP);
2725 skb->dev = dev;
2726 local_bh_disable();
2727 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2728 local_bh_enable();
d889ce3b
TG
2729
2730 rt = (struct rtable*) skb->dst;
2731 if (err == 0 && rt->u.dst.error)
1da177e4
LT
2732 err = -rt->u.dst.error;
2733 } else {
d889ce3b
TG
2734 struct flowi fl = {
2735 .nl_u = {
2736 .ip4_u = {
2737 .daddr = dst,
2738 .saddr = src,
2739 .tos = rtm->rtm_tos,
2740 },
2741 },
2742 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2743 };
f206351a 2744 err = ip_route_output_key(&init_net, &rt, &fl);
1da177e4 2745 }
d889ce3b 2746
1da177e4 2747 if (err)
d889ce3b 2748 goto errout_free;
1da177e4
LT
2749
2750 skb->dst = &rt->u.dst;
2751 if (rtm->rtm_flags & RTM_F_NOTIFY)
2752 rt->rt_flags |= RTCF_NOTIFY;
2753
1da177e4 2754 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
b6544c0b 2755 RTM_NEWROUTE, 0, 0);
d889ce3b
TG
2756 if (err <= 0)
2757 goto errout_free;
1da177e4 2758
97c53cac 2759 err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
d889ce3b 2760errout:
2942e900 2761 return err;
1da177e4 2762
d889ce3b 2763errout_free:
1da177e4 2764 kfree_skb(skb);
d889ce3b 2765 goto errout;
1da177e4
LT
2766}
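/*
 * [Editor's sketch, not part of route.c] inet_rtm_getroute() above is
 * what answers an "ip route get" style query.  A minimal userspace
 * sketch of issuing that RTM_GETROUTE request over a raw NETLINK_ROUTE
 * socket, assuming a Linux host; the destination address and buffer
 * sizes are illustrative, and the single reply (built by rt_fill_info())
 * is not parsed here.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg    rtm;
		char            attrs[64];
	} req;
	struct rtattr *rta;
	uint32_t dst;
	char buf[4096];
	ssize_t n;
	int fd;

	memset(&req, 0, sizeof(req));
	inet_pton(AF_INET, "8.8.8.8", &dst);	/* example destination */

	req.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type  = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family  = AF_INET;
	req.rtm.rtm_dst_len = 32;

	/* append the RTA_DST attribute that rtm_ipv4_policy accepts */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len  = RTA_LENGTH(sizeof(dst));
	memcpy(RTA_DATA(rta), &dst, sizeof(dst));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_LENGTH(sizeof(dst));

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return 1;
	if (send(fd, &req, req.nlh.nlmsg_len, 0) < 0) {
		close(fd);
		return 1;
	}
	n = recv(fd, buf, sizeof(buf), 0);	/* RTM_NEWROUTE reply */
	printf("received %zd bytes of RTM_NEWROUTE reply\n", n);
	close(fd);
	return 0;
}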
2767
2768int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2769{
2770 struct rtable *rt;
2771 int h, s_h;
2772 int idx, s_idx;
2773
2774 s_h = cb->args[0];
d8c92830
ED
2775 if (s_h < 0)
2776 s_h = 0;
1da177e4 2777 s_idx = idx = cb->args[1];
d8c92830 2778 for (h = s_h; h <= rt_hash_mask; h++) {
1da177e4
LT
2779 rcu_read_lock_bh();
2780 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
093c2ca4 2781 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
1da177e4
LT
2782 if (idx < s_idx)
2783 continue;
2784 skb->dst = dst_clone(&rt->u.dst);
2785 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
e905a9ed 2786 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
b6544c0b 2787 1, NLM_F_MULTI) <= 0) {
1da177e4
LT
2788 dst_release(xchg(&skb->dst, NULL));
2789 rcu_read_unlock_bh();
2790 goto done;
2791 }
2792 dst_release(xchg(&skb->dst, NULL));
2793 }
2794 rcu_read_unlock_bh();
d8c92830 2795 s_idx = 0;
1da177e4
LT
2796 }
2797
2798done:
2799 cb->args[0] = h;
2800 cb->args[1] = idx;
2801 return skb->len;
2802}
2803
2804void ip_rt_multicast_event(struct in_device *in_dev)
2805{
2806 rt_cache_flush(0);
2807}
2808
2809#ifdef CONFIG_SYSCTL
2810static int flush_delay;
2811
2812static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2813 struct file *filp, void __user *buffer,
2814 size_t *lenp, loff_t *ppos)
2815{
2816 if (write) {
2817 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2818 rt_cache_flush(flush_delay);
2819 return 0;
e905a9ed 2820 }
1da177e4
LT
2821
2822 return -EINVAL;
2823}
2824
2825static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2826 int __user *name,
2827 int nlen,
2828 void __user *oldval,
2829 size_t __user *oldlenp,
2830 void __user *newval,
1f29bcd7 2831 size_t newlen)
1da177e4
LT
2832{
2833 int delay;
2834 if (newlen != sizeof(int))
2835 return -EINVAL;
2836 if (get_user(delay, (int __user *)newval))
e905a9ed
YH
2837 return -EFAULT;
2838 rt_cache_flush(delay);
1da177e4
LT
2839 return 0;
2840}
2841
2842ctl_table ipv4_route_table[] = {
e905a9ed 2843 {
1da177e4
LT
2844 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2845 .procname = "flush",
2846 .data = &flush_delay,
2847 .maxlen = sizeof(int),
7e3e0360 2848 .mode = 0200,
1da177e4
LT
2849 .proc_handler = &ipv4_sysctl_rtcache_flush,
2850 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2851 },
2852 {
2853 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2854 .procname = "min_delay",
2855 .data = &ip_rt_min_delay,
2856 .maxlen = sizeof(int),
2857 .mode = 0644,
2858 .proc_handler = &proc_dointvec_jiffies,
2859 .strategy = &sysctl_jiffies,
2860 },
2861 {
2862 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2863 .procname = "max_delay",
2864 .data = &ip_rt_max_delay,
2865 .maxlen = sizeof(int),
2866 .mode = 0644,
2867 .proc_handler = &proc_dointvec_jiffies,
2868 .strategy = &sysctl_jiffies,
2869 },
2870 {
2871 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2872 .procname = "gc_thresh",
2873 .data = &ipv4_dst_ops.gc_thresh,
2874 .maxlen = sizeof(int),
2875 .mode = 0644,
2876 .proc_handler = &proc_dointvec,
2877 },
2878 {
2879 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2880 .procname = "max_size",
2881 .data = &ip_rt_max_size,
2882 .maxlen = sizeof(int),
2883 .mode = 0644,
2884 .proc_handler = &proc_dointvec,
2885 },
2886 {
2887 /* Deprecated. Use gc_min_interval_ms */
e905a9ed 2888
1da177e4
LT
2889 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2890 .procname = "gc_min_interval",
2891 .data = &ip_rt_gc_min_interval,
2892 .maxlen = sizeof(int),
2893 .mode = 0644,
2894 .proc_handler = &proc_dointvec_jiffies,
2895 .strategy = &sysctl_jiffies,
2896 },
2897 {
2898 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2899 .procname = "gc_min_interval_ms",
2900 .data = &ip_rt_gc_min_interval,
2901 .maxlen = sizeof(int),
2902 .mode = 0644,
2903 .proc_handler = &proc_dointvec_ms_jiffies,
2904 .strategy = &sysctl_ms_jiffies,
2905 },
2906 {
2907 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2908 .procname = "gc_timeout",
2909 .data = &ip_rt_gc_timeout,
2910 .maxlen = sizeof(int),
2911 .mode = 0644,
2912 .proc_handler = &proc_dointvec_jiffies,
2913 .strategy = &sysctl_jiffies,
2914 },
2915 {
2916 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2917 .procname = "gc_interval",
2918 .data = &ip_rt_gc_interval,
2919 .maxlen = sizeof(int),
2920 .mode = 0644,
2921 .proc_handler = &proc_dointvec_jiffies,
2922 .strategy = &sysctl_jiffies,
2923 },
2924 {
2925 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2926 .procname = "redirect_load",
2927 .data = &ip_rt_redirect_load,
2928 .maxlen = sizeof(int),
2929 .mode = 0644,
2930 .proc_handler = &proc_dointvec,
2931 },
2932 {
2933 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2934 .procname = "redirect_number",
2935 .data = &ip_rt_redirect_number,
2936 .maxlen = sizeof(int),
2937 .mode = 0644,
2938 .proc_handler = &proc_dointvec,
2939 },
2940 {
2941 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2942 .procname = "redirect_silence",
2943 .data = &ip_rt_redirect_silence,
2944 .maxlen = sizeof(int),
2945 .mode = 0644,
2946 .proc_handler = &proc_dointvec,
2947 },
2948 {
2949 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2950 .procname = "error_cost",
2951 .data = &ip_rt_error_cost,
2952 .maxlen = sizeof(int),
2953 .mode = 0644,
2954 .proc_handler = &proc_dointvec,
2955 },
2956 {
2957 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2958 .procname = "error_burst",
2959 .data = &ip_rt_error_burst,
2960 .maxlen = sizeof(int),
2961 .mode = 0644,
2962 .proc_handler = &proc_dointvec,
2963 },
2964 {
2965 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2966 .procname = "gc_elasticity",
2967 .data = &ip_rt_gc_elasticity,
2968 .maxlen = sizeof(int),
2969 .mode = 0644,
2970 .proc_handler = &proc_dointvec,
2971 },
2972 {
2973 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2974 .procname = "mtu_expires",
2975 .data = &ip_rt_mtu_expires,
2976 .maxlen = sizeof(int),
2977 .mode = 0644,
2978 .proc_handler = &proc_dointvec_jiffies,
2979 .strategy = &sysctl_jiffies,
2980 },
2981 {
2982 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2983 .procname = "min_pmtu",
2984 .data = &ip_rt_min_pmtu,
2985 .maxlen = sizeof(int),
2986 .mode = 0644,
2987 .proc_handler = &proc_dointvec,
2988 },
2989 {
2990 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2991 .procname = "min_adv_mss",
2992 .data = &ip_rt_min_advmss,
2993 .maxlen = sizeof(int),
2994 .mode = 0644,
2995 .proc_handler = &proc_dointvec,
2996 },
2997 {
2998 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
2999 .procname = "secret_interval",
3000 .data = &ip_rt_secret_interval,
3001 .maxlen = sizeof(int),
3002 .mode = 0644,
3003 .proc_handler = &proc_dointvec_jiffies,
3004 .strategy = &sysctl_jiffies,
3005 },
3006 { .ctl_name = 0 }
3007};
3008#endif
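/*
 * [Editor's sketch, not part of route.c] The "flush" entry above is
 * write-only (mode 0200) and is served by ipv4_sysctl_rtcache_flush(),
 * which calls rt_cache_flush() with the written delay.  A minimal
 * userspace sketch, assuming root and procfs mounted at /proc: writing
 * "0" requests an immediate flush of the routing cache.
 */
#include <fcntl.h>
#include <unistd.h>

static int flush_route_cache(void)
{
	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, "0\n", 2) != 2) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}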
3009
3010#ifdef CONFIG_NET_CLS_ROUTE
8dbde28d 3011struct ip_rt_acct *ip_rt_acct __read_mostly;
1da177e4
LT
3012#endif /* CONFIG_NET_CLS_ROUTE */
3013
3014static __initdata unsigned long rhash_entries;
3015static int __init set_rhash_entries(char *str)
3016{
3017 if (!str)
3018 return 0;
3019 rhash_entries = simple_strtoul(str, &str, 0);
3020 return 1;
3021}
3022__setup("rhash_entries=", set_rhash_entries);
3023
3024int __init ip_rt_init(void)
3025{
424c4b70 3026 int rc = 0;
1da177e4
LT
3027
3028 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3029 (jiffies ^ (jiffies >> 7)));
3030
3031#ifdef CONFIG_NET_CLS_ROUTE
8dbde28d 3032 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
1da177e4
LT
3033 if (!ip_rt_acct)
3034 panic("IP: failed to allocate ip_rt_acct\n");
1da177e4
LT
3035#endif
3036
e5d679f3
AD
3037 ipv4_dst_ops.kmem_cachep =
3038 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 3039 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 3040
14e50e57
DM
3041 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3042
424c4b70
ED
3043 rt_hash_table = (struct rt_hash_bucket *)
3044 alloc_large_system_hash("IP route cache",
3045 sizeof(struct rt_hash_bucket),
3046 rhash_entries,
3047 (num_physpages >= 128 * 1024) ?
18955cfc 3048 15 : 17,
8d1502de 3049 0,
424c4b70
ED
3050 &rt_hash_log,
3051 &rt_hash_mask,
3052 0);
22c047cc
ED
3053 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3054 rt_hash_lock_init();
1da177e4
LT
3055
3056 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3057 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3058
1da177e4
LT
3059 devinet_init();
3060 ip_fib_init();
3061
b24b8a24
PE
3062 setup_timer(&rt_flush_timer, rt_run_flush, 0);
3063 setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
1da177e4
LT
3064
3065 /* All the timers started at system startup tend
3066 to synchronize. Perturb them a bit.
3067 */
39c90ece
ED
3068 schedule_delayed_work(&expires_work,
3069 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
1da177e4
LT
3070
3071 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3072 ip_rt_secret_interval;
3073 add_timer(&rt_secret_timer);
3074
107f1634
PE
3075 if (ip_rt_proc_init(&init_net))
3076 printk(KERN_ERR "Unable to create route proc files\n");
1da177e4
LT
3077#ifdef CONFIG_XFRM
3078 xfrm_init();
3079 xfrm4_init();
3080#endif
63f3444f
TG
3081 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3082
1da177e4
LT
3083 return rc;
3084}
3085
3086EXPORT_SYMBOL(__ip_select_ident);
3087EXPORT_SYMBOL(ip_route_input);
3088EXPORT_SYMBOL(ip_route_output_key);