[net-next-2.6.git] / net / ipv4 / route.c
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9 *
02c30a84 10 * Authors: Ross Biro
1da177e4
LT
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 *
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
e905a9ed 23 * Alan Cox : Super /proc >4K
1da177e4
LT
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
e905a9ed 41 *
1da177e4
LT
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
58 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
60 *
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
65 */
66
1da177e4
LT
67#include <linux/module.h>
68#include <asm/uaccess.h>
69#include <asm/system.h>
70#include <linux/bitops.h>
71#include <linux/types.h>
72#include <linux/kernel.h>
1da177e4 73#include <linux/mm.h>
424c4b70 74#include <linux/bootmem.h>
1da177e4
LT
75#include <linux/string.h>
76#include <linux/socket.h>
77#include <linux/sockios.h>
78#include <linux/errno.h>
79#include <linux/in.h>
80#include <linux/inet.h>
81#include <linux/netdevice.h>
82#include <linux/proc_fs.h>
83#include <linux/init.h>
39c90ece 84#include <linux/workqueue.h>
1da177e4 85#include <linux/skbuff.h>
1da177e4
LT
86#include <linux/inetdevice.h>
87#include <linux/igmp.h>
88#include <linux/pkt_sched.h>
89#include <linux/mroute.h>
90#include <linux/netfilter_ipv4.h>
91#include <linux/random.h>
92#include <linux/jhash.h>
93#include <linux/rcupdate.h>
94#include <linux/times.h>
352e512c 95#include <net/dst.h>
457c4cbc 96#include <net/net_namespace.h>
1da177e4
LT
97#include <net/protocol.h>
98#include <net/ip.h>
99#include <net/route.h>
100#include <net/inetpeer.h>
101#include <net/sock.h>
102#include <net/ip_fib.h>
103#include <net/arp.h>
104#include <net/tcp.h>
105#include <net/icmp.h>
106#include <net/xfrm.h>
8d71740c 107#include <net/netevent.h>
63f3444f 108#include <net/rtnetlink.h>
1da177e4
LT
109#ifdef CONFIG_SYSCTL
110#include <linux/sysctl.h>
111#endif
112
113#define RT_FL_TOS(oldflp) \
114 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116#define IP_MAX_MTU 0xFFF0
117
118#define RT_GC_TIMEOUT (300*HZ)
119
1da177e4 120static int ip_rt_max_size;
817bc4db
SH
121static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
122static int ip_rt_gc_interval __read_mostly = 60 * HZ;
123static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
124static int ip_rt_redirect_number __read_mostly = 9;
125static int ip_rt_redirect_load __read_mostly = HZ / 50;
126static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127static int ip_rt_error_cost __read_mostly = HZ;
128static int ip_rt_error_burst __read_mostly = 5 * HZ;
129static int ip_rt_gc_elasticity __read_mostly = 8;
130static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
131static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
132static int ip_rt_min_advmss __read_mostly = 256;
133static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
1da177e4 134
beb659bd
ED
135static void rt_worker_func(struct work_struct *work);
136static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
1da177e4
LT
137static struct timer_list rt_secret_timer;
138
139/*
140 * Interface to generic destination cache.
141 */
142
143static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
144static void ipv4_dst_destroy(struct dst_entry *dst);
145static void ipv4_dst_ifdown(struct dst_entry *dst,
146 struct net_device *dev, int how);
147static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
148static void ipv4_link_failure(struct sk_buff *skb);
149static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
569d3645 150static int rt_garbage_collect(struct dst_ops *ops);
1da177e4
LT
151
152
153static struct dst_ops ipv4_dst_ops = {
154 .family = AF_INET,
155 .protocol = __constant_htons(ETH_P_IP),
156 .gc = rt_garbage_collect,
157 .check = ipv4_dst_check,
158 .destroy = ipv4_dst_destroy,
159 .ifdown = ipv4_dst_ifdown,
160 .negative_advice = ipv4_negative_advice,
161 .link_failure = ipv4_link_failure,
162 .update_pmtu = ip_rt_update_pmtu,
862b82c6 163 .local_out = ip_local_out,
1da177e4 164 .entry_size = sizeof(struct rtable),
e2422970 165 .entries = ATOMIC_INIT(0),
1da177e4
LT
166};
167
168#define ECN_OR_COST(class) TC_PRIO_##class
169
4839c52b 170const __u8 ip_tos2prio[16] = {
1da177e4
LT
171 TC_PRIO_BESTEFFORT,
172 ECN_OR_COST(FILLER),
173 TC_PRIO_BESTEFFORT,
174 ECN_OR_COST(BESTEFFORT),
175 TC_PRIO_BULK,
176 ECN_OR_COST(BULK),
177 TC_PRIO_BULK,
178 ECN_OR_COST(BULK),
179 TC_PRIO_INTERACTIVE,
180 ECN_OR_COST(INTERACTIVE),
181 TC_PRIO_INTERACTIVE,
182 ECN_OR_COST(INTERACTIVE),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK),
185 TC_PRIO_INTERACTIVE_BULK,
186 ECN_OR_COST(INTERACTIVE_BULK)
187};
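/*
 * How this table is meant to be consumed (a sketch; the real helper is
 * rt_tos2priority() in include/net/route.h and may differ in detail):
 *
 *	static inline char rt_tos2priority(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 *
 * i.e. the four TOS bits of the IP header select a TC_PRIO_* band, with
 * the odd ECN_OR_COST() entries covering the low-order TOS bit.
 */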
188
189
190/*
191 * Route cache.
192 */
193
194/* The locking scheme is rather straightforward:
195 *
196 * 1) Read-Copy Update protects the buckets of the central route hash.
197 * 2) Only writers remove entries, and they hold the lock
198 * as they look at rtable reference counts.
199 * 3) Only readers acquire references to rtable entries,
200 * they do so with atomic increments and with the
201 * lock held.
202 */
203
204struct rt_hash_bucket {
205 struct rtable *chain;
22c047cc 206};
8a25d5de
IM
207#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
208 defined(CONFIG_PROVE_LOCKING)
22c047cc
ED
209/*
210 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
211 * The size of this table is a power of two and depends on the number of CPUs.
62051200 212 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
22c047cc 213 */
62051200
IM
214#ifdef CONFIG_LOCKDEP
215# define RT_HASH_LOCK_SZ 256
22c047cc 216#else
62051200
IM
217# if NR_CPUS >= 32
218# define RT_HASH_LOCK_SZ 4096
219# elif NR_CPUS >= 16
220# define RT_HASH_LOCK_SZ 2048
221# elif NR_CPUS >= 8
222# define RT_HASH_LOCK_SZ 1024
223# elif NR_CPUS >= 4
224# define RT_HASH_LOCK_SZ 512
225# else
226# define RT_HASH_LOCK_SZ 256
227# endif
22c047cc
ED
228#endif
229
230static spinlock_t *rt_hash_locks;
231# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
1ff1cc20
PE
232
233static __init void rt_hash_lock_init(void)
234{
235 int i;
236
237 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
238 GFP_KERNEL);
239 if (!rt_hash_locks)
240 panic("IP: failed to allocate rt_hash_locks\n");
241
242 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
243 spin_lock_init(&rt_hash_locks[i]);
244}
22c047cc
ED
245#else
246# define rt_hash_lock_addr(slot) NULL
1ff1cc20
PE
247
248static inline void rt_hash_lock_init(void)
249{
250}
22c047cc 251#endif
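/*
 * Usage sketch: writers serialize per-bucket updates by taking the shared
 * spinlock that rt_hash_lock_addr() maps a bucket index to, e.g.
 *
 *	spin_lock_bh(rt_hash_lock_addr(hash));
 *	... unlink/insert entries in rt_hash_table[hash].chain ...
 *	spin_unlock_bh(rt_hash_lock_addr(hash));
 *
 * as done in rt_intern_hash(), rt_del(), rt_check_expire() and
 * rt_garbage_collect() below.  Without SMP or lock debugging no lock
 * table is allocated and rt_hash_lock_addr() is simply NULL.
 */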
1da177e4 252
817bc4db
SH
253static struct rt_hash_bucket *rt_hash_table __read_mostly;
254static unsigned rt_hash_mask __read_mostly;
255static unsigned int rt_hash_log __read_mostly;
256static atomic_t rt_genid __read_mostly;
1da177e4 257
2f970d83 258static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
dbd2915c 259#define RT_CACHE_STAT_INC(field) \
bfe5d834 260 (__raw_get_cpu_var(rt_cache_stat).field++)
1da177e4 261
cef2685e 262static unsigned int rt_hash_code(u32 daddr, u32 saddr)
1da177e4 263{
29e75252
ED
264 return jhash_2words(daddr, saddr, atomic_read(&rt_genid))
265 & rt_hash_mask;
1da177e4
LT
266}
267
8c7bc840
AV
268#define rt_hash(daddr, saddr, idx) \
269 rt_hash_code((__force u32)(__be32)(daddr),\
270 (__force u32)(__be32)(saddr) ^ ((idx) << 5))
271
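/*
 * Note that rt_hash_code() mixes rt_genid into the jhash above, so every
 * bump of the generation counter (see rt_cache_invalidate() below) also
 * re-spreads new entries across different buckets while the stale ones
 * are garbage-collected.
 */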
1da177e4
LT
272#ifdef CONFIG_PROC_FS
273struct rt_cache_iter_state {
a75e936f 274 struct seq_net_private p;
1da177e4 275 int bucket;
29e75252 276 int genid;
1da177e4
LT
277};
278
1218854a 279static struct rtable *rt_cache_get_first(struct seq_file *seq)
1da177e4 280{
1218854a 281 struct rt_cache_iter_state *st = seq->private;
1da177e4 282 struct rtable *r = NULL;
1da177e4
LT
283
284 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
285 rcu_read_lock_bh();
29e75252
ED
286 r = rcu_dereference(rt_hash_table[st->bucket].chain);
287 while (r) {
1218854a 288 if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
a75e936f 289 r->rt_genid == st->genid)
29e75252
ED
290 return r;
291 r = rcu_dereference(r->u.dst.rt_next);
292 }
1da177e4
LT
293 rcu_read_unlock_bh();
294 }
29e75252 295 return r;
1da177e4
LT
296}
297
1218854a 298static struct rtable *__rt_cache_get_next(struct seq_file *seq,
642d6318 299 struct rtable *r)
1da177e4 300{
1218854a 301 struct rt_cache_iter_state *st = seq->private;
093c2ca4 302 r = r->u.dst.rt_next;
1da177e4
LT
303 while (!r) {
304 rcu_read_unlock_bh();
305 if (--st->bucket < 0)
306 break;
307 rcu_read_lock_bh();
308 r = rt_hash_table[st->bucket].chain;
309 }
0bcceadc 310 return rcu_dereference(r);
1da177e4
LT
311}
312
1218854a 313static struct rtable *rt_cache_get_next(struct seq_file *seq,
642d6318
DL
314 struct rtable *r)
315{
1218854a
YH
316 struct rt_cache_iter_state *st = seq->private;
317 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
318 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
a75e936f 319 continue;
642d6318
DL
320 if (r->rt_genid == st->genid)
321 break;
322 }
323 return r;
324}
325
1218854a 326static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
1da177e4 327{
1218854a 328 struct rtable *r = rt_cache_get_first(seq);
1da177e4
LT
329
330 if (r)
1218854a 331 while (pos && (r = rt_cache_get_next(seq, r)))
1da177e4
LT
332 --pos;
333 return pos ? NULL : r;
334}
335
336static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
337{
29e75252 338 struct rt_cache_iter_state *st = seq->private;
29e75252 339 if (*pos)
1218854a 340 return rt_cache_get_idx(seq, *pos - 1);
29e75252
ED
341 st->genid = atomic_read(&rt_genid);
342 return SEQ_START_TOKEN;
1da177e4
LT
343}
344
345static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
346{
29e75252 347 struct rtable *r;
1da177e4
LT
348
349 if (v == SEQ_START_TOKEN)
1218854a 350 r = rt_cache_get_first(seq);
1da177e4 351 else
1218854a 352 r = rt_cache_get_next(seq, v);
1da177e4
LT
353 ++*pos;
354 return r;
355}
356
357static void rt_cache_seq_stop(struct seq_file *seq, void *v)
358{
359 if (v && v != SEQ_START_TOKEN)
360 rcu_read_unlock_bh();
361}
362
363static int rt_cache_seq_show(struct seq_file *seq, void *v)
364{
365 if (v == SEQ_START_TOKEN)
366 seq_printf(seq, "%-127s\n",
367 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
368 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
369 "HHUptod\tSpecDst");
370 else {
371 struct rtable *r = v;
372 char temp[256];
373
374 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
375 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
376 r->u.dst.dev ? r->u.dst.dev->name : "*",
377 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
378 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
379 r->u.dst.__use, 0, (unsigned long)r->rt_src,
380 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
381 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
382 dst_metric(&r->u.dst, RTAX_WINDOW),
383 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
384 dst_metric(&r->u.dst, RTAX_RTTVAR)),
385 r->fl.fl4_tos,
386 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
387 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
388 dev_queue_xmit) : 0,
389 r->rt_spec_dst);
390 seq_printf(seq, "%-127s\n", temp);
e905a9ed
YH
391 }
392 return 0;
1da177e4
LT
393}
394
f690808e 395static const struct seq_operations rt_cache_seq_ops = {
1da177e4
LT
396 .start = rt_cache_seq_start,
397 .next = rt_cache_seq_next,
398 .stop = rt_cache_seq_stop,
399 .show = rt_cache_seq_show,
400};
401
402static int rt_cache_seq_open(struct inode *inode, struct file *file)
403{
a75e936f 404 return seq_open_net(inode, file, &rt_cache_seq_ops,
cf7732e4 405 sizeof(struct rt_cache_iter_state));
1da177e4
LT
406}
407
9a32144e 408static const struct file_operations rt_cache_seq_fops = {
1da177e4
LT
409 .owner = THIS_MODULE,
410 .open = rt_cache_seq_open,
411 .read = seq_read,
412 .llseek = seq_lseek,
a75e936f 413 .release = seq_release_net,
1da177e4
LT
414};
415
416
417static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
418{
419 int cpu;
420
421 if (*pos == 0)
422 return SEQ_START_TOKEN;
423
424 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
425 if (!cpu_possible(cpu))
426 continue;
427 *pos = cpu+1;
2f970d83 428 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
429 }
430 return NULL;
431}
432
433static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
434{
435 int cpu;
436
437 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
438 if (!cpu_possible(cpu))
439 continue;
440 *pos = cpu+1;
2f970d83 441 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
442 }
443 return NULL;
e905a9ed 444
1da177e4
LT
445}
446
447static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
448{
449
450}
451
452static int rt_cpu_seq_show(struct seq_file *seq, void *v)
453{
454 struct rt_cache_stat *st = v;
455
456 if (v == SEQ_START_TOKEN) {
5bec0039 457 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
1da177e4
LT
458 return 0;
459 }
e905a9ed 460
1da177e4
LT
461 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
462 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
463 atomic_read(&ipv4_dst_ops.entries),
464 st->in_hit,
465 st->in_slow_tot,
466 st->in_slow_mc,
467 st->in_no_route,
468 st->in_brd,
469 st->in_martian_dst,
470 st->in_martian_src,
471
472 st->out_hit,
473 st->out_slow_tot,
e905a9ed 474 st->out_slow_mc,
1da177e4
LT
475
476 st->gc_total,
477 st->gc_ignored,
478 st->gc_goal_miss,
479 st->gc_dst_overflow,
480 st->in_hlist_search,
481 st->out_hlist_search
482 );
483 return 0;
484}
485
f690808e 486static const struct seq_operations rt_cpu_seq_ops = {
1da177e4
LT
487 .start = rt_cpu_seq_start,
488 .next = rt_cpu_seq_next,
489 .stop = rt_cpu_seq_stop,
490 .show = rt_cpu_seq_show,
491};
492
493
494static int rt_cpu_seq_open(struct inode *inode, struct file *file)
495{
496 return seq_open(file, &rt_cpu_seq_ops);
497}
498
9a32144e 499static const struct file_operations rt_cpu_seq_fops = {
1da177e4
LT
500 .owner = THIS_MODULE,
501 .open = rt_cpu_seq_open,
502 .read = seq_read,
503 .llseek = seq_lseek,
504 .release = seq_release,
505};
506
78c686e9
PE
507#ifdef CONFIG_NET_CLS_ROUTE
508static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
509 int length, int *eof, void *data)
510{
511 unsigned int i;
512
513 if ((offset & 3) || (length & 3))
514 return -EIO;
515
516 if (offset >= sizeof(struct ip_rt_acct) * 256) {
517 *eof = 1;
518 return 0;
519 }
520
521 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
522 length = sizeof(struct ip_rt_acct) * 256 - offset;
523 *eof = 1;
524 }
525
526 offset /= sizeof(u32);
527
528 if (length > 0) {
529 u32 *dst = (u32 *) buffer;
530
531 *start = buffer;
532 memset(dst, 0, length);
533
534 for_each_possible_cpu(i) {
535 unsigned int j;
536 u32 *src;
537
538 src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
539 for (j = 0; j < length/4; j++)
540 dst[j] += src[j];
541 }
542 }
543 return length;
544}
545#endif
107f1634 546
73b38711 547static int __net_init ip_rt_do_proc_init(struct net *net)
107f1634
PE
548{
549 struct proc_dir_entry *pde;
550
551 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
552 &rt_cache_seq_fops);
553 if (!pde)
554 goto err1;
555
77020720
WC
556 pde = proc_create("rt_cache", S_IRUGO,
557 net->proc_net_stat, &rt_cpu_seq_fops);
107f1634
PE
558 if (!pde)
559 goto err2;
560
107f1634
PE
561#ifdef CONFIG_NET_CLS_ROUTE
562 pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
563 ip_rt_acct_read, NULL);
564 if (!pde)
565 goto err3;
566#endif
567 return 0;
568
569#ifdef CONFIG_NET_CLS_ROUTE
570err3:
571 remove_proc_entry("rt_cache", net->proc_net_stat);
572#endif
573err2:
574 remove_proc_entry("rt_cache", net->proc_net);
575err1:
576 return -ENOMEM;
577}
73b38711
DL
578
579static void __net_exit ip_rt_do_proc_exit(struct net *net)
580{
581 remove_proc_entry("rt_cache", net->proc_net_stat);
582 remove_proc_entry("rt_cache", net->proc_net);
583 remove_proc_entry("rt_acct", net->proc_net);
584}
585
586static struct pernet_operations ip_rt_proc_ops __net_initdata = {
587 .init = ip_rt_do_proc_init,
588 .exit = ip_rt_do_proc_exit,
589};
590
591static int __init ip_rt_proc_init(void)
592{
593 return register_pernet_subsys(&ip_rt_proc_ops);
594}
595
107f1634 596#else
73b38711 597static inline int ip_rt_proc_init(void)
107f1634
PE
598{
599 return 0;
600}
1da177e4 601#endif /* CONFIG_PROC_FS */
e905a9ed 602
1da177e4
LT
603static __inline__ void rt_free(struct rtable *rt)
604{
1da177e4
LT
605 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
606}
607
608static __inline__ void rt_drop(struct rtable *rt)
609{
1da177e4
LT
610 ip_rt_put(rt);
611 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
612}
613
614static __inline__ int rt_fast_clean(struct rtable *rth)
615{
616 /* Kill broadcast/multicast entries very aggressively if they
617 collide in the hash table with more useful entries */
618 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
093c2ca4 619 rth->fl.iif && rth->u.dst.rt_next;
1da177e4
LT
620}
621
622static __inline__ int rt_valuable(struct rtable *rth)
623{
624 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
625 rth->u.dst.expires;
626}
627
628static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
629{
630 unsigned long age;
631 int ret = 0;
632
633 if (atomic_read(&rth->u.dst.__refcnt))
634 goto out;
635
636 ret = 1;
637 if (rth->u.dst.expires &&
638 time_after_eq(jiffies, rth->u.dst.expires))
639 goto out;
640
641 age = jiffies - rth->u.dst.lastuse;
642 ret = 0;
643 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
644 (age <= tmo2 && rt_valuable(rth)))
645 goto out;
646 ret = 1;
647out: return ret;
648}
649
650/* Bits of score are:
651 * 31: very valuable
652 * 30: not quite useless
653 * 29..0: usage counter
654 */
655static inline u32 rt_score(struct rtable *rt)
656{
657 u32 score = jiffies - rt->u.dst.lastuse;
658
659 score = ~score & ~(3<<30);
660
661 if (rt_valuable(rt))
662 score |= (1<<31);
663
664 if (!rt->fl.iif ||
665 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
666 score |= (1<<30);
667
668 return score;
669}
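/*
 * How the score is consumed (see rt_intern_hash() below): when a hash
 * chain grows beyond ip_rt_gc_elasticity, the unreferenced entry with the
 * lowest score is evicted.  Since the age is bitwise-inverted, older
 * entries score lower, and entries lacking bit 31 (not "valuable") or
 * bit 30 (input-only broadcast/multicast/local routes) go first.
 */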
670
671static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
672{
714e85be
AV
673 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
674 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
47dcf0cb 675 (fl1->mark ^ fl2->mark) |
8238b218
DM
676 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
677 *(u16 *)&fl2->nl_u.ip4_u.tos) |
678 (fl1->oif ^ fl2->oif) |
679 (fl1->iif ^ fl2->iif)) == 0;
1da177e4
LT
680}
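/*
 * compare_keys() ORs together the XOR of every flow field (daddr, saddr,
 * mark, the 16 bits starting at tos, oif and iif), so any difference
 * leaves a non-zero value; comparing the result against 0 gives a single
 * branch-free equality test over the whole key.
 */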
681
b5921910
DL
682static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
683{
c346dca1 684 return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
b5921910
DL
685}
686
beb659bd
ED
687/*
688 * Perform a full scan of hash table and free all entries.
689 * Can be called by a softirq or a process.
690 * In the latter case, we want to be rescheduled if necessary.
691 */
692static void rt_do_flush(int process_context)
693{
694 unsigned int i;
695 struct rtable *rth, *next;
696
697 for (i = 0; i <= rt_hash_mask; i++) {
698 if (process_context && need_resched())
699 cond_resched();
700 rth = rt_hash_table[i].chain;
701 if (!rth)
702 continue;
703
704 spin_lock_bh(rt_hash_lock_addr(i));
705 rth = rt_hash_table[i].chain;
706 rt_hash_table[i].chain = NULL;
707 spin_unlock_bh(rt_hash_lock_addr(i));
708
709 for (; rth; rth = next) {
710 next = rth->u.dst.rt_next;
711 rt_free(rth);
712 }
713 }
714}
715
716static void rt_check_expire(void)
1da177e4 717{
bb1d23b0
ED
718 static unsigned int rover;
719 unsigned int i = rover, goal;
1da177e4 720 struct rtable *rth, **rthp;
bb1d23b0
ED
721 u64 mult;
722
723 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
724 if (ip_rt_gc_timeout > 1)
725 do_div(mult, ip_rt_gc_timeout);
726 goal = (unsigned int)mult;
39c90ece
ED
727 if (goal > rt_hash_mask)
728 goal = rt_hash_mask + 1;
bb1d23b0 729 for (; goal > 0; goal--) {
1da177e4
LT
730 unsigned long tmo = ip_rt_gc_timeout;
731
732 i = (i + 1) & rt_hash_mask;
733 rthp = &rt_hash_table[i].chain;
734
d90bf5a9
ED
735 if (need_resched())
736 cond_resched();
737
cfcabdcc 738 if (*rthp == NULL)
bb1d23b0 739 continue;
39c90ece 740 spin_lock_bh(rt_hash_lock_addr(i));
1da177e4 741 while ((rth = *rthp) != NULL) {
29e75252
ED
742 if (rth->rt_genid != atomic_read(&rt_genid)) {
743 *rthp = rth->u.dst.rt_next;
744 rt_free(rth);
745 continue;
746 }
1da177e4
LT
747 if (rth->u.dst.expires) {
748 /* Entry is expired even if it is in use */
39c90ece 749 if (time_before_eq(jiffies, rth->u.dst.expires)) {
1da177e4 750 tmo >>= 1;
093c2ca4 751 rthp = &rth->u.dst.rt_next;
1da177e4
LT
752 continue;
753 }
754 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
755 tmo >>= 1;
093c2ca4 756 rthp = &rth->u.dst.rt_next;
1da177e4
LT
757 continue;
758 }
759
760 /* Cleanup aged off entries. */
093c2ca4 761 *rthp = rth->u.dst.rt_next;
e905a9ed 762 rt_free(rth);
1da177e4 763 }
39c90ece 764 spin_unlock_bh(rt_hash_lock_addr(i));
1da177e4
LT
765 }
766 rover = i;
beb659bd
ED
767}
768
769/*
770 * rt_worker_func() is run in process context.
29e75252 771 * we call rt_check_expire() to scan part of the hash table
beb659bd
ED
772 */
773static void rt_worker_func(struct work_struct *work)
774{
29e75252 775 rt_check_expire();
39c90ece 776 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
1da177e4
LT
777}
778
29e75252
ED
779/*
780 * Perturbation of rt_genid by a small quantity [1..256].
781 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
782 * many times (2^24) without repeating a recent rt_genid.
783 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
1da177e4 784 */
29e75252 785static void rt_cache_invalidate(void)
1da177e4 786{
29e75252 787 unsigned char shuffle;
1da177e4 788
29e75252
ED
789 get_random_bytes(&shuffle, sizeof(shuffle));
790 atomic_add(shuffle + 1U, &rt_genid);
1da177e4
LT
791}
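/*
 * Nothing is freed here: every cached rtable carries the rt_genid it was
 * created under, and the lookup/insert paths (rt_intern_hash(), rt_del())
 * as well as the expiry scan (rt_check_expire()) drop any entry whose
 * rt_genid no longer matches atomic_read(&rt_genid).  Bumping the
 * generation therefore invalidates the whole cache lazily, without
 * walking the hash table.
 */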
792
29e75252
ED
793/*
794 * delay < 0 : invalidate cache (fast : entries will be deleted later)
795 * delay >= 0 : invalidate & flush cache (can be long)
796 */
1da177e4
LT
797void rt_cache_flush(int delay)
798{
29e75252
ED
799 rt_cache_invalidate();
800 if (delay >= 0)
801 rt_do_flush(!in_softirq());
1da177e4
LT
802}
803
beb659bd 804/*
29e75252 805 * We change rt_genid and let gc do the cleanup
beb659bd 806 */
1da177e4
LT
807static void rt_secret_rebuild(unsigned long dummy)
808{
29e75252 809 rt_cache_invalidate();
beb659bd 810 mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
1da177e4
LT
811}
812
813/*
814 Short description of GC goals.
815
816 We want to build an algorithm which keeps the routing cache
817 at some equilibrium point, where the number of aged-off entries
818 is kept approximately equal to the number of newly generated ones.
819
820 Current expiration strength is variable "expire".
821 We try to adjust it dynamically, so that if networking
822 is idle, expire is large enough to keep enough warm entries,
823 and when load increases it is reduced to limit the cache size.
824 */
825
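/*
 * Rough shape of the computation below: the initial goal is
 * entries - (ip_rt_gc_elasticity << rt_hash_log); if that is not positive
 * we instead aim at an "equilibrium" number of entries (at least
 * gc_thresh).  Every pass that misses its goal halves "expire", making
 * expiry more aggressive, while work_done grows "expire" again by
 * ip_rt_gc_min_interval, capped at ip_rt_gc_timeout.
 */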
569d3645 826static int rt_garbage_collect(struct dst_ops *ops)
1da177e4
LT
827{
828 static unsigned long expire = RT_GC_TIMEOUT;
829 static unsigned long last_gc;
830 static int rover;
831 static int equilibrium;
832 struct rtable *rth, **rthp;
833 unsigned long now = jiffies;
834 int goal;
835
836 /*
837 * Garbage collection is pretty expensive,
838 * do not make it too frequently.
839 */
840
841 RT_CACHE_STAT_INC(gc_total);
842
843 if (now - last_gc < ip_rt_gc_min_interval &&
844 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
845 RT_CACHE_STAT_INC(gc_ignored);
846 goto out;
847 }
848
849 /* Calculate number of entries, which we want to expire now. */
850 goal = atomic_read(&ipv4_dst_ops.entries) -
851 (ip_rt_gc_elasticity << rt_hash_log);
852 if (goal <= 0) {
853 if (equilibrium < ipv4_dst_ops.gc_thresh)
854 equilibrium = ipv4_dst_ops.gc_thresh;
855 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
856 if (goal > 0) {
b790cedd 857 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1da177e4
LT
858 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
859 }
860 } else {
861 /* We are in dangerous area. Try to reduce cache really
862 * aggressively.
863 */
b790cedd 864 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1da177e4
LT
865 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
866 }
867
868 if (now - last_gc >= ip_rt_gc_min_interval)
869 last_gc = now;
870
871 if (goal <= 0) {
872 equilibrium += goal;
873 goto work_done;
874 }
875
876 do {
877 int i, k;
878
879 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
880 unsigned long tmo = expire;
881
882 k = (k + 1) & rt_hash_mask;
883 rthp = &rt_hash_table[k].chain;
22c047cc 884 spin_lock_bh(rt_hash_lock_addr(k));
1da177e4 885 while ((rth = *rthp) != NULL) {
29e75252
ED
886 if (rth->rt_genid == atomic_read(&rt_genid) &&
887 !rt_may_expire(rth, tmo, expire)) {
1da177e4 888 tmo >>= 1;
093c2ca4 889 rthp = &rth->u.dst.rt_next;
1da177e4
LT
890 continue;
891 }
093c2ca4 892 *rthp = rth->u.dst.rt_next;
1da177e4
LT
893 rt_free(rth);
894 goal--;
1da177e4 895 }
22c047cc 896 spin_unlock_bh(rt_hash_lock_addr(k));
1da177e4
LT
897 if (goal <= 0)
898 break;
899 }
900 rover = k;
901
902 if (goal <= 0)
903 goto work_done;
904
905 /* Goal is not achieved. We stop the process if:
906
907 - expire has been reduced to zero; otherwise, expire is halved.
908 - if table is not full.
909 - if we are called from interrupt.
910 - jiffies check is just fallback/debug loop breaker.
911 We will not spin here for a long time in any case.
912 */
913
914 RT_CACHE_STAT_INC(gc_goal_miss);
915
916 if (expire == 0)
917 break;
918
919 expire >>= 1;
920#if RT_CACHE_DEBUG >= 2
921 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
922 atomic_read(&ipv4_dst_ops.entries), goal, i);
923#endif
924
925 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
926 goto out;
927 } while (!in_softirq() && time_before_eq(jiffies, now));
928
929 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
930 goto out;
931 if (net_ratelimit())
932 printk(KERN_WARNING "dst cache overflow\n");
933 RT_CACHE_STAT_INC(gc_dst_overflow);
934 return 1;
935
936work_done:
937 expire += ip_rt_gc_min_interval;
938 if (expire > ip_rt_gc_timeout ||
939 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
940 expire = ip_rt_gc_timeout;
941#if RT_CACHE_DEBUG >= 2
942 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
943 atomic_read(&ipv4_dst_ops.entries), goal, rover);
944#endif
945out: return 0;
946}
947
948static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
949{
950 struct rtable *rth, **rthp;
951 unsigned long now;
952 struct rtable *cand, **candp;
953 u32 min_score;
954 int chain_length;
955 int attempts = !in_softirq();
956
957restart:
958 chain_length = 0;
959 min_score = ~(u32)0;
960 cand = NULL;
961 candp = NULL;
962 now = jiffies;
963
964 rthp = &rt_hash_table[hash].chain;
965
22c047cc 966 spin_lock_bh(rt_hash_lock_addr(hash));
1da177e4 967 while ((rth = *rthp) != NULL) {
29e75252
ED
968 if (rth->rt_genid != atomic_read(&rt_genid)) {
969 *rthp = rth->u.dst.rt_next;
970 rt_free(rth);
971 continue;
972 }
b5921910 973 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1da177e4 974 /* Put it first */
093c2ca4 975 *rthp = rth->u.dst.rt_next;
1da177e4
LT
976 /*
977 * Since lookup is lockfree, the deletion
978 * must be visible to another weakly ordered CPU before
979 * the insertion at the start of the hash chain.
980 */
093c2ca4 981 rcu_assign_pointer(rth->u.dst.rt_next,
1da177e4
LT
982 rt_hash_table[hash].chain);
983 /*
984 * Since lookup is lockfree, the update writes
985 * must be ordered for consistency on SMP.
986 */
987 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
988
03f49f34 989 dst_use(&rth->u.dst, now);
22c047cc 990 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
991
992 rt_drop(rt);
993 *rp = rth;
994 return 0;
995 }
996
997 if (!atomic_read(&rth->u.dst.__refcnt)) {
998 u32 score = rt_score(rth);
999
1000 if (score <= min_score) {
1001 cand = rth;
1002 candp = rthp;
1003 min_score = score;
1004 }
1005 }
1006
1007 chain_length++;
1008
093c2ca4 1009 rthp = &rth->u.dst.rt_next;
1da177e4
LT
1010 }
1011
1012 if (cand) {
1013 /* ip_rt_gc_elasticity used to be average length of chain
1014 * length, when exceeded gc becomes really aggressive.
1015 *
1016 * The second limit is less certain. At the moment it allows
1017 * only 2 entries per bucket. We will see.
1018 */
1019 if (chain_length > ip_rt_gc_elasticity) {
093c2ca4 1020 *candp = cand->u.dst.rt_next;
1da177e4
LT
1021 rt_free(cand);
1022 }
1023 }
1024
1025 /* Try to bind route to arp only if it is output
1026 route or unicast forwarding path.
1027 */
1028 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1029 int err = arp_bind_neighbour(&rt->u.dst);
1030 if (err) {
22c047cc 1031 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1032
1033 if (err != -ENOBUFS) {
1034 rt_drop(rt);
1035 return err;
1036 }
1037
1038 /* Neighbour tables are full and nothing
1039 can be released. Try to shrink route cache,
1040 it is most likely it holds some neighbour records.
1041 */
1042 if (attempts-- > 0) {
1043 int saved_elasticity = ip_rt_gc_elasticity;
1044 int saved_int = ip_rt_gc_min_interval;
1045 ip_rt_gc_elasticity = 1;
1046 ip_rt_gc_min_interval = 0;
569d3645 1047 rt_garbage_collect(&ipv4_dst_ops);
1da177e4
LT
1048 ip_rt_gc_min_interval = saved_int;
1049 ip_rt_gc_elasticity = saved_elasticity;
1050 goto restart;
1051 }
1052
1053 if (net_ratelimit())
1054 printk(KERN_WARNING "Neighbour table overflow.\n");
1055 rt_drop(rt);
1056 return -ENOBUFS;
1057 }
1058 }
1059
093c2ca4 1060 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1da177e4 1061#if RT_CACHE_DEBUG >= 2
093c2ca4 1062 if (rt->u.dst.rt_next) {
1da177e4
LT
1063 struct rtable *trt;
1064 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1065 NIPQUAD(rt->rt_dst));
093c2ca4 1066 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1da177e4
LT
1067 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1068 printk("\n");
1069 }
1070#endif
1071 rt_hash_table[hash].chain = rt;
22c047cc 1072 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1073 *rp = rt;
1074 return 0;
1075}
1076
1077void rt_bind_peer(struct rtable *rt, int create)
1078{
1079 static DEFINE_SPINLOCK(rt_peer_lock);
1080 struct inet_peer *peer;
1081
1082 peer = inet_getpeer(rt->rt_dst, create);
1083
1084 spin_lock_bh(&rt_peer_lock);
1085 if (rt->peer == NULL) {
1086 rt->peer = peer;
1087 peer = NULL;
1088 }
1089 spin_unlock_bh(&rt_peer_lock);
1090 if (peer)
1091 inet_putpeer(peer);
1092}
1093
1094/*
1095 * Peer allocation may fail only in serious out-of-memory conditions. However
1096 * we still can generate some output.
1097 * Random ID selection looks a bit dangerous because we have no chance to
1098 * select an ID that is unique within a reasonable period of time.
1099 * But a broken packet identifier may be better than no packet at all.
1100 */
1101static void ip_select_fb_ident(struct iphdr *iph)
1102{
1103 static DEFINE_SPINLOCK(ip_fb_id_lock);
1104 static u32 ip_fallback_id;
1105 u32 salt;
1106
1107 spin_lock_bh(&ip_fb_id_lock);
e448515c 1108 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1da177e4
LT
1109 iph->id = htons(salt & 0xFFFF);
1110 ip_fallback_id = salt;
1111 spin_unlock_bh(&ip_fb_id_lock);
1112}
1113
1114void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1115{
1116 struct rtable *rt = (struct rtable *) dst;
1117
1118 if (rt) {
1119 if (rt->peer == NULL)
1120 rt_bind_peer(rt, 1);
1121
1122 /* If peer is attached to destination, it is never detached,
1123 so we do not need to grab a lock to dereference it.
1124 */
1125 if (rt->peer) {
1126 iph->id = htons(inet_getid(rt->peer, more));
1127 return;
1128 }
1129 } else
e905a9ed 1130 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
9c2b3328 1131 __builtin_return_address(0));
1da177e4
LT
1132
1133 ip_select_fb_ident(iph);
1134}
1135
1136static void rt_del(unsigned hash, struct rtable *rt)
1137{
29e75252 1138 struct rtable **rthp, *aux;
1da177e4 1139
29e75252 1140 rthp = &rt_hash_table[hash].chain;
22c047cc 1141 spin_lock_bh(rt_hash_lock_addr(hash));
1da177e4 1142 ip_rt_put(rt);
29e75252
ED
1143 while ((aux = *rthp) != NULL) {
1144 if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1145 *rthp = aux->u.dst.rt_next;
1146 rt_free(aux);
1147 continue;
1da177e4 1148 }
29e75252
ED
1149 rthp = &aux->u.dst.rt_next;
1150 }
22c047cc 1151 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1152}
1153
f7655229
AV
1154void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1155 __be32 saddr, struct net_device *dev)
1da177e4
LT
1156{
1157 int i, k;
1158 struct in_device *in_dev = in_dev_get(dev);
1159 struct rtable *rth, **rthp;
f7655229 1160 __be32 skeys[2] = { saddr, 0 };
1da177e4 1161 int ikeys[2] = { dev->ifindex, 0 };
8d71740c 1162 struct netevent_redirect netevent;
317805b8 1163 struct net *net;
1da177e4 1164
1da177e4
LT
1165 if (!in_dev)
1166 return;
1167
c346dca1 1168 net = dev_net(dev);
1da177e4 1169 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1e637c74 1170 || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
f97c1e0c 1171 || ipv4_is_zeronet(new_gw))
1da177e4
LT
1172 goto reject_redirect;
1173
1174 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1175 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1176 goto reject_redirect;
1177 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1178 goto reject_redirect;
1179 } else {
317805b8 1180 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1da177e4
LT
1181 goto reject_redirect;
1182 }
1183
1184 for (i = 0; i < 2; i++) {
1185 for (k = 0; k < 2; k++) {
8c7bc840 1186 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1da177e4
LT
1187
1188 rthp=&rt_hash_table[hash].chain;
1189
1190 rcu_read_lock();
1191 while ((rth = rcu_dereference(*rthp)) != NULL) {
1192 struct rtable *rt;
1193
1194 if (rth->fl.fl4_dst != daddr ||
1195 rth->fl.fl4_src != skeys[i] ||
1da177e4 1196 rth->fl.oif != ikeys[k] ||
29e75252 1197 rth->fl.iif != 0 ||
317805b8 1198 rth->rt_genid != atomic_read(&rt_genid) ||
878628fb 1199 !net_eq(dev_net(rth->u.dst.dev), net)) {
093c2ca4 1200 rthp = &rth->u.dst.rt_next;
1da177e4
LT
1201 continue;
1202 }
1203
1204 if (rth->rt_dst != daddr ||
1205 rth->rt_src != saddr ||
1206 rth->u.dst.error ||
1207 rth->rt_gateway != old_gw ||
1208 rth->u.dst.dev != dev)
1209 break;
1210
1211 dst_hold(&rth->u.dst);
1212 rcu_read_unlock();
1213
1214 rt = dst_alloc(&ipv4_dst_ops);
1215 if (rt == NULL) {
1216 ip_rt_put(rth);
1217 in_dev_put(in_dev);
1218 return;
1219 }
1220
1221 /* Copy all the information. */
1222 *rt = *rth;
e905a9ed 1223 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1da177e4
LT
1224 rt->u.dst.__use = 1;
1225 atomic_set(&rt->u.dst.__refcnt, 1);
1226 rt->u.dst.child = NULL;
1227 if (rt->u.dst.dev)
1228 dev_hold(rt->u.dst.dev);
1229 if (rt->idev)
1230 in_dev_hold(rt->idev);
1231 rt->u.dst.obsolete = 0;
1232 rt->u.dst.lastuse = jiffies;
1233 rt->u.dst.path = &rt->u.dst;
1234 rt->u.dst.neighbour = NULL;
1235 rt->u.dst.hh = NULL;
1236 rt->u.dst.xfrm = NULL;
29e75252 1237 rt->rt_genid = atomic_read(&rt_genid);
1da177e4
LT
1238 rt->rt_flags |= RTCF_REDIRECTED;
1239
1240 /* Gateway is different ... */
1241 rt->rt_gateway = new_gw;
1242
1243 /* Redirect received -> path was valid */
1244 dst_confirm(&rth->u.dst);
1245
1246 if (rt->peer)
1247 atomic_inc(&rt->peer->refcnt);
1248
1249 if (arp_bind_neighbour(&rt->u.dst) ||
1250 !(rt->u.dst.neighbour->nud_state &
1251 NUD_VALID)) {
1252 if (rt->u.dst.neighbour)
1253 neigh_event_send(rt->u.dst.neighbour, NULL);
1254 ip_rt_put(rth);
1255 rt_drop(rt);
1256 goto do_next;
1257 }
e905a9ed 1258
8d71740c
TT
1259 netevent.old = &rth->u.dst;
1260 netevent.new = &rt->u.dst;
e905a9ed
YH
1261 call_netevent_notifiers(NETEVENT_REDIRECT,
1262 &netevent);
1da177e4
LT
1263
1264 rt_del(hash, rth);
1265 if (!rt_intern_hash(hash, rt, &rt))
1266 ip_rt_put(rt);
1267 goto do_next;
1268 }
1269 rcu_read_unlock();
1270 do_next:
1271 ;
1272 }
1273 }
1274 in_dev_put(in_dev);
1275 return;
1276
1277reject_redirect:
1278#ifdef CONFIG_IP_ROUTE_VERBOSE
1279 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1280 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1281 "%u.%u.%u.%u ignored.\n"
cef2685e 1282 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1da177e4 1283 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
cef2685e 1284 NIPQUAD(saddr), NIPQUAD(daddr));
1da177e4
LT
1285#endif
1286 in_dev_put(in_dev);
1287}
1288
1289static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1290{
ee6b9673 1291 struct rtable *rt = (struct rtable *)dst;
1da177e4
LT
1292 struct dst_entry *ret = dst;
1293
1294 if (rt) {
1295 if (dst->obsolete) {
1296 ip_rt_put(rt);
1297 ret = NULL;
1298 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1299 rt->u.dst.expires) {
8c7bc840
AV
1300 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1301 rt->fl.oif);
1da177e4 1302#if RT_CACHE_DEBUG >= 1
56c99d04 1303 printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1da177e4
LT
1304 "%u.%u.%u.%u/%02x dropped\n",
1305 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1306#endif
1307 rt_del(hash, rt);
1308 ret = NULL;
1309 }
1310 }
1311 return ret;
1312}
1313
1314/*
1315 * Algorithm:
1316 * 1. The first ip_rt_redirect_number redirects are sent
1317 * with exponential backoff, then we stop sending them at all,
1318 * assuming that the host ignores our redirects.
1319 * 2. If we did not see packets requiring redirects
1320 * during ip_rt_redirect_silence, we assume that the host
1321 * forgot the redirected route, and we start sending redirects again.
1322 *
1323 * This algorithm is much cheaper and more intelligent than dumb load limiting
1324 * in icmp.c.
1325 *
1326 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1327 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1328 */
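/*
 * Concretely, with the defaults above (ip_rt_redirect_number = 9,
 * ip_rt_redirect_load = HZ/50): after k redirects have been sent to a
 * host, the next one is only emitted once jiffies passes
 * rate_last + (ip_rt_redirect_load << k); after 9 unanswered redirects we
 * stay silent until ip_rt_redirect_silence elapses with no further
 * triggering packets.
 */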
1329
1330void ip_rt_send_redirect(struct sk_buff *skb)
1331{
ee6b9673 1332 struct rtable *rt = skb->rtable;
1da177e4
LT
1333 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1334
1335 if (!in_dev)
1336 return;
1337
1338 if (!IN_DEV_TX_REDIRECTS(in_dev))
1339 goto out;
1340
1341 /* No redirected packets during ip_rt_redirect_silence;
1342 * reset the algorithm.
1343 */
1344 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1345 rt->u.dst.rate_tokens = 0;
1346
1347 /* Too many ignored redirects; do not send anything
1348 * set u.dst.rate_last to the last seen redirected packet.
1349 */
1350 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1351 rt->u.dst.rate_last = jiffies;
1352 goto out;
1353 }
1354
1355 /* Check for load limit; set rate_last to the latest sent
1356 * redirect.
1357 */
14fb8a76
LY
1358 if (rt->u.dst.rate_tokens == 0 ||
1359 time_after(jiffies,
1da177e4
LT
1360 (rt->u.dst.rate_last +
1361 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1362 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1363 rt->u.dst.rate_last = jiffies;
1364 ++rt->u.dst.rate_tokens;
1365#ifdef CONFIG_IP_ROUTE_VERBOSE
1366 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1367 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1368 net_ratelimit())
1369 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1370 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1371 NIPQUAD(rt->rt_src), rt->rt_iif,
1372 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1373#endif
1374 }
1375out:
e905a9ed 1376 in_dev_put(in_dev);
1da177e4
LT
1377}
1378
1379static int ip_error(struct sk_buff *skb)
1380{
ee6b9673 1381 struct rtable *rt = skb->rtable;
1da177e4
LT
1382 unsigned long now;
1383 int code;
1384
1385 switch (rt->u.dst.error) {
1386 case EINVAL:
1387 default:
1388 goto out;
1389 case EHOSTUNREACH:
1390 code = ICMP_HOST_UNREACH;
1391 break;
1392 case ENETUNREACH:
1393 code = ICMP_NET_UNREACH;
7f53878d 1394 IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1da177e4
LT
1395 break;
1396 case EACCES:
1397 code = ICMP_PKT_FILTERED;
1398 break;
1399 }
1400
1401 now = jiffies;
1402 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1403 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1404 rt->u.dst.rate_tokens = ip_rt_error_burst;
1405 rt->u.dst.rate_last = now;
1406 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1407 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1408 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1409 }
1410
1411out: kfree_skb(skb);
1412 return 0;
e905a9ed 1413}
1da177e4
LT
1414
1415/*
1416 * The last two values are not from the RFC but
1417 * are needed for AMPRnet AX.25 paths.
1418 */
1419
9b5b5cff 1420static const unsigned short mtu_plateau[] =
1da177e4
LT
1421{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1422
1423static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1424{
1425 int i;
e905a9ed 1426
1da177e4
LT
1427 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1428 if (old_mtu > mtu_plateau[i])
1429 return mtu_plateau[i];
1430 return 68;
1431}
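/*
 * Worked example: if the router reports no usable MTU (the old-BSD case
 * handled in ip_rt_frag_needed() below) and the original packet was 1500
 * bytes, guess_mtu(1500) returns 1492, the first plateau strictly below
 * it; guess_mtu(576) returns 296, and anything at or below 128 falls back
 * to the minimum of 68.
 */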
1432
b5921910
DL
1433unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1434 unsigned short new_mtu)
1da177e4
LT
1435{
1436 int i;
1437 unsigned short old_mtu = ntohs(iph->tot_len);
1438 struct rtable *rth;
e448515c
AV
1439 __be32 skeys[2] = { iph->saddr, 0, };
1440 __be32 daddr = iph->daddr;
1da177e4
LT
1441 unsigned short est_mtu = 0;
1442
1443 if (ipv4_config.no_pmtu_disc)
1444 return 0;
1445
1446 for (i = 0; i < 2; i++) {
8c7bc840 1447 unsigned hash = rt_hash(daddr, skeys[i], 0);
1da177e4
LT
1448
1449 rcu_read_lock();
1450 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
093c2ca4 1451 rth = rcu_dereference(rth->u.dst.rt_next)) {
1da177e4
LT
1452 if (rth->fl.fl4_dst == daddr &&
1453 rth->fl.fl4_src == skeys[i] &&
1454 rth->rt_dst == daddr &&
1455 rth->rt_src == iph->saddr &&
1da177e4 1456 rth->fl.iif == 0 &&
b5921910 1457 !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
878628fb 1458 net_eq(dev_net(rth->u.dst.dev), net) &&
29e75252 1459 rth->rt_genid == atomic_read(&rt_genid)) {
1da177e4
LT
1460 unsigned short mtu = new_mtu;
1461
1462 if (new_mtu < 68 || new_mtu >= old_mtu) {
1463
1464 /* BSD 4.2 compatibility hack :-( */
1465 if (mtu == 0 &&
1466 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1467 old_mtu >= 68 + (iph->ihl << 2))
1468 old_mtu -= iph->ihl << 2;
1469
1470 mtu = guess_mtu(old_mtu);
1471 }
1472 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
e905a9ed 1473 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1da177e4
LT
1474 dst_confirm(&rth->u.dst);
1475 if (mtu < ip_rt_min_pmtu) {
1476 mtu = ip_rt_min_pmtu;
1477 rth->u.dst.metrics[RTAX_LOCK-1] |=
1478 (1 << RTAX_MTU);
1479 }
1480 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1481 dst_set_expires(&rth->u.dst,
1482 ip_rt_mtu_expires);
1483 }
1484 est_mtu = mtu;
1485 }
1486 }
1487 }
1488 rcu_read_unlock();
1489 }
1490 return est_mtu ? : new_mtu;
1491}
1492
1493static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1494{
1495 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1496 !(dst_metric_locked(dst, RTAX_MTU))) {
1497 if (mtu < ip_rt_min_pmtu) {
1498 mtu = ip_rt_min_pmtu;
1499 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1500 }
1501 dst->metrics[RTAX_MTU-1] = mtu;
1502 dst_set_expires(dst, ip_rt_mtu_expires);
8d71740c 1503 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1da177e4
LT
1504 }
1505}
1506
1507static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1508{
1509 return NULL;
1510}
1511
1512static void ipv4_dst_destroy(struct dst_entry *dst)
1513{
1514 struct rtable *rt = (struct rtable *) dst;
1515 struct inet_peer *peer = rt->peer;
1516 struct in_device *idev = rt->idev;
1517
1518 if (peer) {
1519 rt->peer = NULL;
1520 inet_putpeer(peer);
1521 }
1522
1523 if (idev) {
1524 rt->idev = NULL;
1525 in_dev_put(idev);
1526 }
1527}
1528
1529static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1530 int how)
1531{
1532 struct rtable *rt = (struct rtable *) dst;
1533 struct in_device *idev = rt->idev;
c346dca1 1534 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
5a3e55d6 1535 struct in_device *loopback_idev =
c346dca1 1536 in_dev_get(dev_net(dev)->loopback_dev);
1da177e4
LT
1537 if (loopback_idev) {
1538 rt->idev = loopback_idev;
1539 in_dev_put(idev);
1540 }
1541 }
1542}
1543
1544static void ipv4_link_failure(struct sk_buff *skb)
1545{
1546 struct rtable *rt;
1547
1548 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1549
ee6b9673 1550 rt = skb->rtable;
1da177e4
LT
1551 if (rt)
1552 dst_set_expires(&rt->u.dst, 0);
1553}
1554
1555static int ip_rt_bug(struct sk_buff *skb)
1556{
1557 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
eddc9ec5 1558 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1da177e4
LT
1559 skb->dev ? skb->dev->name : "?");
1560 kfree_skb(skb);
1561 return 0;
1562}
1563
1564/*
1565 We do not cache the source address of the outgoing interface,
1566 because it is used only by the IP RR, TS and SRR options,
1567 so it is out of the fast path.
1568
1569 BTW remember: "addr" is allowed to be unaligned
1570 in IP options!
1571 */
1572
1573void ip_rt_get_source(u8 *addr, struct rtable *rt)
1574{
a61ced5d 1575 __be32 src;
1da177e4
LT
1576 struct fib_result res;
1577
1578 if (rt->fl.iif == 0)
1579 src = rt->rt_src;
c346dca1 1580 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1da177e4
LT
1581 src = FIB_RES_PREFSRC(res);
1582 fib_res_put(&res);
1583 } else
1584 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1585 RT_SCOPE_UNIVERSE);
1586 memcpy(addr, &src, 4);
1587}
1588
1589#ifdef CONFIG_NET_CLS_ROUTE
1590static void set_class_tag(struct rtable *rt, u32 tag)
1591{
1592 if (!(rt->u.dst.tclassid & 0xFFFF))
1593 rt->u.dst.tclassid |= tag & 0xFFFF;
1594 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1595 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1596}
1597#endif
1598
1599static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1600{
1601 struct fib_info *fi = res->fi;
1602
1603 if (fi) {
1604 if (FIB_RES_GW(*res) &&
1605 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1606 rt->rt_gateway = FIB_RES_GW(*res);
1607 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1608 sizeof(rt->u.dst.metrics));
1609 if (fi->fib_mtu == 0) {
1610 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1611 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1612 rt->rt_gateway != rt->rt_dst &&
1613 rt->u.dst.dev->mtu > 576)
1614 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1615 }
1616#ifdef CONFIG_NET_CLS_ROUTE
1617 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1618#endif
1619 } else
1620 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1621
1622 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1623 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1624 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1625 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1626 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1627 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1628 ip_rt_min_advmss);
1629 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1630 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1631
1632#ifdef CONFIG_NET_CLS_ROUTE
1633#ifdef CONFIG_IP_MULTIPLE_TABLES
1634 set_class_tag(rt, fib_rules_tclass(res));
1635#endif
1636 set_class_tag(rt, itag);
1637#endif
e905a9ed 1638 rt->rt_type = res->type;
1da177e4
LT
1639}
1640
9e12bb22 1641static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1642 u8 tos, struct net_device *dev, int our)
1643{
1644 unsigned hash;
1645 struct rtable *rth;
a61ced5d 1646 __be32 spec_dst;
1da177e4
LT
1647 struct in_device *in_dev = in_dev_get(dev);
1648 u32 itag = 0;
1649
1650 /* Primary sanity checks. */
1651
1652 if (in_dev == NULL)
1653 return -EINVAL;
1654
1e637c74 1655 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
f97c1e0c 1656 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1da177e4
LT
1657 goto e_inval;
1658
f97c1e0c
JP
1659 if (ipv4_is_zeronet(saddr)) {
1660 if (!ipv4_is_local_multicast(daddr))
1da177e4
LT
1661 goto e_inval;
1662 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1663 } else if (fib_validate_source(saddr, 0, tos, 0,
1664 dev, &spec_dst, &itag) < 0)
1665 goto e_inval;
1666
1667 rth = dst_alloc(&ipv4_dst_ops);
1668 if (!rth)
1669 goto e_nobufs;
1670
1671 rth->u.dst.output= ip_rt_bug;
1672
1673 atomic_set(&rth->u.dst.__refcnt, 1);
1674 rth->u.dst.flags= DST_HOST;
42f811b8 1675 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4
LT
1676 rth->u.dst.flags |= DST_NOPOLICY;
1677 rth->fl.fl4_dst = daddr;
1678 rth->rt_dst = daddr;
1679 rth->fl.fl4_tos = tos;
47dcf0cb 1680 rth->fl.mark = skb->mark;
1da177e4
LT
1681 rth->fl.fl4_src = saddr;
1682 rth->rt_src = saddr;
1683#ifdef CONFIG_NET_CLS_ROUTE
1684 rth->u.dst.tclassid = itag;
1685#endif
1686 rth->rt_iif =
1687 rth->fl.iif = dev->ifindex;
2774c7ab 1688 rth->u.dst.dev = init_net.loopback_dev;
1da177e4
LT
1689 dev_hold(rth->u.dst.dev);
1690 rth->idev = in_dev_get(rth->u.dst.dev);
1691 rth->fl.oif = 0;
1692 rth->rt_gateway = daddr;
1693 rth->rt_spec_dst= spec_dst;
29e75252 1694 rth->rt_genid = atomic_read(&rt_genid);
1da177e4 1695 rth->rt_flags = RTCF_MULTICAST;
29e75252 1696 rth->rt_type = RTN_MULTICAST;
1da177e4
LT
1697 if (our) {
1698 rth->u.dst.input= ip_local_deliver;
1699 rth->rt_flags |= RTCF_LOCAL;
1700 }
1701
1702#ifdef CONFIG_IP_MROUTE
f97c1e0c 1703 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1da177e4
LT
1704 rth->u.dst.input = ip_mr_input;
1705#endif
1706 RT_CACHE_STAT_INC(in_slow_mc);
1707
1708 in_dev_put(in_dev);
8c7bc840 1709 hash = rt_hash(daddr, saddr, dev->ifindex);
ee6b9673 1710 return rt_intern_hash(hash, rth, &skb->rtable);
1da177e4
LT
1711
1712e_nobufs:
1713 in_dev_put(in_dev);
1714 return -ENOBUFS;
1715
1716e_inval:
1717 in_dev_put(in_dev);
1718 return -EINVAL;
1719}
1720
1721
1722static void ip_handle_martian_source(struct net_device *dev,
1723 struct in_device *in_dev,
1724 struct sk_buff *skb,
9e12bb22
AV
1725 __be32 daddr,
1726 __be32 saddr)
1da177e4
LT
1727{
1728 RT_CACHE_STAT_INC(in_martian_src);
1729#ifdef CONFIG_IP_ROUTE_VERBOSE
1730 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1731 /*
1732 * RFC1812 recommendation, if source is martian,
1733 * the only hint is MAC header.
1734 */
1735 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1736 "%u.%u.%u.%u, on dev %s\n",
1737 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
98e399f8 1738 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1da177e4 1739 int i;
98e399f8 1740 const unsigned char *p = skb_mac_header(skb);
1da177e4
LT
1741 printk(KERN_WARNING "ll header: ");
1742 for (i = 0; i < dev->hard_header_len; i++, p++) {
1743 printk("%02x", *p);
1744 if (i < (dev->hard_header_len - 1))
1745 printk(":");
1746 }
1747 printk("\n");
1748 }
1749 }
1750#endif
1751}
1752
e905a9ed
YH
1753static inline int __mkroute_input(struct sk_buff *skb,
1754 struct fib_result* res,
1755 struct in_device *in_dev,
9e12bb22 1756 __be32 daddr, __be32 saddr, u32 tos,
e905a9ed 1757 struct rtable **result)
1da177e4
LT
1758{
1759
1760 struct rtable *rth;
1761 int err;
1762 struct in_device *out_dev;
1763 unsigned flags = 0;
d9c9df8c
AV
1764 __be32 spec_dst;
1765 u32 itag;
1da177e4
LT
1766
1767 /* get a working reference to the output device */
1768 out_dev = in_dev_get(FIB_RES_DEV(*res));
1769 if (out_dev == NULL) {
1770 if (net_ratelimit())
1771 printk(KERN_CRIT "Bug in ip_route_input" \
1772 "_slow(). Please, report\n");
1773 return -EINVAL;
1774 }
1775
1776
e905a9ed 1777 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1da177e4
LT
1778 in_dev->dev, &spec_dst, &itag);
1779 if (err < 0) {
e905a9ed 1780 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1da177e4 1781 saddr);
e905a9ed 1782
1da177e4
LT
1783 err = -EINVAL;
1784 goto cleanup;
1785 }
1786
1787 if (err)
1788 flags |= RTCF_DIRECTSRC;
1789
cb7928a5 1790 if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1da177e4
LT
1791 (IN_DEV_SHARED_MEDIA(out_dev) ||
1792 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1793 flags |= RTCF_DOREDIRECT;
1794
1795 if (skb->protocol != htons(ETH_P_IP)) {
1796 /* Not IP (i.e. ARP). Do not create route, if it is
1797 * invalid for proxy arp. DNAT routes are always valid.
1798 */
cb7928a5 1799 if (out_dev == in_dev) {
1da177e4
LT
1800 err = -EINVAL;
1801 goto cleanup;
1802 }
1803 }
1804
1805
1806 rth = dst_alloc(&ipv4_dst_ops);
1807 if (!rth) {
1808 err = -ENOBUFS;
1809 goto cleanup;
1810 }
1811
ce723d8e 1812 atomic_set(&rth->u.dst.__refcnt, 1);
1da177e4 1813 rth->u.dst.flags= DST_HOST;
42f811b8 1814 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4 1815 rth->u.dst.flags |= DST_NOPOLICY;
42f811b8 1816 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1da177e4
LT
1817 rth->u.dst.flags |= DST_NOXFRM;
1818 rth->fl.fl4_dst = daddr;
1819 rth->rt_dst = daddr;
1820 rth->fl.fl4_tos = tos;
47dcf0cb 1821 rth->fl.mark = skb->mark;
1da177e4
LT
1822 rth->fl.fl4_src = saddr;
1823 rth->rt_src = saddr;
1824 rth->rt_gateway = daddr;
1825 rth->rt_iif =
1826 rth->fl.iif = in_dev->dev->ifindex;
1827 rth->u.dst.dev = (out_dev)->dev;
1828 dev_hold(rth->u.dst.dev);
1829 rth->idev = in_dev_get(rth->u.dst.dev);
1830 rth->fl.oif = 0;
1831 rth->rt_spec_dst= spec_dst;
1832
1833 rth->u.dst.input = ip_forward;
1834 rth->u.dst.output = ip_output;
29e75252 1835 rth->rt_genid = atomic_read(&rt_genid);
1da177e4
LT
1836
1837 rt_set_nexthop(rth, res, itag);
1838
1839 rth->rt_flags = flags;
1840
1841 *result = rth;
1842 err = 0;
1843 cleanup:
1844 /* release the working reference to the output device */
1845 in_dev_put(out_dev);
1846 return err;
e905a9ed 1847}
1da177e4 1848
e06e7c61
DM
1849static inline int ip_mkroute_input(struct sk_buff *skb,
1850 struct fib_result* res,
1851 const struct flowi *fl,
1852 struct in_device *in_dev,
1853 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1854{
7abaa27c 1855 struct rtable* rth = NULL;
1da177e4
LT
1856 int err;
1857 unsigned hash;
1858
1859#ifdef CONFIG_IP_ROUTE_MULTIPATH
1860 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1861 fib_select_multipath(fl, res);
1862#endif
1863
1864 /* create a routing cache entry */
1865 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1866 if (err)
1867 return err;
1da177e4
LT
1868
1869 /* put it into the cache */
8c7bc840 1870 hash = rt_hash(daddr, saddr, fl->iif);
ee6b9673 1871 return rt_intern_hash(hash, rth, &skb->rtable);
1da177e4
LT
1872}
1873
1da177e4
LT
1874/*
1875 * NOTE. We drop all the packets that have a local source
1876 * address, because every properly looped-back packet
1877 * must have the correct destination already attached by the output routine.
1878 *
1879 * This approach solves two big problems:
1880 * 1. Non-simplex devices are handled properly.
1881 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1882 */
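/*
 * Worked example (sketch): with the checks below, a frame arriving on
 * eth0 with source address 127.0.0.1 or 224.0.0.1 is rejected as a
 * martian source before fib_lookup() is ever consulted, while a 0.0.0.0
 * source is tolerated only when the destination is the limited broadcast
 * address.
 */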
1883
9e12bb22 1884static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1885 u8 tos, struct net_device *dev)
1886{
1887 struct fib_result res;
1888 struct in_device *in_dev = in_dev_get(dev);
1889 struct flowi fl = { .nl_u = { .ip4_u =
1890 { .daddr = daddr,
1891 .saddr = saddr,
1892 .tos = tos,
1893 .scope = RT_SCOPE_UNIVERSE,
1da177e4 1894 } },
47dcf0cb 1895 .mark = skb->mark,
1da177e4
LT
1896 .iif = dev->ifindex };
1897 unsigned flags = 0;
1898 u32 itag = 0;
1899 struct rtable * rth;
1900 unsigned hash;
9e12bb22 1901 __be32 spec_dst;
1da177e4
LT
1902 int err = -EINVAL;
1903 int free_res = 0;
c346dca1 1904 struct net * net = dev_net(dev);
1da177e4
LT
1905
1906 /* IP on this device is disabled. */
1907
1908 if (!in_dev)
1909 goto out;
1910
1911 /* Check for the most weird martians, which cannot be detected
1912 by fib_lookup.
1913 */
1914
1e637c74 1915 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
f97c1e0c 1916 ipv4_is_loopback(saddr))
1da177e4
LT
1917 goto martian_source;
1918
e448515c 1919 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1da177e4
LT
1920 goto brd_input;
1921
1922 /* Accept zero addresses only to limited broadcast;
1923 * I do not even know whether to fix it or not. Waiting for complaints :-)
1924 */
f97c1e0c 1925 if (ipv4_is_zeronet(saddr))
1da177e4
LT
1926 goto martian_source;
1927
1e637c74 1928 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
f97c1e0c 1929 ipv4_is_loopback(daddr))
1da177e4
LT
1930 goto martian_destination;
1931
1932 /*
1933 * Now we are ready to route the packet.
1934 */
84a885f4 1935 if ((err = fib_lookup(net, &fl, &res)) != 0) {
1da177e4 1936 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 1937 goto e_hostunreach;
1da177e4
LT
1938 goto no_route;
1939 }
1940 free_res = 1;
1941
1942 RT_CACHE_STAT_INC(in_slow_tot);
1943
1944 if (res.type == RTN_BROADCAST)
1945 goto brd_input;
1946
1947 if (res.type == RTN_LOCAL) {
1948 int result;
1949 result = fib_validate_source(saddr, daddr, tos,
84a885f4 1950 net->loopback_dev->ifindex,
1da177e4
LT
1951 dev, &spec_dst, &itag);
1952 if (result < 0)
1953 goto martian_source;
1954 if (result)
1955 flags |= RTCF_DIRECTSRC;
1956 spec_dst = daddr;
1957 goto local_input;
1958 }
1959
1960 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 1961 goto e_hostunreach;
1da177e4
LT
1962 if (res.type != RTN_UNICAST)
1963 goto martian_destination;
1964
1965 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1da177e4
LT
1966done:
1967 in_dev_put(in_dev);
1968 if (free_res)
1969 fib_res_put(&res);
1970out: return err;
1971
1972brd_input:
1973 if (skb->protocol != htons(ETH_P_IP))
1974 goto e_inval;
1975
f97c1e0c 1976 if (ipv4_is_zeronet(saddr))
1da177e4
LT
1977 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1978 else {
1979 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1980 &itag);
1981 if (err < 0)
1982 goto martian_source;
1983 if (err)
1984 flags |= RTCF_DIRECTSRC;
1985 }
1986 flags |= RTCF_BROADCAST;
1987 res.type = RTN_BROADCAST;
1988 RT_CACHE_STAT_INC(in_brd);
1989
1990local_input:
1991 rth = dst_alloc(&ipv4_dst_ops);
1992 if (!rth)
1993 goto e_nobufs;
1994
1995 rth->u.dst.output= ip_rt_bug;
29e75252 1996 rth->rt_genid = atomic_read(&rt_genid);
1da177e4
LT
1997
1998 atomic_set(&rth->u.dst.__refcnt, 1);
1999 rth->u.dst.flags= DST_HOST;
42f811b8 2000 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4
LT
2001 rth->u.dst.flags |= DST_NOPOLICY;
2002 rth->fl.fl4_dst = daddr;
2003 rth->rt_dst = daddr;
2004 rth->fl.fl4_tos = tos;
47dcf0cb 2005 rth->fl.mark = skb->mark;
1da177e4
LT
2006 rth->fl.fl4_src = saddr;
2007 rth->rt_src = saddr;
2008#ifdef CONFIG_NET_CLS_ROUTE
2009 rth->u.dst.tclassid = itag;
2010#endif
2011 rth->rt_iif =
2012 rth->fl.iif = dev->ifindex;
84a885f4 2013 rth->u.dst.dev = net->loopback_dev;
1da177e4
LT
2014 dev_hold(rth->u.dst.dev);
2015 rth->idev = in_dev_get(rth->u.dst.dev);
2016 rth->rt_gateway = daddr;
2017 rth->rt_spec_dst= spec_dst;
2018 rth->u.dst.input= ip_local_deliver;
2019 rth->rt_flags = flags|RTCF_LOCAL;
2020 if (res.type == RTN_UNREACHABLE) {
2021 rth->u.dst.input= ip_error;
2022 rth->u.dst.error= -err;
2023 rth->rt_flags &= ~RTCF_LOCAL;
2024 }
2025 rth->rt_type = res.type;
8c7bc840 2026 hash = rt_hash(daddr, saddr, fl.iif);
ee6b9673 2027 err = rt_intern_hash(hash, rth, &skb->rtable);
1da177e4
LT
2028 goto done;
2029
2030no_route:
2031 RT_CACHE_STAT_INC(in_no_route);
2032 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2033 res.type = RTN_UNREACHABLE;
7f53878d
MC
2034 if (err == -ESRCH)
2035 err = -ENETUNREACH;
1da177e4
LT
2036 goto local_input;
2037
2038 /*
2039 * Do not cache martian addresses: they should be logged (RFC1812)
2040 */
2041martian_destination:
2042 RT_CACHE_STAT_INC(in_martian_dst);
2043#ifdef CONFIG_IP_ROUTE_VERBOSE
2044 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2045 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2046 "%u.%u.%u.%u, dev %s\n",
2047 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2048#endif
2c2910a4
DE
2049
2050e_hostunreach:
e905a9ed
YH
2051 err = -EHOSTUNREACH;
2052 goto done;
2c2910a4 2053
1da177e4
LT
2054e_inval:
2055 err = -EINVAL;
2056 goto done;
2057
2058e_nobufs:
2059 err = -ENOBUFS;
2060 goto done;
2061
2062martian_source:
2063 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2064 goto e_inval;
2065}
2066
9e12bb22 2067int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
2068 u8 tos, struct net_device *dev)
2069{
2070 struct rtable * rth;
2071 unsigned hash;
2072 int iif = dev->ifindex;
b5921910 2073 struct net *net;
1da177e4 2074
c346dca1 2075 net = dev_net(dev);
1da177e4 2076 tos &= IPTOS_RT_MASK;
8c7bc840 2077 hash = rt_hash(daddr, saddr, iif);
1da177e4
LT
2078
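/*
 * Cache hit path: the chain selected by rt_hash() is walked under RCU and
 * an entry is reused only if daddr, saddr, iif, mark and tos match, the
 * entry belongs to the same namespace, and its rt_genid is still current.
 */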
2079 rcu_read_lock();
2080 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
093c2ca4 2081 rth = rcu_dereference(rth->u.dst.rt_next)) {
1da177e4
LT
2082 if (rth->fl.fl4_dst == daddr &&
2083 rth->fl.fl4_src == saddr &&
2084 rth->fl.iif == iif &&
2085 rth->fl.oif == 0 &&
47dcf0cb 2086 rth->fl.mark == skb->mark &&
b5921910 2087 rth->fl.fl4_tos == tos &&
878628fb 2088 net_eq(dev_net(rth->u.dst.dev), net) &&
29e75252 2089 rth->rt_genid == atomic_read(&rt_genid)) {
03f49f34 2090 dst_use(&rth->u.dst, jiffies);
1da177e4
LT
2091 RT_CACHE_STAT_INC(in_hit);
2092 rcu_read_unlock();
ee6b9673 2093 skb->rtable = rth;
1da177e4
LT
2094 return 0;
2095 }
2096 RT_CACHE_STAT_INC(in_hlist_search);
2097 }
2098 rcu_read_unlock();
2099
2100 /* Multicast recognition logic was moved from the route cache to here.
2101 The problem was that too many Ethernet cards have broken/missing
2102 hardware multicast filters :-( As a result, a host on a multicast
2103 network acquires a lot of useless route cache entries, e.g. for
2104 SDR messages from all over the world. Now we try to get rid of them.
2105 Really, provided the software IP multicast filter is organized
2106 reasonably (at least, hashed), it does not result in a slowdown
2107 compared with route cache reject entries.
2108 Note that multicast routers are not affected, because a
2109 route cache entry is created for them eventually.
2110 */
f97c1e0c 2111 if (ipv4_is_multicast(daddr)) {
1da177e4
LT
2112 struct in_device *in_dev;
2113
2114 rcu_read_lock();
e5ed6399 2115 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
1da177e4 2116 int our = ip_check_mc(in_dev, daddr, saddr,
eddc9ec5 2117 ip_hdr(skb)->protocol);
1da177e4
LT
2118 if (our
2119#ifdef CONFIG_IP_MROUTE
f97c1e0c
JP
2120 || (!ipv4_is_local_multicast(daddr) &&
2121 IN_DEV_MFORWARD(in_dev))
1da177e4
LT
2122#endif
2123 ) {
2124 rcu_read_unlock();
2125 return ip_route_input_mc(skb, daddr, saddr,
2126 tos, dev, our);
2127 }
2128 }
2129 rcu_read_unlock();
2130 return -EINVAL;
2131 }
2132 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2133}
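/*
 * Usage sketch (illustrative; assumes the typical ip_rcv_finish() caller
 * on the receive path):
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev))
 *		goto drop;	(no route: the packet is dropped)
 *
 * On success skb->rtable is set and the attached dst's ->input later
 * dispatches to ip_forward(), ip_local_deliver() or ip_mr_input().
 */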
2134
2135static inline int __mkroute_output(struct rtable **result,
e905a9ed 2136 struct fib_result* res,
1da177e4 2137 const struct flowi *fl,
e905a9ed
YH
2138 const struct flowi *oldflp,
2139 struct net_device *dev_out,
2140 unsigned flags)
1da177e4
LT
2141{
2142 struct rtable *rth;
2143 struct in_device *in_dev;
2144 u32 tos = RT_FL_TOS(oldflp);
2145 int err = 0;
2146
f97c1e0c 2147 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
1da177e4
LT
2148 return -EINVAL;
2149
e448515c 2150 if (fl->fl4_dst == htonl(0xFFFFFFFF))
1da177e4 2151 res->type = RTN_BROADCAST;
f97c1e0c 2152 else if (ipv4_is_multicast(fl->fl4_dst))
1da177e4 2153 res->type = RTN_MULTICAST;
1e637c74 2154 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
1da177e4
LT
2155 return -EINVAL;
2156
2157 if (dev_out->flags & IFF_LOOPBACK)
2158 flags |= RTCF_LOCAL;
2159
2160 /* get a working reference to the inet device */
2161 in_dev = in_dev_get(dev_out);
2162 if (!in_dev)
2163 return -EINVAL;
2164
2165 if (res->type == RTN_BROADCAST) {
2166 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2167 if (res->fi) {
2168 fib_info_put(res->fi);
2169 res->fi = NULL;
2170 }
2171 } else if (res->type == RTN_MULTICAST) {
2172 flags |= RTCF_MULTICAST|RTCF_LOCAL;
e905a9ed 2173 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
1da177e4
LT
2174 oldflp->proto))
2175 flags &= ~RTCF_LOCAL;
2176 /* If a multicast route does not exist, use
2177 the default one, but do not gateway in this case.
2178 Yes, it is a hack.
2179 */
2180 if (res->fi && res->prefixlen < 4) {
2181 fib_info_put(res->fi);
2182 res->fi = NULL;
2183 }
2184 }
2185
2186
2187 rth = dst_alloc(&ipv4_dst_ops);
2188 if (!rth) {
2189 err = -ENOBUFS;
2190 goto cleanup;
e905a9ed 2191 }
1da177e4 2192
ce723d8e 2193 atomic_set(&rth->u.dst.__refcnt, 1);
1da177e4 2194 rth->u.dst.flags= DST_HOST;
42f811b8 2195 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
1da177e4 2196 rth->u.dst.flags |= DST_NOXFRM;
42f811b8 2197 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4
LT
2198 rth->u.dst.flags |= DST_NOPOLICY;
2199
2200 rth->fl.fl4_dst = oldflp->fl4_dst;
2201 rth->fl.fl4_tos = tos;
2202 rth->fl.fl4_src = oldflp->fl4_src;
2203 rth->fl.oif = oldflp->oif;
47dcf0cb 2204 rth->fl.mark = oldflp->mark;
1da177e4
LT
2205 rth->rt_dst = fl->fl4_dst;
2206 rth->rt_src = fl->fl4_src;
2207 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
e905a9ed 2208 /* get references to the devices that are to be held by the routing
1da177e4
LT
2209 cache entry */
2210 rth->u.dst.dev = dev_out;
2211 dev_hold(dev_out);
2212 rth->idev = in_dev_get(dev_out);
2213 rth->rt_gateway = fl->fl4_dst;
2214 rth->rt_spec_dst= fl->fl4_src;
2215
2216 rth->u.dst.output=ip_output;
29e75252 2217 rth->rt_genid = atomic_read(&rt_genid);
1da177e4
LT
2218
2219 RT_CACHE_STAT_INC(out_slow_tot);
2220
2221 if (flags & RTCF_LOCAL) {
2222 rth->u.dst.input = ip_local_deliver;
2223 rth->rt_spec_dst = fl->fl4_dst;
2224 }
2225 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2226 rth->rt_spec_dst = fl->fl4_src;
e905a9ed 2227 if (flags & RTCF_LOCAL &&
1da177e4
LT
2228 !(dev_out->flags & IFF_LOOPBACK)) {
2229 rth->u.dst.output = ip_mc_output;
2230 RT_CACHE_STAT_INC(out_slow_mc);
2231 }
2232#ifdef CONFIG_IP_MROUTE
2233 if (res->type == RTN_MULTICAST) {
2234 if (IN_DEV_MFORWARD(in_dev) &&
f97c1e0c 2235 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
1da177e4
LT
2236 rth->u.dst.input = ip_mr_input;
2237 rth->u.dst.output = ip_mc_output;
2238 }
2239 }
2240#endif
2241 }
2242
2243 rt_set_nexthop(rth, res, 0);
2244
2245 rth->rt_flags = flags;
2246
2247 *result = rth;
2248 cleanup:
2249 /* release the working reference to the inet device */
2250 in_dev_put(in_dev);
2251
2252 return err;
2253}
2254
e06e7c61
DM
2255static inline int ip_mkroute_output(struct rtable **rp,
2256 struct fib_result* res,
2257 const struct flowi *fl,
2258 const struct flowi *oldflp,
2259 struct net_device *dev_out,
2260 unsigned flags)
1da177e4 2261{
7abaa27c 2262 struct rtable *rth = NULL;
1da177e4
LT
2263 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2264 unsigned hash;
2265 if (err == 0) {
8c7bc840 2266 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
1da177e4
LT
2267 err = rt_intern_hash(hash, rth, rp);
2268 }
e905a9ed 2269
1da177e4
LT
2270 return err;
2271}
2272
1da177e4
LT
2273/*
2274 * Major route resolver routine.
2275 */
2276
b40afd0e
DL
2277static int ip_route_output_slow(struct net *net, struct rtable **rp,
2278 const struct flowi *oldflp)
1da177e4
LT
2279{
2280 u32 tos = RT_FL_TOS(oldflp);
2281 struct flowi fl = { .nl_u = { .ip4_u =
2282 { .daddr = oldflp->fl4_dst,
2283 .saddr = oldflp->fl4_src,
2284 .tos = tos & IPTOS_RT_MASK,
2285 .scope = ((tos & RTO_ONLINK) ?
2286 RT_SCOPE_LINK :
2287 RT_SCOPE_UNIVERSE),
1da177e4 2288 } },
47dcf0cb 2289 .mark = oldflp->mark,
b40afd0e 2290 .iif = net->loopback_dev->ifindex,
1da177e4
LT
2291 .oif = oldflp->oif };
2292 struct fib_result res;
2293 unsigned flags = 0;
2294 struct net_device *dev_out = NULL;
2295 int free_res = 0;
2296 int err;
2297
2298
2299 res.fi = NULL;
2300#ifdef CONFIG_IP_MULTIPLE_TABLES
2301 res.r = NULL;
2302#endif
2303
2304 if (oldflp->fl4_src) {
2305 err = -EINVAL;
f97c1e0c 2306 if (ipv4_is_multicast(oldflp->fl4_src) ||
1e637c74 2307 ipv4_is_lbcast(oldflp->fl4_src) ||
f97c1e0c 2308 ipv4_is_zeronet(oldflp->fl4_src))
1da177e4
LT
2309 goto out;
2310
2311 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
b40afd0e 2312 dev_out = ip_dev_find(net, oldflp->fl4_src);
f6c5d736 2313 if (dev_out == NULL)
1da177e4
LT
2314 goto out;
2315
2316 /* I removed the check for oif == dev_out->oif here.
2317 It was wrong for two reasons:
1ab35276
DL
2318 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2319 is assigned to multiple interfaces.
1da177e4
LT
2320 2. Moreover, we are allowed to send packets with saddr
2321 of another iface. --ANK
2322 */
2323
f6c5d736 2324 if (oldflp->oif == 0
f97c1e0c
JP
2325 && (ipv4_is_multicast(oldflp->fl4_dst) ||
2326 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
1da177e4
LT
2327 /* Special hack: user can direct multicasts
2328 and limited broadcast via the necessary interface
2329 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2330 This hack is not just for fun, it allows
2331 vic, vat and friends to work.
2332 They bind a socket to loopback, set ttl to zero
2333 and expect that it will work.
2334 From the viewpoint of the routing cache they are broken,
2335 because we are not allowed to build a multicast path
2336 with a loopback source addr (look, the routing cache
2337 cannot know that ttl is zero, so that the packet
2338 will not leave this host and the route is valid).
2339 Luckily, this hack is a good workaround.
2340 */
2341
2342 fl.oif = dev_out->ifindex;
2343 goto make_route;
2344 }
2345 if (dev_out)
2346 dev_put(dev_out);
2347 dev_out = NULL;
2348 }
2349
2350
2351 if (oldflp->oif) {
b40afd0e 2352 dev_out = dev_get_by_index(net, oldflp->oif);
1da177e4
LT
2353 err = -ENODEV;
2354 if (dev_out == NULL)
2355 goto out;
e5ed6399
HX
2356
2357 /* RACE: Check return value of inet_select_addr instead. */
2358 if (__in_dev_get_rtnl(dev_out) == NULL) {
1da177e4
LT
2359 dev_put(dev_out);
2360 goto out; /* Wrong error code */
2361 }
2362
f97c1e0c
JP
2363 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2364 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
1da177e4
LT
2365 if (!fl.fl4_src)
2366 fl.fl4_src = inet_select_addr(dev_out, 0,
2367 RT_SCOPE_LINK);
2368 goto make_route;
2369 }
2370 if (!fl.fl4_src) {
f97c1e0c 2371 if (ipv4_is_multicast(oldflp->fl4_dst))
1da177e4
LT
2372 fl.fl4_src = inet_select_addr(dev_out, 0,
2373 fl.fl4_scope);
2374 else if (!oldflp->fl4_dst)
2375 fl.fl4_src = inet_select_addr(dev_out, 0,
2376 RT_SCOPE_HOST);
2377 }
2378 }
2379
2380 if (!fl.fl4_dst) {
2381 fl.fl4_dst = fl.fl4_src;
2382 if (!fl.fl4_dst)
2383 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2384 if (dev_out)
2385 dev_put(dev_out);
b40afd0e 2386 dev_out = net->loopback_dev;
1da177e4 2387 dev_hold(dev_out);
b40afd0e 2388 fl.oif = net->loopback_dev->ifindex;
1da177e4
LT
2389 res.type = RTN_LOCAL;
2390 flags |= RTCF_LOCAL;
2391 goto make_route;
2392 }
2393
b40afd0e 2394 if (fib_lookup(net, &fl, &res)) {
1da177e4
LT
2395 res.fi = NULL;
2396 if (oldflp->oif) {
2397 /* Apparently, the routing tables are wrong. Assume
2398 that the destination is on link.
2399
2400 WHY? DW.
2401 Because we are allowed to send to an iface
2402 even if it has NO routes and NO assigned
2403 addresses. When oif is specified, the routing
2404 tables are looked up with only one purpose:
2405 to check whether the destination is gatewayed, rather than
2406 direct. Moreover, if MSG_DONTROUTE is set,
2407 we send the packet, ignoring both routing tables
2408 and ifaddr state. --ANK
2409
2410
2411 We could do this even if oif is unknown,
2412 as IPv6 likely does, but we do not.
2413 */
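/*
 * Concrete case (sketch): "ping -I eth0 203.0.113.7" with no matching
 * route installed (not even a default) still reaches make_route with an
 * on-link RTN_UNICAST result out of eth0, using the link-scope source
 * address chosen by inet_select_addr() below.
 */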
2414
2415 if (fl.fl4_src == 0)
2416 fl.fl4_src = inet_select_addr(dev_out, 0,
2417 RT_SCOPE_LINK);
2418 res.type = RTN_UNICAST;
2419 goto make_route;
2420 }
2421 if (dev_out)
2422 dev_put(dev_out);
2423 err = -ENETUNREACH;
2424 goto out;
2425 }
2426 free_res = 1;
2427
2428 if (res.type == RTN_LOCAL) {
2429 if (!fl.fl4_src)
2430 fl.fl4_src = fl.fl4_dst;
2431 if (dev_out)
2432 dev_put(dev_out);
b40afd0e 2433 dev_out = net->loopback_dev;
1da177e4
LT
2434 dev_hold(dev_out);
2435 fl.oif = dev_out->ifindex;
2436 if (res.fi)
2437 fib_info_put(res.fi);
2438 res.fi = NULL;
2439 flags |= RTCF_LOCAL;
2440 goto make_route;
2441 }
2442
2443#ifdef CONFIG_IP_ROUTE_MULTIPATH
2444 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2445 fib_select_multipath(&fl, &res);
2446 else
2447#endif
2448 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
b40afd0e 2449 fib_select_default(net, &fl, &res);
1da177e4
LT
2450
2451 if (!fl.fl4_src)
2452 fl.fl4_src = FIB_RES_PREFSRC(res);
2453
2454 if (dev_out)
2455 dev_put(dev_out);
2456 dev_out = FIB_RES_DEV(res);
2457 dev_hold(dev_out);
2458 fl.oif = dev_out->ifindex;
2459
2460
2461make_route:
2462 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2463
2464
2465 if (free_res)
2466 fib_res_put(&res);
2467 if (dev_out)
2468 dev_put(dev_out);
2469out: return err;
2470}
2471
611c183e
DL
2472int __ip_route_output_key(struct net *net, struct rtable **rp,
2473 const struct flowi *flp)
1da177e4
LT
2474{
2475 unsigned hash;
2476 struct rtable *rth;
2477
8c7bc840 2478 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
1da177e4
LT
2479
2480 rcu_read_lock_bh();
2481 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
093c2ca4 2482 rth = rcu_dereference(rth->u.dst.rt_next)) {
1da177e4
LT
2483 if (rth->fl.fl4_dst == flp->fl4_dst &&
2484 rth->fl.fl4_src == flp->fl4_src &&
2485 rth->fl.iif == 0 &&
2486 rth->fl.oif == flp->oif &&
47dcf0cb 2487 rth->fl.mark == flp->mark &&
1da177e4 2488 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
b5921910 2489 (IPTOS_RT_MASK | RTO_ONLINK)) &&
878628fb 2490 net_eq(dev_net(rth->u.dst.dev), net) &&
29e75252 2491 rth->rt_genid == atomic_read(&rt_genid)) {
03f49f34 2492 dst_use(&rth->u.dst, jiffies);
1da177e4
LT
2493 RT_CACHE_STAT_INC(out_hit);
2494 rcu_read_unlock_bh();
2495 *rp = rth;
2496 return 0;
2497 }
2498 RT_CACHE_STAT_INC(out_hlist_search);
2499 }
2500 rcu_read_unlock_bh();
2501
611c183e 2502 return ip_route_output_slow(net, rp, flp);
1da177e4
LT
2503}
2504
d8c97a94
ACM
2505EXPORT_SYMBOL_GPL(__ip_route_output_key);
2506
14e50e57
DM
2507static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2508{
2509}
2510
2511static struct dst_ops ipv4_dst_blackhole_ops = {
2512 .family = AF_INET,
2513 .protocol = __constant_htons(ETH_P_IP),
2514 .destroy = ipv4_dst_destroy,
2515 .check = ipv4_dst_check,
2516 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2517 .entry_size = sizeof(struct rtable),
e2422970 2518 .entries = ATOMIC_INIT(0),
14e50e57
DM
2519};
2520
2521
ce259990 2522static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
14e50e57
DM
2523{
2524 struct rtable *ort = *rp;
2525 struct rtable *rt = (struct rtable *)
2526 dst_alloc(&ipv4_dst_blackhole_ops);
2527
2528 if (rt) {
2529 struct dst_entry *new = &rt->u.dst;
2530
2531 atomic_set(&new->__refcnt, 1);
2532 new->__use = 1;
352e512c
HX
2533 new->input = dst_discard;
2534 new->output = dst_discard;
14e50e57
DM
2535 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2536
2537 new->dev = ort->u.dst.dev;
2538 if (new->dev)
2539 dev_hold(new->dev);
2540
2541 rt->fl = ort->fl;
2542
2543 rt->idev = ort->idev;
2544 if (rt->idev)
2545 in_dev_hold(rt->idev);
29e75252 2546 rt->rt_genid = atomic_read(&rt_genid);
14e50e57
DM
2547 rt->rt_flags = ort->rt_flags;
2548 rt->rt_type = ort->rt_type;
2549 rt->rt_dst = ort->rt_dst;
2550 rt->rt_src = ort->rt_src;
2551 rt->rt_iif = ort->rt_iif;
2552 rt->rt_gateway = ort->rt_gateway;
2553 rt->rt_spec_dst = ort->rt_spec_dst;
2554 rt->peer = ort->peer;
2555 if (rt->peer)
2556 atomic_inc(&rt->peer->refcnt);
2557
2558 dst_free(new);
2559 }
2560
2561 dst_release(&(*rp)->u.dst);
2562 *rp = rt;
2563 return (rt ? 0 : -ENOMEM);
2564}
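/*
 * Note (sketch of intent): when __xfrm_lookup() in ip_route_output_flow()
 * below returns -EREMOTE, the caller's route is swapped for this blackhole
 * copy; its input/output handlers are dst_discard, so packets are silently
 * dropped rather than blocking while IPsec state is still being resolved.
 */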
2565
f1b050bf
DL
2566int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2567 struct sock *sk, int flags)
1da177e4
LT
2568{
2569 int err;
2570
f1b050bf 2571 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
1da177e4
LT
2572 return err;
2573
2574 if (flp->proto) {
2575 if (!flp->fl4_src)
2576 flp->fl4_src = (*rp)->rt_src;
2577 if (!flp->fl4_dst)
2578 flp->fl4_dst = (*rp)->rt_dst;
bb72845e
HX
2579 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2580 flags ? XFRM_LOOKUP_WAIT : 0);
14e50e57 2581 if (err == -EREMOTE)
ce259990 2582 err = ipv4_dst_blackhole(rp, flp);
14e50e57
DM
2583
2584 return err;
1da177e4
LT
2585 }
2586
2587 return 0;
2588}
2589
d8c97a94
ACM
2590EXPORT_SYMBOL_GPL(ip_route_output_flow);
2591
f206351a 2592int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
1da177e4 2593{
f206351a 2594 return ip_route_output_flow(net, rp, flp, NULL, 0);
1da177e4
LT
2595}
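/*
 * Usage sketch (illustrative; names such as bound_ifindex are placeholders):
 * a typical caller fills a flow key and asks for an output route, e.g.
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
 *						 .saddr = src,
 *						 .tos = tos } },
 *			    .oif = bound_ifindex };
 *	struct rtable *rt;
 *	err = ip_route_output_key(net, &rt, &fl);
 *
 * The cache hash in __ip_route_output_key() is keyed on (fl4_dst, fl4_src,
 * oif); tos, mark, namespace and rt_genid are then compared while walking
 * the chain.
 */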
2596
2597static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
b6544c0b 2598 int nowait, unsigned int flags)
1da177e4 2599{
ee6b9673 2600 struct rtable *rt = skb->rtable;
1da177e4 2601 struct rtmsg *r;
be403ea1 2602 struct nlmsghdr *nlh;
e3703b3d
TG
2603 long expires;
2604 u32 id = 0, ts = 0, tsage = 0, error;
be403ea1
TG
2605
2606 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2607 if (nlh == NULL)
26932566 2608 return -EMSGSIZE;
be403ea1
TG
2609
2610 r = nlmsg_data(nlh);
1da177e4
LT
2611 r->rtm_family = AF_INET;
2612 r->rtm_dst_len = 32;
2613 r->rtm_src_len = 0;
2614 r->rtm_tos = rt->fl.fl4_tos;
2615 r->rtm_table = RT_TABLE_MAIN;
be403ea1 2616 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
1da177e4
LT
2617 r->rtm_type = rt->rt_type;
2618 r->rtm_scope = RT_SCOPE_UNIVERSE;
2619 r->rtm_protocol = RTPROT_UNSPEC;
2620 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2621 if (rt->rt_flags & RTCF_NOTIFY)
2622 r->rtm_flags |= RTM_F_NOTIFY;
be403ea1 2623
17fb2c64 2624 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
be403ea1 2625
1da177e4
LT
2626 if (rt->fl.fl4_src) {
2627 r->rtm_src_len = 32;
17fb2c64 2628 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
1da177e4
LT
2629 }
2630 if (rt->u.dst.dev)
be403ea1 2631 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
1da177e4
LT
2632#ifdef CONFIG_NET_CLS_ROUTE
2633 if (rt->u.dst.tclassid)
be403ea1 2634 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
1da177e4
LT
2635#endif
2636 if (rt->fl.iif)
17fb2c64 2637 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
1da177e4 2638 else if (rt->rt_src != rt->fl.fl4_src)
17fb2c64 2639 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
be403ea1 2640
1da177e4 2641 if (rt->rt_dst != rt->rt_gateway)
17fb2c64 2642 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
be403ea1 2643
1da177e4 2644 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
be403ea1
TG
2645 goto nla_put_failure;
2646
e3703b3d
TG
2647 error = rt->u.dst.error;
2648 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
1da177e4 2649 if (rt->peer) {
e3703b3d 2650 id = rt->peer->ip_id_count;
1da177e4 2651 if (rt->peer->tcp_ts_stamp) {
e3703b3d 2652 ts = rt->peer->tcp_ts;
9d729f72 2653 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
1da177e4
LT
2654 }
2655 }
be403ea1 2656
1da177e4
LT
2657 if (rt->fl.iif) {
2658#ifdef CONFIG_IP_MROUTE
e448515c 2659 __be32 dst = rt->rt_dst;
1da177e4 2660
f97c1e0c 2661 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
586f1211 2662 IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
1da177e4
LT
2663 int err = ipmr_get_route(skb, r, nowait);
2664 if (err <= 0) {
2665 if (!nowait) {
2666 if (err == 0)
2667 return 0;
be403ea1 2668 goto nla_put_failure;
1da177e4
LT
2669 } else {
2670 if (err == -EMSGSIZE)
be403ea1 2671 goto nla_put_failure;
e3703b3d 2672 error = err;
1da177e4
LT
2673 }
2674 }
2675 } else
2676#endif
be403ea1 2677 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
1da177e4
LT
2678 }
2679
e3703b3d
TG
2680 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2681 expires, error) < 0)
2682 goto nla_put_failure;
be403ea1
TG
2683
2684 return nlmsg_end(skb, nlh);
1da177e4 2685
be403ea1 2686nla_put_failure:
26932566
PM
2687 nlmsg_cancel(skb, nlh);
2688 return -EMSGSIZE;
1da177e4
LT
2689}
2690
63f3444f 2691static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1da177e4 2692{
3b1e0a65 2693 struct net *net = sock_net(in_skb->sk);
d889ce3b
TG
2694 struct rtmsg *rtm;
2695 struct nlattr *tb[RTA_MAX+1];
1da177e4 2696 struct rtable *rt = NULL;
9e12bb22
AV
2697 __be32 dst = 0;
2698 __be32 src = 0;
2699 u32 iif;
d889ce3b 2700 int err;
1da177e4
LT
2701 struct sk_buff *skb;
2702
d889ce3b
TG
2703 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2704 if (err < 0)
2705 goto errout;
2706
2707 rtm = nlmsg_data(nlh);
2708
1da177e4 2709 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
d889ce3b
TG
2710 if (skb == NULL) {
2711 err = -ENOBUFS;
2712 goto errout;
2713 }
1da177e4
LT
2714
2715 /* Reserve room for dummy headers; this skb can pass
2716 through a good chunk of the routing engine.
2717 */
459a98ed 2718 skb_reset_mac_header(skb);
c1d2bbe1 2719 skb_reset_network_header(skb);
d2c962b8
SH
2720
2721 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
eddc9ec5 2722 ip_hdr(skb)->protocol = IPPROTO_ICMP;
1da177e4
LT
2723 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2724
17fb2c64
AV
2725 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2726 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
d889ce3b 2727 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
1da177e4
LT
2728
2729 if (iif) {
d889ce3b
TG
2730 struct net_device *dev;
2731
1937504d 2732 dev = __dev_get_by_index(net, iif);
d889ce3b
TG
2733 if (dev == NULL) {
2734 err = -ENODEV;
2735 goto errout_free;
2736 }
2737
1da177e4
LT
2738 skb->protocol = htons(ETH_P_IP);
2739 skb->dev = dev;
2740 local_bh_disable();
2741 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2742 local_bh_enable();
d889ce3b 2743
ee6b9673 2744 rt = skb->rtable;
d889ce3b 2745 if (err == 0 && rt->u.dst.error)
1da177e4
LT
2746 err = -rt->u.dst.error;
2747 } else {
d889ce3b
TG
2748 struct flowi fl = {
2749 .nl_u = {
2750 .ip4_u = {
2751 .daddr = dst,
2752 .saddr = src,
2753 .tos = rtm->rtm_tos,
2754 },
2755 },
2756 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2757 };
1937504d 2758 err = ip_route_output_key(net, &rt, &fl);
1da177e4 2759 }
d889ce3b 2760
1da177e4 2761 if (err)
d889ce3b 2762 goto errout_free;
1da177e4 2763
ee6b9673 2764 skb->rtable = rt;
1da177e4
LT
2765 if (rtm->rtm_flags & RTM_F_NOTIFY)
2766 rt->rt_flags |= RTCF_NOTIFY;
2767
1da177e4 2768 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
1937504d 2769 RTM_NEWROUTE, 0, 0);
d889ce3b
TG
2770 if (err <= 0)
2771 goto errout_free;
1da177e4 2772
1937504d 2773 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
d889ce3b 2774errout:
2942e900 2775 return err;
1da177e4 2776
d889ce3b 2777errout_free:
1da177e4 2778 kfree_skb(skb);
d889ce3b 2779 goto errout;
1da177e4
LT
2780}
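/*
 * Illustrative use: "ip route get 198.51.100.1 from 192.0.2.1 iif eth0"
 * sends an RTM_GETROUTE request; RTA_DST, RTA_SRC, RTA_IIF and RTA_OIF are
 * parsed above and the resolved entry is returned via rt_fill_info().
 */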
2781
2782int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2783{
2784 struct rtable *rt;
2785 int h, s_h;
2786 int idx, s_idx;
1937504d
DL
2787 struct net *net;
2788
3b1e0a65 2789 net = sock_net(skb->sk);
1da177e4
LT
2790
2791 s_h = cb->args[0];
d8c92830
ED
2792 if (s_h < 0)
2793 s_h = 0;
1da177e4 2794 s_idx = idx = cb->args[1];
d8c92830 2795 for (h = s_h; h <= rt_hash_mask; h++) {
1da177e4
LT
2796 rcu_read_lock_bh();
2797 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
093c2ca4 2798 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
878628fb 2799 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
1da177e4 2800 continue;
29e75252
ED
2801 if (rt->rt_genid != atomic_read(&rt_genid))
2802 continue;
1da177e4
LT
2803 skb->dst = dst_clone(&rt->u.dst);
2804 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
e905a9ed 2805 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
b6544c0b 2806 1, NLM_F_MULTI) <= 0) {
1da177e4
LT
2807 dst_release(xchg(&skb->dst, NULL));
2808 rcu_read_unlock_bh();
2809 goto done;
2810 }
2811 dst_release(xchg(&skb->dst, NULL));
2812 }
2813 rcu_read_unlock_bh();
d8c92830 2814 s_idx = 0;
1da177e4
LT
2815 }
2816
2817done:
2818 cb->args[0] = h;
2819 cb->args[1] = idx;
2820 return skb->len;
2821}
2822
2823void ip_rt_multicast_event(struct in_device *in_dev)
2824{
2825 rt_cache_flush(0);
2826}
2827
2828#ifdef CONFIG_SYSCTL
2829static int flush_delay;
2830
2831static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2832 struct file *filp, void __user *buffer,
2833 size_t *lenp, loff_t *ppos)
2834{
2835 if (write) {
2836 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2837 rt_cache_flush(flush_delay);
2838 return 0;
e905a9ed 2839 }
1da177e4
LT
2840
2841 return -EINVAL;
2842}
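/*
 * User-visible behaviour (sketch, assuming the standard procfs mount):
 * writing an integer N to /proc/sys/net/ipv4/route/flush lands here with
 * flush_delay == N and calls rt_cache_flush(N); the entry is declared
 * write-only (mode 0200) in the table below, and the handler rejects reads.
 */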
2843
2844static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2845 int __user *name,
2846 int nlen,
2847 void __user *oldval,
2848 size_t __user *oldlenp,
2849 void __user *newval,
1f29bcd7 2850 size_t newlen)
1da177e4
LT
2851{
2852 int delay;
2853 if (newlen != sizeof(int))
2854 return -EINVAL;
2855 if (get_user(delay, (int __user *)newval))
e905a9ed
YH
2856 return -EFAULT;
2857 rt_cache_flush(delay);
1da177e4
LT
2858 return 0;
2859}
2860
2861ctl_table ipv4_route_table[] = {
e905a9ed 2862 {
1da177e4
LT
2863 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2864 .procname = "flush",
2865 .data = &flush_delay,
2866 .maxlen = sizeof(int),
7e3e0360 2867 .mode = 0200,
1da177e4
LT
2868 .proc_handler = &ipv4_sysctl_rtcache_flush,
2869 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2870 },
1da177e4
LT
2871 {
2872 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2873 .procname = "gc_thresh",
2874 .data = &ipv4_dst_ops.gc_thresh,
2875 .maxlen = sizeof(int),
2876 .mode = 0644,
2877 .proc_handler = &proc_dointvec,
2878 },
2879 {
2880 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2881 .procname = "max_size",
2882 .data = &ip_rt_max_size,
2883 .maxlen = sizeof(int),
2884 .mode = 0644,
2885 .proc_handler = &proc_dointvec,
2886 },
2887 {
2888 /* Deprecated. Use gc_min_interval_ms */
e905a9ed 2889
1da177e4
LT
2890 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2891 .procname = "gc_min_interval",
2892 .data = &ip_rt_gc_min_interval,
2893 .maxlen = sizeof(int),
2894 .mode = 0644,
2895 .proc_handler = &proc_dointvec_jiffies,
2896 .strategy = &sysctl_jiffies,
2897 },
2898 {
2899 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2900 .procname = "gc_min_interval_ms",
2901 .data = &ip_rt_gc_min_interval,
2902 .maxlen = sizeof(int),
2903 .mode = 0644,
2904 .proc_handler = &proc_dointvec_ms_jiffies,
2905 .strategy = &sysctl_ms_jiffies,
2906 },
2907 {
2908 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2909 .procname = "gc_timeout",
2910 .data = &ip_rt_gc_timeout,
2911 .maxlen = sizeof(int),
2912 .mode = 0644,
2913 .proc_handler = &proc_dointvec_jiffies,
2914 .strategy = &sysctl_jiffies,
2915 },
2916 {
2917 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2918 .procname = "gc_interval",
2919 .data = &ip_rt_gc_interval,
2920 .maxlen = sizeof(int),
2921 .mode = 0644,
2922 .proc_handler = &proc_dointvec_jiffies,
2923 .strategy = &sysctl_jiffies,
2924 },
2925 {
2926 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2927 .procname = "redirect_load",
2928 .data = &ip_rt_redirect_load,
2929 .maxlen = sizeof(int),
2930 .mode = 0644,
2931 .proc_handler = &proc_dointvec,
2932 },
2933 {
2934 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2935 .procname = "redirect_number",
2936 .data = &ip_rt_redirect_number,
2937 .maxlen = sizeof(int),
2938 .mode = 0644,
2939 .proc_handler = &proc_dointvec,
2940 },
2941 {
2942 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2943 .procname = "redirect_silence",
2944 .data = &ip_rt_redirect_silence,
2945 .maxlen = sizeof(int),
2946 .mode = 0644,
2947 .proc_handler = &proc_dointvec,
2948 },
2949 {
2950 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2951 .procname = "error_cost",
2952 .data = &ip_rt_error_cost,
2953 .maxlen = sizeof(int),
2954 .mode = 0644,
2955 .proc_handler = &proc_dointvec,
2956 },
2957 {
2958 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2959 .procname = "error_burst",
2960 .data = &ip_rt_error_burst,
2961 .maxlen = sizeof(int),
2962 .mode = 0644,
2963 .proc_handler = &proc_dointvec,
2964 },
2965 {
2966 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2967 .procname = "gc_elasticity",
2968 .data = &ip_rt_gc_elasticity,
2969 .maxlen = sizeof(int),
2970 .mode = 0644,
2971 .proc_handler = &proc_dointvec,
2972 },
2973 {
2974 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2975 .procname = "mtu_expires",
2976 .data = &ip_rt_mtu_expires,
2977 .maxlen = sizeof(int),
2978 .mode = 0644,
2979 .proc_handler = &proc_dointvec_jiffies,
2980 .strategy = &sysctl_jiffies,
2981 },
2982 {
2983 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2984 .procname = "min_pmtu",
2985 .data = &ip_rt_min_pmtu,
2986 .maxlen = sizeof(int),
2987 .mode = 0644,
2988 .proc_handler = &proc_dointvec,
2989 },
2990 {
2991 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2992 .procname = "min_adv_mss",
2993 .data = &ip_rt_min_advmss,
2994 .maxlen = sizeof(int),
2995 .mode = 0644,
2996 .proc_handler = &proc_dointvec,
2997 },
2998 {
2999 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3000 .procname = "secret_interval",
3001 .data = &ip_rt_secret_interval,
3002 .maxlen = sizeof(int),
3003 .mode = 0644,
3004 .proc_handler = &proc_dointvec_jiffies,
3005 .strategy = &sysctl_jiffies,
3006 },
3007 { .ctl_name = 0 }
3008};
3009#endif
3010
3011#ifdef CONFIG_NET_CLS_ROUTE
8dbde28d 3012struct ip_rt_acct *ip_rt_acct __read_mostly;
1da177e4
LT
3013#endif /* CONFIG_NET_CLS_ROUTE */
3014
3015static __initdata unsigned long rhash_entries;
3016static int __init set_rhash_entries(char *str)
3017{
3018 if (!str)
3019 return 0;
3020 rhash_entries = simple_strtoul(str, &str, 0);
3021 return 1;
3022}
3023__setup("rhash_entries=", set_rhash_entries);
3024
3025int __init ip_rt_init(void)
3026{
424c4b70 3027 int rc = 0;
1da177e4 3028
29e75252
ED
3029 atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
3030 (jiffies ^ (jiffies >> 7))));
1da177e4
LT
3031
3032#ifdef CONFIG_NET_CLS_ROUTE
8dbde28d 3033 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
1da177e4
LT
3034 if (!ip_rt_acct)
3035 panic("IP: failed to allocate ip_rt_acct\n");
1da177e4
LT
3036#endif
3037
e5d679f3
AD
3038 ipv4_dst_ops.kmem_cachep =
3039 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 3040 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 3041
14e50e57
DM
3042 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3043
424c4b70
ED
3044 rt_hash_table = (struct rt_hash_bucket *)
3045 alloc_large_system_hash("IP route cache",
3046 sizeof(struct rt_hash_bucket),
3047 rhash_entries,
3048 (num_physpages >= 128 * 1024) ?
18955cfc 3049 15 : 17,
8d1502de 3050 0,
424c4b70
ED
3051 &rt_hash_log,
3052 &rt_hash_mask,
3053 0);
22c047cc
ED
3054 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3055 rt_hash_lock_init();
1da177e4
LT
3056
3057 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3058 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3059
1da177e4
LT
3060 devinet_init();
3061 ip_fib_init();
3062
b24b8a24 3063 setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
1da177e4
LT
3064
3065 /* All the timers started at system startup tend
3066 to synchronize. Perturb it a bit.
3067 */
39c90ece
ED
3068 schedule_delayed_work(&expires_work,
3069 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
1da177e4
LT
3070
3071 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3072 ip_rt_secret_interval;
3073 add_timer(&rt_secret_timer);
3074
73b38711 3075 if (ip_rt_proc_init())
107f1634 3076 printk(KERN_ERR "Unable to create route proc files\n");
1da177e4
LT
3077#ifdef CONFIG_XFRM
3078 xfrm_init();
3079 xfrm4_init();
3080#endif
63f3444f
TG
3081 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3082
1da177e4
LT
3083 return rc;
3084}
3085
3086EXPORT_SYMBOL(__ip_select_ident);
3087EXPORT_SYMBOL(ip_route_input);
3088EXPORT_SYMBOL(ip_route_output_key);