]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/ipv4/route.c
netns: make rt_secret_rebuild timer per namespace
[net-next-2.6.git] / net / ipv4 / route.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
e905a9ed 21 * Alan Cox : Super /proc >4K
1da177e4
LT
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
e905a9ed 39 *
1da177e4
LT
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
1da177e4
LT
65#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
1da177e4 71#include <linux/mm.h>
424c4b70 72#include <linux/bootmem.h>
1da177e4
LT
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
39c90ece 82#include <linux/workqueue.h>
1da177e4 83#include <linux/skbuff.h>
1da177e4
LT
84#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
352e512c 93#include <net/dst.h>
457c4cbc 94#include <net/net_namespace.h>
1da177e4
LT
95#include <net/protocol.h>
96#include <net/ip.h>
97#include <net/route.h>
98#include <net/inetpeer.h>
99#include <net/sock.h>
100#include <net/ip_fib.h>
101#include <net/arp.h>
102#include <net/tcp.h>
103#include <net/icmp.h>
104#include <net/xfrm.h>
8d71740c 105#include <net/netevent.h>
63f3444f 106#include <net/rtnetlink.h>
1da177e4
LT
107#ifdef CONFIG_SYSCTL
108#include <linux/sysctl.h>
109#endif
110
/*
 * Extract the TOS bits that matter for routing (plus the RTO_ONLINK flag)
 * from a flow key.  The argument is parenthesized so that any pointer
 * expression, not only a plain identifier, expands correctly.
 */
#define RT_FL_TOS(oldflp) \
	((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

/* Largest MTU value used by this file. */
#define IP_MAX_MTU	0xFFF0

/* Default route cache garbage collection timeout: 300 * HZ jiffies. */
#define RT_GC_TIMEOUT (300*HZ)
117
1da177e4 118static int ip_rt_max_size;
817bc4db
SH
119static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
120static int ip_rt_gc_interval __read_mostly = 60 * HZ;
121static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
122static int ip_rt_redirect_number __read_mostly = 9;
123static int ip_rt_redirect_load __read_mostly = HZ / 50;
124static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125static int ip_rt_error_cost __read_mostly = HZ;
126static int ip_rt_error_burst __read_mostly = 5 * HZ;
127static int ip_rt_gc_elasticity __read_mostly = 8;
128static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
129static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
130static int ip_rt_min_advmss __read_mostly = 256;
131static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
1da177e4 132
beb659bd
ED
133static void rt_worker_func(struct work_struct *work);
134static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
1da177e4
LT
135
136/*
137 * Interface to generic destination cache.
138 */
139
140static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
141static void ipv4_dst_destroy(struct dst_entry *dst);
142static void ipv4_dst_ifdown(struct dst_entry *dst,
143 struct net_device *dev, int how);
144static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
145static void ipv4_link_failure(struct sk_buff *skb);
146static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
569d3645 147static int rt_garbage_collect(struct dst_ops *ops);
1da177e4
LT
148
149
150static struct dst_ops ipv4_dst_ops = {
151 .family = AF_INET,
152 .protocol = __constant_htons(ETH_P_IP),
153 .gc = rt_garbage_collect,
154 .check = ipv4_dst_check,
155 .destroy = ipv4_dst_destroy,
156 .ifdown = ipv4_dst_ifdown,
157 .negative_advice = ipv4_negative_advice,
158 .link_failure = ipv4_link_failure,
159 .update_pmtu = ip_rt_update_pmtu,
1ac06e03 160 .local_out = __ip_local_out,
1da177e4 161 .entry_size = sizeof(struct rtable),
e2422970 162 .entries = ATOMIC_INIT(0),
1da177e4
LT
163};
164
165#define ECN_OR_COST(class) TC_PRIO_##class
166
4839c52b 167const __u8 ip_tos2prio[16] = {
1da177e4
LT
168 TC_PRIO_BESTEFFORT,
169 ECN_OR_COST(FILLER),
170 TC_PRIO_BESTEFFORT,
171 ECN_OR_COST(BESTEFFORT),
172 TC_PRIO_BULK,
173 ECN_OR_COST(BULK),
174 TC_PRIO_BULK,
175 ECN_OR_COST(BULK),
176 TC_PRIO_INTERACTIVE,
177 ECN_OR_COST(INTERACTIVE),
178 TC_PRIO_INTERACTIVE,
179 ECN_OR_COST(INTERACTIVE),
180 TC_PRIO_INTERACTIVE_BULK,
181 ECN_OR_COST(INTERACTIVE_BULK),
182 TC_PRIO_INTERACTIVE_BULK,
183 ECN_OR_COST(INTERACTIVE_BULK)
184};
185
186
187/*
188 * Route cache.
189 */
190
191/* The locking scheme is rather straightforward:
192 *
193 * 1) Read-Copy Update protects the buckets of the central route hash.
194 * 2) Only writers remove entries, and they hold the lock
195 * as they look at rtable reference counts.
196 * 3) Only readers acquire references to rtable entries,
197 * they do so with atomic increments and with the
198 * lock held.
199 */
200
/*
 * One slot of the route cache hash table: the head of a singly linked
 * chain of rtable entries (linked through u.dst.rt_next).
 */
struct rt_hash_bucket {
	struct rtable	*chain;
};
8a25d5de
IM
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Hash chains are guarded by a shared table of spinlocks instead of one
 * spinlock per rt_hash_bucket.  The table size is a power of two chosen
 * from NR_CPUS, so a bucket index maps to its lock by simple masking.
 * Under lockdep spinlock_t is quite big, so the table is kept small.
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

/*
 * Allocate the lock table and initialize every lock in it.  Allocation
 * failure is fatal: the function panics rather than returning an error.
 */
static __init void rt_hash_lock_init(void)
{
	int i = 0;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	while (i < RT_HASH_LOCK_SZ) {
		spin_lock_init(&rt_hash_locks[i]);
		i++;
	}
}
#else
/* No lock table in this configuration: every bucket maps to NULL. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
1da177e4 249
817bc4db
SH
250static struct rt_hash_bucket *rt_hash_table __read_mostly;
251static unsigned rt_hash_mask __read_mostly;
252static unsigned int rt_hash_log __read_mostly;
253static atomic_t rt_genid __read_mostly;
1da177e4 254
2f970d83 255static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
dbd2915c 256#define RT_CACHE_STAT_INC(field) \
bfe5d834 257 (__raw_get_cpu_var(rt_cache_stat).field++)
1da177e4 258
1294fc4a 259static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
1da177e4 260{
1294fc4a
SH
261 return jhash_3words((__force u32)(__be32)(daddr),
262 (__force u32)(__be32)(saddr),
263 idx, atomic_read(&rt_genid))
29e75252 264 & rt_hash_mask;
1da177e4
LT
265}
266
267#ifdef CONFIG_PROC_FS
268struct rt_cache_iter_state {
a75e936f 269 struct seq_net_private p;
1da177e4 270 int bucket;
29e75252 271 int genid;
1da177e4
LT
272};
273
1218854a 274static struct rtable *rt_cache_get_first(struct seq_file *seq)
1da177e4 275{
1218854a 276 struct rt_cache_iter_state *st = seq->private;
1da177e4 277 struct rtable *r = NULL;
1da177e4
LT
278
279 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
280 rcu_read_lock_bh();
29e75252
ED
281 r = rcu_dereference(rt_hash_table[st->bucket].chain);
282 while (r) {
1218854a 283 if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
a75e936f 284 r->rt_genid == st->genid)
29e75252
ED
285 return r;
286 r = rcu_dereference(r->u.dst.rt_next);
287 }
1da177e4
LT
288 rcu_read_unlock_bh();
289 }
29e75252 290 return r;
1da177e4
LT
291}
292
1218854a 293static struct rtable *__rt_cache_get_next(struct seq_file *seq,
642d6318 294 struct rtable *r)
1da177e4 295{
1218854a 296 struct rt_cache_iter_state *st = seq->private;
093c2ca4 297 r = r->u.dst.rt_next;
1da177e4
LT
298 while (!r) {
299 rcu_read_unlock_bh();
300 if (--st->bucket < 0)
301 break;
302 rcu_read_lock_bh();
303 r = rt_hash_table[st->bucket].chain;
304 }
0bcceadc 305 return rcu_dereference(r);
1da177e4
LT
306}
307
1218854a 308static struct rtable *rt_cache_get_next(struct seq_file *seq,
642d6318
DL
309 struct rtable *r)
310{
1218854a
YH
311 struct rt_cache_iter_state *st = seq->private;
312 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
313 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
a75e936f 314 continue;
642d6318
DL
315 if (r->rt_genid == st->genid)
316 break;
317 }
318 return r;
319}
320
1218854a 321static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
1da177e4 322{
1218854a 323 struct rtable *r = rt_cache_get_first(seq);
1da177e4
LT
324
325 if (r)
1218854a 326 while (pos && (r = rt_cache_get_next(seq, r)))
1da177e4
LT
327 --pos;
328 return pos ? NULL : r;
329}
330
331static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
332{
29e75252 333 struct rt_cache_iter_state *st = seq->private;
29e75252 334 if (*pos)
1218854a 335 return rt_cache_get_idx(seq, *pos - 1);
29e75252
ED
336 st->genid = atomic_read(&rt_genid);
337 return SEQ_START_TOKEN;
1da177e4
LT
338}
339
340static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
341{
29e75252 342 struct rtable *r;
1da177e4
LT
343
344 if (v == SEQ_START_TOKEN)
1218854a 345 r = rt_cache_get_first(seq);
1da177e4 346 else
1218854a 347 r = rt_cache_get_next(seq, v);
1da177e4
LT
348 ++*pos;
349 return r;
350}
351
352static void rt_cache_seq_stop(struct seq_file *seq, void *v)
353{
354 if (v && v != SEQ_START_TOKEN)
355 rcu_read_unlock_bh();
356}
357
358static int rt_cache_seq_show(struct seq_file *seq, void *v)
359{
360 if (v == SEQ_START_TOKEN)
361 seq_printf(seq, "%-127s\n",
362 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
363 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
364 "HHUptod\tSpecDst");
365 else {
366 struct rtable *r = v;
5e659e4c 367 int len;
1da177e4 368
5e659e4c
PE
369 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
370 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
1da177e4
LT
371 r->u.dst.dev ? r->u.dst.dev->name : "*",
372 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
373 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
374 r->u.dst.__use, 0, (unsigned long)r->rt_src,
375 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
376 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
377 dst_metric(&r->u.dst, RTAX_WINDOW),
378 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
379 dst_metric(&r->u.dst, RTAX_RTTVAR)),
380 r->fl.fl4_tos,
381 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
382 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
383 dev_queue_xmit) : 0,
5e659e4c
PE
384 r->rt_spec_dst, &len);
385
386 seq_printf(seq, "%*s\n", 127 - len, "");
e905a9ed
YH
387 }
388 return 0;
1da177e4
LT
389}
390
f690808e 391static const struct seq_operations rt_cache_seq_ops = {
1da177e4
LT
392 .start = rt_cache_seq_start,
393 .next = rt_cache_seq_next,
394 .stop = rt_cache_seq_stop,
395 .show = rt_cache_seq_show,
396};
397
398static int rt_cache_seq_open(struct inode *inode, struct file *file)
399{
a75e936f 400 return seq_open_net(inode, file, &rt_cache_seq_ops,
cf7732e4 401 sizeof(struct rt_cache_iter_state));
1da177e4
LT
402}
403
9a32144e 404static const struct file_operations rt_cache_seq_fops = {
1da177e4
LT
405 .owner = THIS_MODULE,
406 .open = rt_cache_seq_open,
407 .read = seq_read,
408 .llseek = seq_lseek,
a75e936f 409 .release = seq_release_net,
1da177e4
LT
410};
411
412
413static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
414{
415 int cpu;
416
417 if (*pos == 0)
418 return SEQ_START_TOKEN;
419
420 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
421 if (!cpu_possible(cpu))
422 continue;
423 *pos = cpu+1;
2f970d83 424 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
425 }
426 return NULL;
427}
428
429static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
430{
431 int cpu;
432
433 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
434 if (!cpu_possible(cpu))
435 continue;
436 *pos = cpu+1;
2f970d83 437 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
438 }
439 return NULL;
e905a9ed 440
1da177e4
LT
441}
442
/* seq_file ->stop: the per-CPU iterator holds nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
447
448static int rt_cpu_seq_show(struct seq_file *seq, void *v)
449{
450 struct rt_cache_stat *st = v;
451
452 if (v == SEQ_START_TOKEN) {
5bec0039 453 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
1da177e4
LT
454 return 0;
455 }
e905a9ed 456
1da177e4
LT
457 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
458 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
459 atomic_read(&ipv4_dst_ops.entries),
460 st->in_hit,
461 st->in_slow_tot,
462 st->in_slow_mc,
463 st->in_no_route,
464 st->in_brd,
465 st->in_martian_dst,
466 st->in_martian_src,
467
468 st->out_hit,
469 st->out_slow_tot,
e905a9ed 470 st->out_slow_mc,
1da177e4
LT
471
472 st->gc_total,
473 st->gc_ignored,
474 st->gc_goal_miss,
475 st->gc_dst_overflow,
476 st->in_hlist_search,
477 st->out_hlist_search
478 );
479 return 0;
480}
481
f690808e 482static const struct seq_operations rt_cpu_seq_ops = {
1da177e4
LT
483 .start = rt_cpu_seq_start,
484 .next = rt_cpu_seq_next,
485 .stop = rt_cpu_seq_stop,
486 .show = rt_cpu_seq_show,
487};
488
489
490static int rt_cpu_seq_open(struct inode *inode, struct file *file)
491{
492 return seq_open(file, &rt_cpu_seq_ops);
493}
494
9a32144e 495static const struct file_operations rt_cpu_seq_fops = {
1da177e4
LT
496 .owner = THIS_MODULE,
497 .open = rt_cpu_seq_open,
498 .read = seq_read,
499 .llseek = seq_lseek,
500 .release = seq_release,
501};
502
78c686e9
PE
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Legacy proc read handler for "rt_acct".  The file exposes 256
 * struct ip_rt_acct records; each 32-bit word returned is the sum of
 * that word over all possible CPUs.
 *
 * @offset and @length must both be multiples of 4, otherwise -EIO is
 * returned.  Reads at or beyond the end set *eof; reads crossing the end
 * are truncated.  Returns the number of bytes placed in @buffer.
 */
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	u32 *dst = (u32 *) buffer;
	unsigned int cpu;

	if ((offset & 3) || (length & 3))
		return -EIO;

	if (offset >= sizeof(struct ip_rt_acct) * 256) {
		*eof = 1;
		return 0;
	}

	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
		length = sizeof(struct ip_rt_acct) * 256 - offset;
		*eof = 1;
	}

	/* From here on offset counts 32-bit words, not bytes. */
	offset /= sizeof(u32);

	if (length <= 0)
		return length;

	*start = buffer;
	memset(dst, 0, length);

	for_each_possible_cpu(cpu) {
		u32 *src = ((u32 *) per_cpu_ptr(ip_rt_acct, cpu)) + offset;
		unsigned int j;

		for (j = 0; j < length/4; j++)
			dst[j] += src[j];
	}
	return length;
}
#endif
107f1634 542
73b38711 543static int __net_init ip_rt_do_proc_init(struct net *net)
107f1634
PE
544{
545 struct proc_dir_entry *pde;
546
547 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
548 &rt_cache_seq_fops);
549 if (!pde)
550 goto err1;
551
77020720
WC
552 pde = proc_create("rt_cache", S_IRUGO,
553 net->proc_net_stat, &rt_cpu_seq_fops);
107f1634
PE
554 if (!pde)
555 goto err2;
556
107f1634
PE
557#ifdef CONFIG_NET_CLS_ROUTE
558 pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
559 ip_rt_acct_read, NULL);
560 if (!pde)
561 goto err3;
562#endif
563 return 0;
564
565#ifdef CONFIG_NET_CLS_ROUTE
566err3:
567 remove_proc_entry("rt_cache", net->proc_net_stat);
568#endif
569err2:
570 remove_proc_entry("rt_cache", net->proc_net);
571err1:
572 return -ENOMEM;
573}
73b38711
DL
574
575static void __net_exit ip_rt_do_proc_exit(struct net *net)
576{
577 remove_proc_entry("rt_cache", net->proc_net_stat);
578 remove_proc_entry("rt_cache", net->proc_net);
579 remove_proc_entry("rt_acct", net->proc_net);
580}
581
582static struct pernet_operations ip_rt_proc_ops __net_initdata = {
583 .init = ip_rt_do_proc_init,
584 .exit = ip_rt_do_proc_exit,
585};
586
587static int __init ip_rt_proc_init(void)
588{
589 return register_pernet_subsys(&ip_rt_proc_ops);
590}
591
107f1634 592#else
/* Without CONFIG_PROC_FS there is nothing to register: always succeeds. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
1da177e4 597#endif /* CONFIG_PROC_FS */
e905a9ed 598
5969f71d 599static inline void rt_free(struct rtable *rt)
1da177e4 600{
1da177e4
LT
601 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
602}
603
/*
 * Put the caller's reference on @rt with ip_rt_put(), then queue the
 * entry for freeing after an RCU-bh grace period, exactly as rt_free()
 * does.
 */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	rt_free(rt);
}
609
5969f71d 610static inline int rt_fast_clean(struct rtable *rth)
1da177e4
LT
611{
612 /* Kill broadcast/multicast entries very aggresively, if they
613 collide in hash table with more useful entries */
614 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
093c2ca4 615 rth->fl.iif && rth->u.dst.rt_next;
1da177e4
LT
616}
617
5969f71d 618static inline int rt_valuable(struct rtable *rth)
1da177e4
LT
619{
620 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
621 rth->u.dst.expires;
622}
623
624static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
625{
626 unsigned long age;
627 int ret = 0;
628
629 if (atomic_read(&rth->u.dst.__refcnt))
630 goto out;
631
632 ret = 1;
633 if (rth->u.dst.expires &&
634 time_after_eq(jiffies, rth->u.dst.expires))
635 goto out;
636
637 age = jiffies - rth->u.dst.lastuse;
638 ret = 0;
639 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
640 (age <= tmo2 && rt_valuable(rth)))
641 goto out;
642 ret = 1;
643out: return ret;
644}
645
646/* Bits of score are:
647 * 31: very valuable
648 * 30: not quite useless
649 * 29..0: usage counter
650 */
651static inline u32 rt_score(struct rtable *rt)
652{
653 u32 score = jiffies - rt->u.dst.lastuse;
654
655 score = ~score & ~(3<<30);
656
657 if (rt_valuable(rt))
658 score |= (1<<31);
659
660 if (!rt->fl.iif ||
661 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
662 score |= (1<<30);
663
664 return score;
665}
666
667static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
668{
714e85be
AV
669 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
670 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
47dcf0cb 671 (fl1->mark ^ fl2->mark) |
8238b218
DM
672 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
673 *(u16 *)&fl2->nl_u.ip4_u.tos) |
674 (fl1->oif ^ fl2->oif) |
675 (fl1->iif ^ fl2->iif)) == 0;
1da177e4
LT
676}
677
b5921910
DL
678static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
679{
c346dca1 680 return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
b5921910
DL
681}
682
beb659bd
ED
683/*
684 * Perform a full scan of hash table and free all entries.
685 * Can be called by a softirq or a process.
686 * In the later case, we want to be reschedule if necessary
687 */
688static void rt_do_flush(int process_context)
689{
690 unsigned int i;
691 struct rtable *rth, *next;
692
693 for (i = 0; i <= rt_hash_mask; i++) {
694 if (process_context && need_resched())
695 cond_resched();
696 rth = rt_hash_table[i].chain;
697 if (!rth)
698 continue;
699
700 spin_lock_bh(rt_hash_lock_addr(i));
701 rth = rt_hash_table[i].chain;
702 rt_hash_table[i].chain = NULL;
703 spin_unlock_bh(rt_hash_lock_addr(i));
704
705 for (; rth; rth = next) {
706 next = rth->u.dst.rt_next;
707 rt_free(rth);
708 }
709 }
710}
711
712static void rt_check_expire(void)
1da177e4 713{
bb1d23b0
ED
714 static unsigned int rover;
715 unsigned int i = rover, goal;
1da177e4 716 struct rtable *rth, **rthp;
bb1d23b0
ED
717 u64 mult;
718
719 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
720 if (ip_rt_gc_timeout > 1)
721 do_div(mult, ip_rt_gc_timeout);
722 goal = (unsigned int)mult;
39c90ece
ED
723 if (goal > rt_hash_mask)
724 goal = rt_hash_mask + 1;
bb1d23b0 725 for (; goal > 0; goal--) {
1da177e4
LT
726 unsigned long tmo = ip_rt_gc_timeout;
727
728 i = (i + 1) & rt_hash_mask;
729 rthp = &rt_hash_table[i].chain;
730
d90bf5a9
ED
731 if (need_resched())
732 cond_resched();
733
cfcabdcc 734 if (*rthp == NULL)
bb1d23b0 735 continue;
39c90ece 736 spin_lock_bh(rt_hash_lock_addr(i));
1da177e4 737 while ((rth = *rthp) != NULL) {
29e75252
ED
738 if (rth->rt_genid != atomic_read(&rt_genid)) {
739 *rthp = rth->u.dst.rt_next;
740 rt_free(rth);
741 continue;
742 }
1da177e4
LT
743 if (rth->u.dst.expires) {
744 /* Entry is expired even if it is in use */
39c90ece 745 if (time_before_eq(jiffies, rth->u.dst.expires)) {
1da177e4 746 tmo >>= 1;
093c2ca4 747 rthp = &rth->u.dst.rt_next;
1da177e4
LT
748 continue;
749 }
750 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
751 tmo >>= 1;
093c2ca4 752 rthp = &rth->u.dst.rt_next;
1da177e4
LT
753 continue;
754 }
755
756 /* Cleanup aged off entries. */
093c2ca4 757 *rthp = rth->u.dst.rt_next;
e905a9ed 758 rt_free(rth);
1da177e4 759 }
39c90ece 760 spin_unlock_bh(rt_hash_lock_addr(i));
1da177e4
LT
761 }
762 rover = i;
beb659bd
ED
763}
764
765/*
766 * rt_worker_func() is run in process context.
29e75252 767 * we call rt_check_expire() to scan part of the hash table
beb659bd
ED
768 */
769static void rt_worker_func(struct work_struct *work)
770{
29e75252 771 rt_check_expire();
39c90ece 772 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
1da177e4
LT
773}
774
29e75252
ED
775/*
776 * Pertubation of rt_genid by a small quantity [1..256]
777 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
778 * many times (2^24) without giving recent rt_genid.
779 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
1da177e4 780 */
29e75252 781static void rt_cache_invalidate(void)
1da177e4 782{
29e75252 783 unsigned char shuffle;
1da177e4 784
29e75252
ED
785 get_random_bytes(&shuffle, sizeof(shuffle));
786 atomic_add(shuffle + 1U, &rt_genid);
1da177e4
LT
787}
788
29e75252
ED
/*
 * Invalidate the route cache by changing rt_genid.
 *
 * delay < 0  : invalidate only (fast: entries will be deleted later)
 * delay >= 0 : invalidate and flush the whole table now (can be long)
 *
 * @net is not used by this implementation.
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate();
	if (delay < 0)
		return;
	rt_do_flush(!in_softirq());
}
799
beb659bd 800/*
29e75252 801 * We change rt_genid and let gc do the cleanup
beb659bd 802 */
9f5e97e5 803static void rt_secret_rebuild(unsigned long __net)
1da177e4 804{
9f5e97e5 805 struct net *net = (struct net *)__net;
29e75252 806 rt_cache_invalidate();
9f5e97e5 807 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
1da177e4
LT
808}
809
810/*
811 Short description of GC goals.
812
813 We want to build algorithm, which will keep routing cache
814 at some equilibrium point, when number of aged off entries
815 is kept approximately equal to newly generated ones.
816
817 Current expiration strength is variable "expire".
818 We try to adjust it dynamically, so that if networking
819 is idle expires is large enough to keep enough of warm entries,
820 and when load increases it reduces to limit cache size.
821 */
822
569d3645 823static int rt_garbage_collect(struct dst_ops *ops)
1da177e4
LT
824{
825 static unsigned long expire = RT_GC_TIMEOUT;
826 static unsigned long last_gc;
827 static int rover;
828 static int equilibrium;
829 struct rtable *rth, **rthp;
830 unsigned long now = jiffies;
831 int goal;
832
833 /*
834 * Garbage collection is pretty expensive,
835 * do not make it too frequently.
836 */
837
838 RT_CACHE_STAT_INC(gc_total);
839
840 if (now - last_gc < ip_rt_gc_min_interval &&
841 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
842 RT_CACHE_STAT_INC(gc_ignored);
843 goto out;
844 }
845
846 /* Calculate number of entries, which we want to expire now. */
847 goal = atomic_read(&ipv4_dst_ops.entries) -
848 (ip_rt_gc_elasticity << rt_hash_log);
849 if (goal <= 0) {
850 if (equilibrium < ipv4_dst_ops.gc_thresh)
851 equilibrium = ipv4_dst_ops.gc_thresh;
852 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
853 if (goal > 0) {
b790cedd 854 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1da177e4
LT
855 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
856 }
857 } else {
858 /* We are in dangerous area. Try to reduce cache really
859 * aggressively.
860 */
b790cedd 861 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1da177e4
LT
862 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
863 }
864
865 if (now - last_gc >= ip_rt_gc_min_interval)
866 last_gc = now;
867
868 if (goal <= 0) {
869 equilibrium += goal;
870 goto work_done;
871 }
872
873 do {
874 int i, k;
875
876 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
877 unsigned long tmo = expire;
878
879 k = (k + 1) & rt_hash_mask;
880 rthp = &rt_hash_table[k].chain;
22c047cc 881 spin_lock_bh(rt_hash_lock_addr(k));
1da177e4 882 while ((rth = *rthp) != NULL) {
29e75252
ED
883 if (rth->rt_genid == atomic_read(&rt_genid) &&
884 !rt_may_expire(rth, tmo, expire)) {
1da177e4 885 tmo >>= 1;
093c2ca4 886 rthp = &rth->u.dst.rt_next;
1da177e4
LT
887 continue;
888 }
093c2ca4 889 *rthp = rth->u.dst.rt_next;
1da177e4
LT
890 rt_free(rth);
891 goal--;
1da177e4 892 }
22c047cc 893 spin_unlock_bh(rt_hash_lock_addr(k));
1da177e4
LT
894 if (goal <= 0)
895 break;
896 }
897 rover = k;
898
899 if (goal <= 0)
900 goto work_done;
901
902 /* Goal is not achieved. We stop process if:
903
904 - if expire reduced to zero. Otherwise, expire is halfed.
905 - if table is not full.
906 - if we are called from interrupt.
907 - jiffies check is just fallback/debug loop breaker.
908 We will not spin here for long time in any case.
909 */
910
911 RT_CACHE_STAT_INC(gc_goal_miss);
912
913 if (expire == 0)
914 break;
915
916 expire >>= 1;
917#if RT_CACHE_DEBUG >= 2
918 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
919 atomic_read(&ipv4_dst_ops.entries), goal, i);
920#endif
921
922 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
923 goto out;
924 } while (!in_softirq() && time_before_eq(jiffies, now));
925
926 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
927 goto out;
928 if (net_ratelimit())
929 printk(KERN_WARNING "dst cache overflow\n");
930 RT_CACHE_STAT_INC(gc_dst_overflow);
931 return 1;
932
933work_done:
934 expire += ip_rt_gc_min_interval;
935 if (expire > ip_rt_gc_timeout ||
936 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
937 expire = ip_rt_gc_timeout;
938#if RT_CACHE_DEBUG >= 2
939 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
940 atomic_read(&ipv4_dst_ops.entries), goal, rover);
941#endif
942out: return 0;
943}
944
945static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
946{
947 struct rtable *rth, **rthp;
948 unsigned long now;
949 struct rtable *cand, **candp;
950 u32 min_score;
951 int chain_length;
952 int attempts = !in_softirq();
953
954restart:
955 chain_length = 0;
956 min_score = ~(u32)0;
957 cand = NULL;
958 candp = NULL;
959 now = jiffies;
960
961 rthp = &rt_hash_table[hash].chain;
962
22c047cc 963 spin_lock_bh(rt_hash_lock_addr(hash));
1da177e4 964 while ((rth = *rthp) != NULL) {
29e75252
ED
965 if (rth->rt_genid != atomic_read(&rt_genid)) {
966 *rthp = rth->u.dst.rt_next;
967 rt_free(rth);
968 continue;
969 }
b5921910 970 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1da177e4 971 /* Put it first */
093c2ca4 972 *rthp = rth->u.dst.rt_next;
1da177e4
LT
973 /*
974 * Since lookup is lockfree, the deletion
975 * must be visible to another weakly ordered CPU before
976 * the insertion at the start of the hash chain.
977 */
093c2ca4 978 rcu_assign_pointer(rth->u.dst.rt_next,
1da177e4
LT
979 rt_hash_table[hash].chain);
980 /*
981 * Since lookup is lockfree, the update writes
982 * must be ordered for consistency on SMP.
983 */
984 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
985
03f49f34 986 dst_use(&rth->u.dst, now);
22c047cc 987 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
988
989 rt_drop(rt);
990 *rp = rth;
991 return 0;
992 }
993
994 if (!atomic_read(&rth->u.dst.__refcnt)) {
995 u32 score = rt_score(rth);
996
997 if (score <= min_score) {
998 cand = rth;
999 candp = rthp;
1000 min_score = score;
1001 }
1002 }
1003
1004 chain_length++;
1005
093c2ca4 1006 rthp = &rth->u.dst.rt_next;
1da177e4
LT
1007 }
1008
1009 if (cand) {
1010 /* ip_rt_gc_elasticity used to be average length of chain
1011 * length, when exceeded gc becomes really aggressive.
1012 *
1013 * The second limit is less certain. At the moment it allows
1014 * only 2 entries per bucket. We will see.
1015 */
1016 if (chain_length > ip_rt_gc_elasticity) {
093c2ca4 1017 *candp = cand->u.dst.rt_next;
1da177e4
LT
1018 rt_free(cand);
1019 }
1020 }
1021
1022 /* Try to bind route to arp only if it is output
1023 route or unicast forwarding path.
1024 */
1025 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1026 int err = arp_bind_neighbour(&rt->u.dst);
1027 if (err) {
22c047cc 1028 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1029
1030 if (err != -ENOBUFS) {
1031 rt_drop(rt);
1032 return err;
1033 }
1034
1035 /* Neighbour tables are full and nothing
1036 can be released. Try to shrink route cache,
1037 it is most likely it holds some neighbour records.
1038 */
1039 if (attempts-- > 0) {
1040 int saved_elasticity = ip_rt_gc_elasticity;
1041 int saved_int = ip_rt_gc_min_interval;
1042 ip_rt_gc_elasticity = 1;
1043 ip_rt_gc_min_interval = 0;
569d3645 1044 rt_garbage_collect(&ipv4_dst_ops);
1da177e4
LT
1045 ip_rt_gc_min_interval = saved_int;
1046 ip_rt_gc_elasticity = saved_elasticity;
1047 goto restart;
1048 }
1049
1050 if (net_ratelimit())
1051 printk(KERN_WARNING "Neighbour table overflow.\n");
1052 rt_drop(rt);
1053 return -ENOBUFS;
1054 }
1055 }
1056
093c2ca4 1057 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1da177e4 1058#if RT_CACHE_DEBUG >= 2
093c2ca4 1059 if (rt->u.dst.rt_next) {
1da177e4 1060 struct rtable *trt;
a7d632b6 1061 printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash,
1da177e4 1062 NIPQUAD(rt->rt_dst));
093c2ca4 1063 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
a7d632b6 1064 printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst));
1da177e4
LT
1065 printk("\n");
1066 }
1067#endif
1068 rt_hash_table[hash].chain = rt;
22c047cc 1069 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1070 *rp = rt;
1071 return 0;
1072}
1073
1074void rt_bind_peer(struct rtable *rt, int create)
1075{
1076 static DEFINE_SPINLOCK(rt_peer_lock);
1077 struct inet_peer *peer;
1078
1079 peer = inet_getpeer(rt->rt_dst, create);
1080
1081 spin_lock_bh(&rt_peer_lock);
1082 if (rt->peer == NULL) {
1083 rt->peer = peer;
1084 peer = NULL;
1085 }
1086 spin_unlock_bh(&rt_peer_lock);
1087 if (peer)
1088 inet_putpeer(peer);
1089}
1090
1091/*
1092 * Peer allocation may fail only in serious out-of-memory conditions. However
1093 * we still can generate some output.
1094 * Random ID selection looks a bit dangerous because we have no chances to
1095 * select ID being unique in a reasonable period of time.
1096 * But broken packet identifier may be better than no packet at all.
1097 */
1098static void ip_select_fb_ident(struct iphdr *iph)
1099{
1100 static DEFINE_SPINLOCK(ip_fb_id_lock);
1101 static u32 ip_fallback_id;
1102 u32 salt;
1103
1104 spin_lock_bh(&ip_fb_id_lock);
e448515c 1105 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1da177e4
LT
1106 iph->id = htons(salt & 0xFFFF);
1107 ip_fallback_id = salt;
1108 spin_unlock_bh(&ip_fb_id_lock);
1109}
1110
1111void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1112{
1113 struct rtable *rt = (struct rtable *) dst;
1114
1115 if (rt) {
1116 if (rt->peer == NULL)
1117 rt_bind_peer(rt, 1);
1118
1119 /* If peer is attached to destination, it is never detached,
1120 so that we need not to grab a lock to dereference it.
1121 */
1122 if (rt->peer) {
1123 iph->id = htons(inet_getid(rt->peer, more));
1124 return;
1125 }
1126 } else
e905a9ed 1127 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
9c2b3328 1128 __builtin_return_address(0));
1da177e4
LT
1129
1130 ip_select_fb_ident(iph);
1131}
1132
1133static void rt_del(unsigned hash, struct rtable *rt)
1134{
29e75252 1135 struct rtable **rthp, *aux;
1da177e4 1136
29e75252 1137 rthp = &rt_hash_table[hash].chain;
22c047cc 1138 spin_lock_bh(rt_hash_lock_addr(hash));
1da177e4 1139 ip_rt_put(rt);
29e75252
ED
1140 while ((aux = *rthp) != NULL) {
1141 if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1142 *rthp = aux->u.dst.rt_next;
1143 rt_free(aux);
1144 continue;
1da177e4 1145 }
29e75252
ED
1146 rthp = &aux->u.dst.rt_next;
1147 }
22c047cc 1148 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1149}
1150
f7655229
AV
/*
 * Process a redirect that advises replacing gateway old_gw with new_gw
 * for traffic from saddr to daddr, received on dev.
 *
 * The advice is rejected (and optionally logged under
 * CONFIG_IP_ROUTE_VERBOSE) when new_gw equals old_gw, redirects are not
 * accepted on the device, or new_gw fails the address checks below.
 * Otherwise every cache chain selected by the key combinations
 * {saddr, 0} x {dev->ifindex, 0} is searched for an output entry that
 * currently uses old_gw; such an entry is cloned with rt_gateway set to
 * new_gw and RTCF_REDIRECTED, the old entry is removed with rt_del()
 * and the clone is inserted with rt_intern_hash().
 *
 * The in_dev reference taken at entry is dropped on every exit path.
 */
1151void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1152 __be32 saddr, struct net_device *dev)
1da177e4
LT
1153{
1154 int i, k;
1155 struct in_device *in_dev = in_dev_get(dev);
1156 struct rtable *rth, **rthp;
f7655229 1157 __be32 skeys[2] = { saddr, 0 };
1da177e4 1158 int ikeys[2] = { dev->ifindex, 0 };
8d71740c 1159 struct netevent_redirect netevent;
317805b8 1160 struct net *net;
1da177e4 1161
1da177e4
LT
1162 if (!in_dev)
1163 return;
1164
c346dca1 1165 net = dev_net(dev);
/* Basic sanity of the advised gateway. */
1da177e4 1166 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1e637c74 1167 || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
f97c1e0c 1168 || ipv4_is_zeronet(new_gw))
1da177e4
LT
1169 goto reject_redirect;
1170
/*
 * Non-shared media: new_gw must be on-link relative to old_gw and,
 * with secure redirects enabled, must pass ip_fib_check_default().
 * Shared media: new_gw only has to be a unicast address.
 */
1171 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1172 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1173 goto reject_redirect;
1174 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1175 goto reject_redirect;
1176 } else {
317805b8 1177 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1da177e4
LT
1178 goto reject_redirect;
1179 }
1180
/* Try each (source key, interface key) combination in turn. */
1181 for (i = 0; i < 2; i++) {
1182 for (k = 0; k < 2; k++) {
8c7bc840 1183 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1da177e4
LT
1184
1185 rthp=&rt_hash_table[hash].chain;
1186
1187 rcu_read_lock();
1188 while ((rth = rcu_dereference(*rthp)) != NULL) {
1189 struct rtable *rt;
1190
/* Skip entries whose lookup key, generation or namespace differ. */
1191 if (rth->fl.fl4_dst != daddr ||
1192 rth->fl.fl4_src != skeys[i] ||
1da177e4 1193 rth->fl.oif != ikeys[k] ||
29e75252 1194 rth->fl.iif != 0 ||
317805b8 1195 rth->rt_genid != atomic_read(&rt_genid) ||
878628fb 1196 !net_eq(dev_net(rth->u.dst.dev), net)) {
093c2ca4 1197 rthp = &rth->u.dst.rt_next;
1da177e4
LT
1198 continue;
1199 }
1200
/*
 * Key matched: stop scanning this chain if the entry is not a
 * healthy route through old_gw on dev.
 */
1201 if (rth->rt_dst != daddr ||
1202 rth->rt_src != saddr ||
1203 rth->u.dst.error ||
1204 rth->rt_gateway != old_gw ||
1205 rth->u.dst.dev != dev)
1206 break;
1207
/* Take a reference on rth before leaving the RCU read-side section. */
1208 dst_hold(&rth->u.dst);
1209 rcu_read_unlock();
1210
1211 rt = dst_alloc(&ipv4_dst_ops);
1212 if (rt == NULL) {
1213 ip_rt_put(rth);
1214 in_dev_put(in_dev);
1215 return;
1216 }
1217
1218 /* Copy all the information. */
1219 *rt = *rth;
/*
 * The structure copy duplicated pointers and counters; reset the
 * per-entry state and take separate references for the clone.
 */
e905a9ed 1220 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1da177e4
LT
1221 rt->u.dst.__use = 1;
1222 atomic_set(&rt->u.dst.__refcnt, 1);
1223 rt->u.dst.child = NULL;
1224 if (rt->u.dst.dev)
1225 dev_hold(rt->u.dst.dev);
1226 if (rt->idev)
1227 in_dev_hold(rt->idev);
1228 rt->u.dst.obsolete = 0;
1229 rt->u.dst.lastuse = jiffies;
1230 rt->u.dst.path = &rt->u.dst;
1231 rt->u.dst.neighbour = NULL;
1232 rt->u.dst.hh = NULL;
1233 rt->u.dst.xfrm = NULL;
29e75252 1234 rt->rt_genid = atomic_read(&rt_genid);
1da177e4
LT
1235 rt->rt_flags |= RTCF_REDIRECTED;
1236
1237 /* Gateway is different ... */
1238 rt->rt_gateway = new_gw;
1239
1240 /* Redirect received -> path was valid */
1241 dst_confirm(&rth->u.dst);
1242
1243 if (rt->peer)
1244 atomic_inc(&rt->peer->refcnt);
1245
/*
 * Give up on the clone if no neighbour could be bound or the
 * neighbour is not in a NUD_VALID state; in the latter case
 * neigh_event_send() is called on it first.
 */
1246 if (arp_bind_neighbour(&rt->u.dst) ||
1247 !(rt->u.dst.neighbour->nud_state &
1248 NUD_VALID)) {
1249 if (rt->u.dst.neighbour)
1250 neigh_event_send(rt->u.dst.neighbour, NULL);
1251 ip_rt_put(rth);
1252 rt_drop(rt);
1253 goto do_next;
1254 }
e905a9ed 1255
8d71740c
TT
/* Tell netevent listeners about the old and the new dst. */
1256 netevent.old = &rth->u.dst;
1257 netevent.new = &rt->u.dst;
e905a9ed
YH
1258 call_netevent_notifiers(NETEVENT_REDIRECT,
1259 &netevent);
1da177e4
LT
1260
/*
 * Replace the old entry with the clone.  On successful insertion
 * the reference returned through &rt is dropped right away.
 */
1261 rt_del(hash, rth);
1262 if (!rt_intern_hash(hash, rt, &rt))
1263 ip_rt_put(rt);
1264 goto do_next;
1265 }
1266 rcu_read_unlock();
1267 do_next:
1268 ;
1269 }
1270 }
1271 in_dev_put(in_dev);
1272 return;
1273
1274reject_redirect:
1275#ifdef CONFIG_IP_ROUTE_VERBOSE
1276 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
a7d632b6
YH
1277 printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about "
1278 NIPQUAD_FMT " ignored.\n"
1279 " Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n",
1da177e4 1280 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
cef2685e 1281 NIPQUAD(saddr), NIPQUAD(daddr));
1da177e4
LT
1282#endif
1283 in_dev_put(in_dev);
1284}
1285
1286static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1287{
ee6b9673 1288 struct rtable *rt = (struct rtable *)dst;
1da177e4
LT
1289 struct dst_entry *ret = dst;
1290
1291 if (rt) {
1292 if (dst->obsolete) {
1293 ip_rt_put(rt);
1294 ret = NULL;
1295 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1296 rt->u.dst.expires) {
8c7bc840
AV
1297 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1298 rt->fl.oif);
1da177e4 1299#if RT_CACHE_DEBUG >= 1
56c99d04 1300 printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
a7d632b6 1301 NIPQUAD_FMT "/%02x dropped\n",
1da177e4
LT
1302 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1303#endif
1304 rt_del(hash, rt);
1305 ret = NULL;
1306 }
1307 }
1308 return ret;
1309}
1310
1311/*
1312 * Algorithm:
1313 * 1. The first ip_rt_redirect_number redirects are sent
1314 * with exponential backoff, then we stop sending them at all,
1315 * assuming that the host ignores our redirects.
1316 * 2. If we did not see packets requiring redirects
1317 * during ip_rt_redirect_silence, we assume that the host
1318 * forgot redirected route and start to send redirects again.
1319 *
1320 * This algorithm is much cheaper and more intelligent than dumb load limiting
1321 * in icmp.c.
1322 *
1323 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1324 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1325 */
1326
1327void ip_rt_send_redirect(struct sk_buff *skb)
1328{
ee6b9673 1329 struct rtable *rt = skb->rtable;
1da177e4
LT
1330 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1331
1332 if (!in_dev)
1333 return;
1334
1335 if (!IN_DEV_TX_REDIRECTS(in_dev))
1336 goto out;
1337
1338 /* No redirected packets during ip_rt_redirect_silence;
1339 * reset the algorithm.
1340 */
1341 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1342 rt->u.dst.rate_tokens = 0;
1343
1344 /* Too many ignored redirects; do not send anything
1345 * set u.dst.rate_last to the last seen redirected packet.
1346 */
1347 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1348 rt->u.dst.rate_last = jiffies;
1349 goto out;
1350 }
1351
1352 /* Check for load limit; set rate_last to the latest sent
1353 * redirect.
1354 */
14fb8a76
LY
1355 if (rt->u.dst.rate_tokens == 0 ||
1356 time_after(jiffies,
1da177e4
LT
1357 (rt->u.dst.rate_last +
1358 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1359 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1360 rt->u.dst.rate_last = jiffies;
1361 ++rt->u.dst.rate_tokens;
1362#ifdef CONFIG_IP_ROUTE_VERBOSE
1363 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1364 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1365 net_ratelimit())
a7d632b6
YH
1366 printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores "
1367 "redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n",
1da177e4
LT
1368 NIPQUAD(rt->rt_src), rt->rt_iif,
1369 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1370#endif
1371 }
1372out:
e905a9ed 1373 in_dev_put(in_dev);
1da177e4
LT
1374}
1375
1376static int ip_error(struct sk_buff *skb)
1377{
ee6b9673 1378 struct rtable *rt = skb->rtable;
1da177e4
LT
1379 unsigned long now;
1380 int code;
1381
1382 switch (rt->u.dst.error) {
1383 case EINVAL:
1384 default:
1385 goto out;
1386 case EHOSTUNREACH:
1387 code = ICMP_HOST_UNREACH;
1388 break;
1389 case ENETUNREACH:
1390 code = ICMP_NET_UNREACH;
7f53878d 1391 IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1da177e4
LT
1392 break;
1393 case EACCES:
1394 code = ICMP_PKT_FILTERED;
1395 break;
1396 }
1397
1398 now = jiffies;
1399 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1400 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1401 rt->u.dst.rate_tokens = ip_rt_error_burst;
1402 rt->u.dst.rate_last = now;
1403 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1404 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1405 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1406 }
1407
1408out: kfree_skb(skb);
1409 return 0;
e905a9ed 1410}
1da177e4
LT
1411
/*
 * MTU plateau table, in descending order.
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau value strictly below old_mtu,
 * or 68 when old_mtu is not above the smallest plateau.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const size_t count = sizeof(mtu_plateau) / sizeof(mtu_plateau[0]);
	size_t idx = 0;

	while (idx < count) {
		if (mtu_plateau[idx] < old_mtu)
			return mtu_plateau[idx];
		idx++;
	}
	return 68;
}
1429
b5921910 1430unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
0010e465
TT
1431 unsigned short new_mtu,
1432 struct net_device *dev)
1da177e4 1433{
0010e465 1434 int i, k;
1da177e4
LT
1435 unsigned short old_mtu = ntohs(iph->tot_len);
1436 struct rtable *rth;
0010e465 1437 int ikeys[2] = { dev->ifindex, 0 };
e448515c
AV
1438 __be32 skeys[2] = { iph->saddr, 0, };
1439 __be32 daddr = iph->daddr;
1da177e4
LT
1440 unsigned short est_mtu = 0;
1441
1442 if (ipv4_config.no_pmtu_disc)
1443 return 0;
1444
0010e465
TT
1445 for (k = 0; k < 2; k++) {
1446 for (i = 0; i < 2; i++) {
1447 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1448
1449 rcu_read_lock();
1450 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1451 rth = rcu_dereference(rth->u.dst.rt_next)) {
1da177e4
LT
1452 unsigned short mtu = new_mtu;
1453
0010e465
TT
1454 if (rth->fl.fl4_dst != daddr ||
1455 rth->fl.fl4_src != skeys[i] ||
1456 rth->rt_dst != daddr ||
1457 rth->rt_src != iph->saddr ||
1458 rth->fl.oif != ikeys[k] ||
1459 rth->fl.iif != 0 ||
1460 dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1461 !net_eq(dev_net(rth->u.dst.dev), net) ||
1462 rth->rt_genid != atomic_read(&rt_genid))
1463 continue;
1464
1da177e4
LT
1465 if (new_mtu < 68 || new_mtu >= old_mtu) {
1466
1467 /* BSD 4.2 compatibility hack :-( */
1468 if (mtu == 0 &&
5ffc02a1 1469 old_mtu >= dst_metric(&rth->u.dst, RTAX_MTU) &&
1da177e4
LT
1470 old_mtu >= 68 + (iph->ihl << 2))
1471 old_mtu -= iph->ihl << 2;
1472
1473 mtu = guess_mtu(old_mtu);
1474 }
5ffc02a1
SS
1475 if (mtu <= dst_metric(&rth->u.dst, RTAX_MTU)) {
1476 if (mtu < dst_metric(&rth->u.dst, RTAX_MTU)) {
1da177e4
LT
1477 dst_confirm(&rth->u.dst);
1478 if (mtu < ip_rt_min_pmtu) {
1479 mtu = ip_rt_min_pmtu;
1480 rth->u.dst.metrics[RTAX_LOCK-1] |=
1481 (1 << RTAX_MTU);
1482 }
1483 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1484 dst_set_expires(&rth->u.dst,
1485 ip_rt_mtu_expires);
1486 }
1487 est_mtu = mtu;
1488 }
1489 }
0010e465 1490 rcu_read_unlock();
1da177e4 1491 }
1da177e4
LT
1492 }
1493 return est_mtu ? : new_mtu;
1494}
1495
1496static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1497{
5ffc02a1 1498 if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= 68 &&
1da177e4
LT
1499 !(dst_metric_locked(dst, RTAX_MTU))) {
1500 if (mtu < ip_rt_min_pmtu) {
1501 mtu = ip_rt_min_pmtu;
1502 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1503 }
1504 dst->metrics[RTAX_MTU-1] = mtu;
1505 dst_set_expires(dst, ip_rt_mtu_expires);
8d71740c 1506 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1da177e4
LT
1507 }
1508}
1509
1510static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1511{
1512 return NULL;
1513}
1514
1515static void ipv4_dst_destroy(struct dst_entry *dst)
1516{
1517 struct rtable *rt = (struct rtable *) dst;
1518 struct inet_peer *peer = rt->peer;
1519 struct in_device *idev = rt->idev;
1520
1521 if (peer) {
1522 rt->peer = NULL;
1523 inet_putpeer(peer);
1524 }
1525
1526 if (idev) {
1527 rt->idev = NULL;
1528 in_dev_put(idev);
1529 }
1530}
1531
1532static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1533 int how)
1534{
1535 struct rtable *rt = (struct rtable *) dst;
1536 struct in_device *idev = rt->idev;
c346dca1 1537 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
5a3e55d6 1538 struct in_device *loopback_idev =
c346dca1 1539 in_dev_get(dev_net(dev)->loopback_dev);
1da177e4
LT
1540 if (loopback_idev) {
1541 rt->idev = loopback_idev;
1542 in_dev_put(idev);
1543 }
1544 }
1545}
1546
1547static void ipv4_link_failure(struct sk_buff *skb)
1548{
1549 struct rtable *rt;
1550
1551 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1552
ee6b9673 1553 rt = skb->rtable;
1da177e4
LT
1554 if (rt)
1555 dst_set_expires(&rt->u.dst, 0);
1556}
1557
1558static int ip_rt_bug(struct sk_buff *skb)
1559{
a7d632b6 1560 printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n",
eddc9ec5 1561 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1da177e4
LT
1562 skb->dev ? skb->dev->name : "?");
1563 kfree_skb(skb);
1564 return 0;
1565}
1566
1567/*
1568 We do not cache source address of outgoing interface,
1569 because it is used only by IP RR, TS and SRR options,
1570 so that it out of fast path.
1571
1572 BTW remember: "addr" is allowed to be not aligned
1573 in IP options!
1574 */
1575
1576void ip_rt_get_source(u8 *addr, struct rtable *rt)
1577{
a61ced5d 1578 __be32 src;
1da177e4
LT
1579 struct fib_result res;
1580
1581 if (rt->fl.iif == 0)
1582 src = rt->rt_src;
c346dca1 1583 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1da177e4
LT
1584 src = FIB_RES_PREFSRC(res);
1585 fib_res_put(&res);
1586 } else
1587 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1588 RT_SCOPE_UNIVERSE);
1589 memcpy(addr, &src, 4);
1590}
1591
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid: each 16-bit half of @tag is
 * taken only if the corresponding half of tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 fill = 0;

	if ((rt->u.dst.tclassid & 0xFFFF) == 0)
		fill |= tag & 0xFFFF;
	if ((rt->u.dst.tclassid & 0xFFFF0000) == 0)
		fill |= tag & 0xFFFF0000;

	rt->u.dst.tclassid |= fill;
}
#endif
1601
1602static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1603{
1604 struct fib_info *fi = res->fi;
1605
1606 if (fi) {
1607 if (FIB_RES_GW(*res) &&
1608 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1609 rt->rt_gateway = FIB_RES_GW(*res);
1610 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1611 sizeof(rt->u.dst.metrics));
1612 if (fi->fib_mtu == 0) {
1613 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
0bbeafd0 1614 if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1da177e4
LT
1615 rt->rt_gateway != rt->rt_dst &&
1616 rt->u.dst.dev->mtu > 576)
1617 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1618 }
1619#ifdef CONFIG_NET_CLS_ROUTE
1620 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1621#endif
1622 } else
1623 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1624
5ffc02a1 1625 if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1da177e4 1626 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
5ffc02a1 1627 if (dst_metric(&rt->u.dst, RTAX_MTU) > IP_MAX_MTU)
1da177e4 1628 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
5ffc02a1 1629 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1da177e4
LT
1630 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1631 ip_rt_min_advmss);
5ffc02a1 1632 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1da177e4
LT
1633 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1634
1635#ifdef CONFIG_NET_CLS_ROUTE
1636#ifdef CONFIG_IP_MULTIPLE_TABLES
1637 set_class_tag(rt, fib_rules_tclass(res));
1638#endif
1639 set_class_tag(rt, itag);
1640#endif
e905a9ed 1641 rt->rt_type = res->type;
1da177e4
LT
1642}
1643
9e12bb22 1644static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1645 u8 tos, struct net_device *dev, int our)
1646{
1647 unsigned hash;
1648 struct rtable *rth;
a61ced5d 1649 __be32 spec_dst;
1da177e4
LT
1650 struct in_device *in_dev = in_dev_get(dev);
1651 u32 itag = 0;
1652
1653 /* Primary sanity checks. */
1654
1655 if (in_dev == NULL)
1656 return -EINVAL;
1657
1e637c74 1658 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
f97c1e0c 1659 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1da177e4
LT
1660 goto e_inval;
1661
f97c1e0c
JP
1662 if (ipv4_is_zeronet(saddr)) {
1663 if (!ipv4_is_local_multicast(daddr))
1da177e4
LT
1664 goto e_inval;
1665 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1666 } else if (fib_validate_source(saddr, 0, tos, 0,
1667 dev, &spec_dst, &itag) < 0)
1668 goto e_inval;
1669
1670 rth = dst_alloc(&ipv4_dst_ops);
1671 if (!rth)
1672 goto e_nobufs;
1673
1674 rth->u.dst.output= ip_rt_bug;
1675
1676 atomic_set(&rth->u.dst.__refcnt, 1);
1677 rth->u.dst.flags= DST_HOST;
42f811b8 1678 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4
LT
1679 rth->u.dst.flags |= DST_NOPOLICY;
1680 rth->fl.fl4_dst = daddr;
1681 rth->rt_dst = daddr;
1682 rth->fl.fl4_tos = tos;
47dcf0cb 1683 rth->fl.mark = skb->mark;
1da177e4
LT
1684 rth->fl.fl4_src = saddr;
1685 rth->rt_src = saddr;
1686#ifdef CONFIG_NET_CLS_ROUTE
1687 rth->u.dst.tclassid = itag;
1688#endif
1689 rth->rt_iif =
1690 rth->fl.iif = dev->ifindex;
2774c7ab 1691 rth->u.dst.dev = init_net.loopback_dev;
1da177e4
LT
1692 dev_hold(rth->u.dst.dev);
1693 rth->idev = in_dev_get(rth->u.dst.dev);
1694 rth->fl.oif = 0;
1695 rth->rt_gateway = daddr;
1696 rth->rt_spec_dst= spec_dst;
29e75252 1697 rth->rt_genid = atomic_read(&rt_genid);
1da177e4 1698 rth->rt_flags = RTCF_MULTICAST;
29e75252 1699 rth->rt_type = RTN_MULTICAST;
1da177e4
LT
1700 if (our) {
1701 rth->u.dst.input= ip_local_deliver;
1702 rth->rt_flags |= RTCF_LOCAL;
1703 }
1704
1705#ifdef CONFIG_IP_MROUTE
f97c1e0c 1706 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1da177e4
LT
1707 rth->u.dst.input = ip_mr_input;
1708#endif
1709 RT_CACHE_STAT_INC(in_slow_mc);
1710
1711 in_dev_put(in_dev);
8c7bc840 1712 hash = rt_hash(daddr, saddr, dev->ifindex);
ee6b9673 1713 return rt_intern_hash(hash, rth, &skb->rtable);
1da177e4
LT
1714
1715e_nobufs:
1716 in_dev_put(in_dev);
1717 return -ENOBUFS;
1718
1719e_inval:
1720 in_dev_put(in_dev);
1721 return -EINVAL;
1722}
1723
1724
1725static void ip_handle_martian_source(struct net_device *dev,
1726 struct in_device *in_dev,
1727 struct sk_buff *skb,
9e12bb22
AV
1728 __be32 daddr,
1729 __be32 saddr)
1da177e4
LT
1730{
1731 RT_CACHE_STAT_INC(in_martian_src);
1732#ifdef CONFIG_IP_ROUTE_VERBOSE
1733 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1734 /*
1735 * RFC1812 recommendation, if source is martian,
1736 * the only hint is MAC header.
1737 */
a7d632b6
YH
1738 printk(KERN_WARNING "martian source " NIPQUAD_FMT " from "
1739 NIPQUAD_FMT", on dev %s\n",
1da177e4 1740 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
98e399f8 1741 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1da177e4 1742 int i;
98e399f8 1743 const unsigned char *p = skb_mac_header(skb);
1da177e4
LT
1744 printk(KERN_WARNING "ll header: ");
1745 for (i = 0; i < dev->hard_header_len; i++, p++) {
1746 printk("%02x", *p);
1747 if (i < (dev->hard_header_len - 1))
1748 printk(":");
1749 }
1750 printk("\n");
1751 }
1752 }
1753#endif
1754}
1755
5969f71d
SH
1756static int __mkroute_input(struct sk_buff *skb,
1757 struct fib_result *res,
1758 struct in_device *in_dev,
1759 __be32 daddr, __be32 saddr, u32 tos,
1760 struct rtable **result)
1da177e4
LT
1761{
1762
1763 struct rtable *rth;
1764 int err;
1765 struct in_device *out_dev;
1766 unsigned flags = 0;
d9c9df8c
AV
1767 __be32 spec_dst;
1768 u32 itag;
1da177e4
LT
1769
1770 /* get a working reference to the output device */
1771 out_dev = in_dev_get(FIB_RES_DEV(*res));
1772 if (out_dev == NULL) {
1773 if (net_ratelimit())
1774 printk(KERN_CRIT "Bug in ip_route_input" \
1775 "_slow(). Please, report\n");
1776 return -EINVAL;
1777 }
1778
1779
e905a9ed 1780 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1da177e4
LT
1781 in_dev->dev, &spec_dst, &itag);
1782 if (err < 0) {
e905a9ed 1783 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1da177e4 1784 saddr);
e905a9ed 1785
1da177e4
LT
1786 err = -EINVAL;
1787 goto cleanup;
1788 }
1789
1790 if (err)
1791 flags |= RTCF_DIRECTSRC;
1792
51b77cae 1793 if (out_dev == in_dev && err &&
1da177e4
LT
1794 (IN_DEV_SHARED_MEDIA(out_dev) ||
1795 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1796 flags |= RTCF_DOREDIRECT;
1797
1798 if (skb->protocol != htons(ETH_P_IP)) {
1799 /* Not IP (i.e. ARP). Do not create route, if it is
1800 * invalid for proxy arp. DNAT routes are always valid.
1801 */
cb7928a5 1802 if (out_dev == in_dev) {
1da177e4
LT
1803 err = -EINVAL;
1804 goto cleanup;
1805 }
1806 }
1807
1808
1809 rth = dst_alloc(&ipv4_dst_ops);
1810 if (!rth) {
1811 err = -ENOBUFS;
1812 goto cleanup;
1813 }
1814
ce723d8e 1815 atomic_set(&rth->u.dst.__refcnt, 1);
1da177e4 1816 rth->u.dst.flags= DST_HOST;
42f811b8 1817 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4 1818 rth->u.dst.flags |= DST_NOPOLICY;
42f811b8 1819 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1da177e4
LT
1820 rth->u.dst.flags |= DST_NOXFRM;
1821 rth->fl.fl4_dst = daddr;
1822 rth->rt_dst = daddr;
1823 rth->fl.fl4_tos = tos;
47dcf0cb 1824 rth->fl.mark = skb->mark;
1da177e4
LT
1825 rth->fl.fl4_src = saddr;
1826 rth->rt_src = saddr;
1827 rth->rt_gateway = daddr;
1828 rth->rt_iif =
1829 rth->fl.iif = in_dev->dev->ifindex;
1830 rth->u.dst.dev = (out_dev)->dev;
1831 dev_hold(rth->u.dst.dev);
1832 rth->idev = in_dev_get(rth->u.dst.dev);
1833 rth->fl.oif = 0;
1834 rth->rt_spec_dst= spec_dst;
1835
1836 rth->u.dst.input = ip_forward;
1837 rth->u.dst.output = ip_output;
29e75252 1838 rth->rt_genid = atomic_read(&rt_genid);
1da177e4
LT
1839
1840 rt_set_nexthop(rth, res, itag);
1841
1842 rth->rt_flags = flags;
1843
1844 *result = rth;
1845 err = 0;
1846 cleanup:
1847 /* release the working reference to the output device */
1848 in_dev_put(out_dev);
1849 return err;
e905a9ed 1850}
1da177e4 1851
5969f71d
SH
1852static int ip_mkroute_input(struct sk_buff *skb,
1853 struct fib_result *res,
1854 const struct flowi *fl,
1855 struct in_device *in_dev,
1856 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1857{
7abaa27c 1858 struct rtable* rth = NULL;
1da177e4
LT
1859 int err;
1860 unsigned hash;
1861
1862#ifdef CONFIG_IP_ROUTE_MULTIPATH
1863 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1864 fib_select_multipath(fl, res);
1865#endif
1866
1867 /* create a routing cache entry */
1868 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1869 if (err)
1870 return err;
1da177e4
LT
1871
1872 /* put it into the cache */
8c7bc840 1873 hash = rt_hash(daddr, saddr, fl->iif);
ee6b9673 1874 return rt_intern_hash(hash, rth, &skb->rtable);
1da177e4
LT
1875}
1876
1da177e4
LT
1877/*
1878 * NOTE. We drop all packets that have a local source
1879 * address, because every properly looped back packet
1880 * must have the correct destination already attached by the output routine.
1881 *
1882 * This approach solves two big problems:
1883 * 1. Non-simplex devices are handled properly.
1884 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1885 */
1886
/*
 * Slow path of input routing, used when the route cache has no entry.
 *
 * Classifies the packet (martian source/destination, limited
 * broadcast, local delivery, forwarding, no route) and, except for
 * martians and the plain error exits, creates a cache entry and
 * inserts it with rt_intern_hash(), which also sets skb->rtable.
 *
 * Returns 0 on success or a negative errno: -EINVAL (IP disabled on
 * the device, martian source, non-IP broadcast), -EHOSTUNREACH
 * (forwarding disabled or martian destination), -ENOBUFS (allocation
 * failure), or whatever ip_mkroute_input()/rt_intern_hash() return.
 *
 * All paths that obtained in_dev leave through "done", which drops the
 * in_dev reference and, if the FIB lookup succeeded (free_res), the
 * FIB result.
 */
9e12bb22 1887static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1888 u8 tos, struct net_device *dev)
1889{
1890 struct fib_result res;
1891 struct in_device *in_dev = in_dev_get(dev);
1892 struct flowi fl = { .nl_u = { .ip4_u =
1893 { .daddr = daddr,
1894 .saddr = saddr,
1895 .tos = tos,
1896 .scope = RT_SCOPE_UNIVERSE,
1da177e4 1897 } },
47dcf0cb 1898 .mark = skb->mark,
1da177e4
LT
1899 .iif = dev->ifindex };
1900 unsigned flags = 0;
1901 u32 itag = 0;
1902 struct rtable * rth;
1903 unsigned hash;
9e12bb22 1904 __be32 spec_dst;
1da177e4
LT
1905 int err = -EINVAL;
1906 int free_res = 0;
c346dca1 1907 struct net * net = dev_net(dev);
1da177e4
LT
1908
1909 /* IP on this device is disabled. */
1910
1911 if (!in_dev)
1912 goto out;
1913
1914 /* Check for the most weird martians, which can be not detected
1915 by fib_lookup.
1916 */
1917
1e637c74 1918 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
f97c1e0c 1919 ipv4_is_loopback(saddr))
1da177e4
LT
1920 goto martian_source;
1921
/* Limited broadcast, or both addresses zero: handle as broadcast. */
e448515c 1922 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1da177e4
LT
1923 goto brd_input;
1924
1925 /* Accept zero addresses only to limited broadcast;
1926 * I even do not know to fix it or not. Waiting for complains :-)
1927 */
f97c1e0c 1928 if (ipv4_is_zeronet(saddr))
1da177e4
LT
1929 goto martian_source;
1930
1e637c74 1931 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
f97c1e0c 1932 ipv4_is_loopback(daddr))
1da177e4
LT
1933 goto martian_destination;
1934
1935 /*
1936 * Now we are ready to route packet.
1937 */
84a885f4 1938 if ((err = fib_lookup(net, &fl, &res)) != 0) {
1da177e4 1939 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 1940 goto e_hostunreach;
1da177e4
LT
1941 goto no_route;
1942 }
/* The lookup succeeded, so res must be released at "done". */
1943 free_res = 1;
1944
1945 RT_CACHE_STAT_INC(in_slow_tot);
1946
1947 if (res.type == RTN_BROADCAST)
1948 goto brd_input;
1949
1950 if (res.type == RTN_LOCAL) {
1951 int result;
1952 result = fib_validate_source(saddr, daddr, tos,
84a885f4 1953 net->loopback_dev->ifindex,
1da177e4
LT
1954 dev, &spec_dst, &itag);
1955 if (result < 0)
1956 goto martian_source;
1957 if (result)
1958 flags |= RTCF_DIRECTSRC;
1959 spec_dst = daddr;
1960 goto local_input;
1961 }
1962
/* Anything else would have to be forwarded. */
1963 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 1964 goto e_hostunreach;
1da177e4
LT
1965 if (res.type != RTN_UNICAST)
1966 goto martian_destination;
1967
1968 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1da177e4
LT
/* Common exit: drop the references taken above and return err. */
1969done:
1970 in_dev_put(in_dev);
1971 if (free_res)
1972 fib_res_put(&res);
1973out: return err;
1974
/* Broadcast input: only IP frames are accepted. */
1975brd_input:
1976 if (skb->protocol != htons(ETH_P_IP))
1977 goto e_inval;
1978
f97c1e0c 1979 if (ipv4_is_zeronet(saddr))
1da177e4
LT
1980 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1981 else {
1982 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1983 &itag);
1984 if (err < 0)
1985 goto martian_source;
1986 if (err)
1987 flags |= RTCF_DIRECTSRC;
1988 }
1989 flags |= RTCF_BROADCAST;
1990 res.type = RTN_BROADCAST;
1991 RT_CACHE_STAT_INC(in_brd);
1992
/*
 * Build an entry bound to the namespace's loopback device.  Reached
 * for local delivery, by fall-through for broadcast, and from
 * no_route with res.type == RTN_UNREACHABLE.
 */
1993local_input:
1994 rth = dst_alloc(&ipv4_dst_ops);
1995 if (!rth)
1996 goto e_nobufs;
1997
1998 rth->u.dst.output= ip_rt_bug;
29e75252 1999 rth->rt_genid = atomic_read(&rt_genid);
1da177e4
LT
2000
2001 atomic_set(&rth->u.dst.__refcnt, 1);
2002 rth->u.dst.flags= DST_HOST;
42f811b8 2003 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4
LT
2004 rth->u.dst.flags |= DST_NOPOLICY;
2005 rth->fl.fl4_dst = daddr;
2006 rth->rt_dst = daddr;
2007 rth->fl.fl4_tos = tos;
47dcf0cb 2008 rth->fl.mark = skb->mark;
1da177e4
LT
2009 rth->fl.fl4_src = saddr;
2010 rth->rt_src = saddr;
2011#ifdef CONFIG_NET_CLS_ROUTE
2012 rth->u.dst.tclassid = itag;
2013#endif
2014 rth->rt_iif =
2015 rth->fl.iif = dev->ifindex;
84a885f4 2016 rth->u.dst.dev = net->loopback_dev;
1da177e4
LT
2017 dev_hold(rth->u.dst.dev);
2018 rth->idev = in_dev_get(rth->u.dst.dev);
2019 rth->rt_gateway = daddr;
2020 rth->rt_spec_dst= spec_dst;
2021 rth->u.dst.input= ip_local_deliver;
2022 rth->rt_flags = flags|RTCF_LOCAL;
/*
 * No-route case: the entry becomes an error route whose input handler
 * is ip_error and whose error is the positive form of err.
 */
2023 if (res.type == RTN_UNREACHABLE) {
2024 rth->u.dst.input= ip_error;
2025 rth->u.dst.error= -err;
2026 rth->rt_flags &= ~RTCF_LOCAL;
2027 }
2028 rth->rt_type = res.type;
8c7bc840 2029 hash = rt_hash(daddr, saddr, fl.iif);
ee6b9673 2030 err = rt_intern_hash(hash, rth, &skb->rtable);
1da177e4
LT
2031 goto done;
2032
/*
 * FIB lookup failed while forwarding is enabled: cache an unreachable
 * entry.  -ESRCH from the lookup is reported as -ENETUNREACH.
 */
2033no_route:
2034 RT_CACHE_STAT_INC(in_no_route);
2035 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2036 res.type = RTN_UNREACHABLE;
7f53878d
MC
2037 if (err == -ESRCH)
2038 err = -ENETUNREACH;
1da177e4
LT
2039 goto local_input;
2040
2041 /*
2042 * Do not cache martian addresses: they should be logged (RFC1812)
2043 */
2044martian_destination:
2045 RT_CACHE_STAT_INC(in_martian_dst);
2046#ifdef CONFIG_IP_ROUTE_VERBOSE
2047 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
a7d632b6
YH
2048 printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from "
2049 NIPQUAD_FMT ", dev %s\n",
1da177e4
LT
2050 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2051#endif
2c2910a4
DE
2052
/* martian_destination falls through to here. */
2053e_hostunreach:
e905a9ed
YH
2054 err = -EHOSTUNREACH;
2055 goto done;
2c2910a4 2056
1da177e4
LT
2057e_inval:
2058 err = -EINVAL;
2059 goto done;
2060
2061e_nobufs:
2062 err = -ENOBUFS;
2063 goto done;
2064
2065martian_source:
2066 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2067 goto e_inval;
2068}
2069
9e12bb22 2070int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
2071 u8 tos, struct net_device *dev)
2072{
2073 struct rtable * rth;
2074 unsigned hash;
2075 int iif = dev->ifindex;
b5921910 2076 struct net *net;
1da177e4 2077
c346dca1 2078 net = dev_net(dev);
1da177e4 2079 tos &= IPTOS_RT_MASK;
8c7bc840 2080 hash = rt_hash(daddr, saddr, iif);
1da177e4
LT
2081
2082 rcu_read_lock();
2083 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
093c2ca4 2084 rth = rcu_dereference(rth->u.dst.rt_next)) {
c0b8c32b
SH
2085 if (((rth->fl.fl4_dst ^ daddr) |
2086 (rth->fl.fl4_src ^ saddr) |
2087 (rth->fl.iif ^ iif) |
2088 rth->fl.oif |
2089 (rth->fl.fl4_tos ^ tos)) == 0 &&
47dcf0cb 2090 rth->fl.mark == skb->mark &&
878628fb 2091 net_eq(dev_net(rth->u.dst.dev), net) &&
29e75252 2092 rth->rt_genid == atomic_read(&rt_genid)) {
03f49f34 2093 dst_use(&rth->u.dst, jiffies);
1da177e4
LT
2094 RT_CACHE_STAT_INC(in_hit);
2095 rcu_read_unlock();
ee6b9673 2096 skb->rtable = rth;
1da177e4
LT
2097 return 0;
2098 }
2099 RT_CACHE_STAT_INC(in_hlist_search);
2100 }
2101 rcu_read_unlock();
2102
2103 /* Multicast recognition logic is moved from route cache to here.
2104 The problem was that too many Ethernet cards have broken/missing
2105 hardware multicast filters :-( As result the host on multicasting
2106 network acquires a lot of useless route cache entries, sort of
2107 SDR messages from all the world. Now we try to get rid of them.
2108 Really, provided software IP multicast filter is organized
2109 reasonably (at least, hashed), it does not result in a slowdown
2110 comparing with route cache reject entries.
2111 Note, that multicast routers are not affected, because
2112 route cache entry is created eventually.
2113 */
f97c1e0c 2114 if (ipv4_is_multicast(daddr)) {
1da177e4
LT
2115 struct in_device *in_dev;
2116
2117 rcu_read_lock();
e5ed6399 2118 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
1da177e4 2119 int our = ip_check_mc(in_dev, daddr, saddr,
eddc9ec5 2120 ip_hdr(skb)->protocol);
1da177e4
LT
2121 if (our
2122#ifdef CONFIG_IP_MROUTE
f97c1e0c
JP
2123 || (!ipv4_is_local_multicast(daddr) &&
2124 IN_DEV_MFORWARD(in_dev))
1da177e4
LT
2125#endif
2126 ) {
2127 rcu_read_unlock();
2128 return ip_route_input_mc(skb, daddr, saddr,
2129 tos, dev, our);
2130 }
2131 }
2132 rcu_read_unlock();
2133 return -EINVAL;
2134 }
2135 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2136}
2137
5969f71d
SH
2138static int __mkroute_output(struct rtable **result,
2139 struct fib_result *res,
2140 const struct flowi *fl,
2141 const struct flowi *oldflp,
2142 struct net_device *dev_out,
2143 unsigned flags)
1da177e4
LT
2144{
2145 struct rtable *rth;
2146 struct in_device *in_dev;
2147 u32 tos = RT_FL_TOS(oldflp);
2148 int err = 0;
2149
f97c1e0c 2150 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
1da177e4
LT
2151 return -EINVAL;
2152
e448515c 2153 if (fl->fl4_dst == htonl(0xFFFFFFFF))
1da177e4 2154 res->type = RTN_BROADCAST;
f97c1e0c 2155 else if (ipv4_is_multicast(fl->fl4_dst))
1da177e4 2156 res->type = RTN_MULTICAST;
1e637c74 2157 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
1da177e4
LT
2158 return -EINVAL;
2159
2160 if (dev_out->flags & IFF_LOOPBACK)
2161 flags |= RTCF_LOCAL;
2162
2163 /* get work reference to inet device */
2164 in_dev = in_dev_get(dev_out);
2165 if (!in_dev)
2166 return -EINVAL;
2167
2168 if (res->type == RTN_BROADCAST) {
2169 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2170 if (res->fi) {
2171 fib_info_put(res->fi);
2172 res->fi = NULL;
2173 }
2174 } else if (res->type == RTN_MULTICAST) {
2175 flags |= RTCF_MULTICAST|RTCF_LOCAL;
e905a9ed 2176 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
1da177e4
LT
2177 oldflp->proto))
2178 flags &= ~RTCF_LOCAL;
2179 /* If multicast route do not exist use
2180 default one, but do not gateway in this case.
2181 Yes, it is hack.
2182 */
2183 if (res->fi && res->prefixlen < 4) {
2184 fib_info_put(res->fi);
2185 res->fi = NULL;
2186 }
2187 }
2188
2189
2190 rth = dst_alloc(&ipv4_dst_ops);
2191 if (!rth) {
2192 err = -ENOBUFS;
2193 goto cleanup;
e905a9ed 2194 }
1da177e4 2195
ce723d8e 2196 atomic_set(&rth->u.dst.__refcnt, 1);
1da177e4 2197 rth->u.dst.flags= DST_HOST;
42f811b8 2198 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
1da177e4 2199 rth->u.dst.flags |= DST_NOXFRM;
42f811b8 2200 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4
LT
2201 rth->u.dst.flags |= DST_NOPOLICY;
2202
2203 rth->fl.fl4_dst = oldflp->fl4_dst;
2204 rth->fl.fl4_tos = tos;
2205 rth->fl.fl4_src = oldflp->fl4_src;
2206 rth->fl.oif = oldflp->oif;
47dcf0cb 2207 rth->fl.mark = oldflp->mark;
1da177e4
LT
2208 rth->rt_dst = fl->fl4_dst;
2209 rth->rt_src = fl->fl4_src;
2210 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
e905a9ed 2211 /* get references to the devices that are to be hold by the routing
1da177e4
LT
2212 cache entry */
2213 rth->u.dst.dev = dev_out;
2214 dev_hold(dev_out);
2215 rth->idev = in_dev_get(dev_out);
2216 rth->rt_gateway = fl->fl4_dst;
2217 rth->rt_spec_dst= fl->fl4_src;
2218
2219 rth->u.dst.output=ip_output;
29e75252 2220 rth->rt_genid = atomic_read(&rt_genid);
1da177e4
LT
2221
2222 RT_CACHE_STAT_INC(out_slow_tot);
2223
2224 if (flags & RTCF_LOCAL) {
2225 rth->u.dst.input = ip_local_deliver;
2226 rth->rt_spec_dst = fl->fl4_dst;
2227 }
2228 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2229 rth->rt_spec_dst = fl->fl4_src;
e905a9ed 2230 if (flags & RTCF_LOCAL &&
1da177e4
LT
2231 !(dev_out->flags & IFF_LOOPBACK)) {
2232 rth->u.dst.output = ip_mc_output;
2233 RT_CACHE_STAT_INC(out_slow_mc);
2234 }
2235#ifdef CONFIG_IP_MROUTE
2236 if (res->type == RTN_MULTICAST) {
2237 if (IN_DEV_MFORWARD(in_dev) &&
f97c1e0c 2238 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
1da177e4
LT
2239 rth->u.dst.input = ip_mr_input;
2240 rth->u.dst.output = ip_mc_output;
2241 }
2242 }
2243#endif
2244 }
2245
2246 rt_set_nexthop(rth, res, 0);
2247
2248 rth->rt_flags = flags;
2249
2250 *result = rth;
2251 cleanup:
2252 /* release work reference to inet device */
2253 in_dev_put(in_dev);
2254
2255 return err;
2256}
2257
5969f71d
SH
2258static int ip_mkroute_output(struct rtable **rp,
2259 struct fib_result *res,
2260 const struct flowi *fl,
2261 const struct flowi *oldflp,
2262 struct net_device *dev_out,
2263 unsigned flags)
1da177e4 2264{
7abaa27c 2265 struct rtable *rth = NULL;
1da177e4
LT
2266 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2267 unsigned hash;
2268 if (err == 0) {
8c7bc840 2269 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
1da177e4
LT
2270 err = rt_intern_hash(hash, rth, rp);
2271 }
e905a9ed 2272
1da177e4
LT
2273 return err;
2274}
2275
1da177e4
LT
2276/*
2277 * Major route resolver routine.
2278 */
2279
b40afd0e
DL
2280static int ip_route_output_slow(struct net *net, struct rtable **rp,
2281 const struct flowi *oldflp)
1da177e4
LT
2282{
2283 u32 tos = RT_FL_TOS(oldflp);
2284 struct flowi fl = { .nl_u = { .ip4_u =
2285 { .daddr = oldflp->fl4_dst,
2286 .saddr = oldflp->fl4_src,
2287 .tos = tos & IPTOS_RT_MASK,
2288 .scope = ((tos & RTO_ONLINK) ?
2289 RT_SCOPE_LINK :
2290 RT_SCOPE_UNIVERSE),
1da177e4 2291 } },
47dcf0cb 2292 .mark = oldflp->mark,
b40afd0e 2293 .iif = net->loopback_dev->ifindex,
1da177e4
LT
2294 .oif = oldflp->oif };
2295 struct fib_result res;
2296 unsigned flags = 0;
2297 struct net_device *dev_out = NULL;
2298 int free_res = 0;
2299 int err;
2300
2301
2302 res.fi = NULL;
2303#ifdef CONFIG_IP_MULTIPLE_TABLES
2304 res.r = NULL;
2305#endif
2306
2307 if (oldflp->fl4_src) {
2308 err = -EINVAL;
f97c1e0c 2309 if (ipv4_is_multicast(oldflp->fl4_src) ||
1e637c74 2310 ipv4_is_lbcast(oldflp->fl4_src) ||
f97c1e0c 2311 ipv4_is_zeronet(oldflp->fl4_src))
1da177e4
LT
2312 goto out;
2313
2314 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
b40afd0e 2315 dev_out = ip_dev_find(net, oldflp->fl4_src);
f6c5d736 2316 if (dev_out == NULL)
1da177e4
LT
2317 goto out;
2318
2319 /* I removed check for oif == dev_out->oif here.
2320 It was wrong for two reasons:
1ab35276
DL
2321 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2322 is assigned to multiple interfaces.
1da177e4
LT
2323 2. Moreover, we are allowed to send packets with saddr
2324 of another iface. --ANK
2325 */
2326
f6c5d736 2327 if (oldflp->oif == 0
f97c1e0c
JP
2328 && (ipv4_is_multicast(oldflp->fl4_dst) ||
2329 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
1da177e4
LT
2330 /* Special hack: user can direct multicasts
2331 and limited broadcast via necessary interface
2332 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2333 This hack is not just for fun, it allows
2334 vic,vat and friends to work.
2335 They bind socket to loopback, set ttl to zero
2336 and expect that it will work.
2337 From the viewpoint of routing cache they are broken,
2338 because we are not allowed to build multicast path
2339 with loopback source addr (look, routing cache
2340 cannot know, that ttl is zero, so that packet
2341 will not leave this host and route is valid).
2342 Luckily, this hack is good workaround.
2343 */
2344
2345 fl.oif = dev_out->ifindex;
2346 goto make_route;
2347 }
2348 if (dev_out)
2349 dev_put(dev_out);
2350 dev_out = NULL;
2351 }
2352
2353
2354 if (oldflp->oif) {
b40afd0e 2355 dev_out = dev_get_by_index(net, oldflp->oif);
1da177e4
LT
2356 err = -ENODEV;
2357 if (dev_out == NULL)
2358 goto out;
e5ed6399
HX
2359
2360 /* RACE: Check return value of inet_select_addr instead. */
2361 if (__in_dev_get_rtnl(dev_out) == NULL) {
1da177e4
LT
2362 dev_put(dev_out);
2363 goto out; /* Wrong error code */
2364 }
2365
f97c1e0c
JP
2366 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2367 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
1da177e4
LT
2368 if (!fl.fl4_src)
2369 fl.fl4_src = inet_select_addr(dev_out, 0,
2370 RT_SCOPE_LINK);
2371 goto make_route;
2372 }
2373 if (!fl.fl4_src) {
f97c1e0c 2374 if (ipv4_is_multicast(oldflp->fl4_dst))
1da177e4
LT
2375 fl.fl4_src = inet_select_addr(dev_out, 0,
2376 fl.fl4_scope);
2377 else if (!oldflp->fl4_dst)
2378 fl.fl4_src = inet_select_addr(dev_out, 0,
2379 RT_SCOPE_HOST);
2380 }
2381 }
2382
2383 if (!fl.fl4_dst) {
2384 fl.fl4_dst = fl.fl4_src;
2385 if (!fl.fl4_dst)
2386 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2387 if (dev_out)
2388 dev_put(dev_out);
b40afd0e 2389 dev_out = net->loopback_dev;
1da177e4 2390 dev_hold(dev_out);
b40afd0e 2391 fl.oif = net->loopback_dev->ifindex;
1da177e4
LT
2392 res.type = RTN_LOCAL;
2393 flags |= RTCF_LOCAL;
2394 goto make_route;
2395 }
2396
b40afd0e 2397 if (fib_lookup(net, &fl, &res)) {
1da177e4
LT
2398 res.fi = NULL;
2399 if (oldflp->oif) {
2400 /* Apparently, routing tables are wrong. Assume,
2401 that the destination is on link.
2402
2403 WHY? DW.
2404 Because we are allowed to send to iface
2405 even if it has NO routes and NO assigned
2406 addresses. When oif is specified, routing
2407 tables are looked up with only one purpose:
2408 to catch if destination is gatewayed, rather than
2409 direct. Moreover, if MSG_DONTROUTE is set,
2410 we send packet, ignoring both routing tables
2411 and ifaddr state. --ANK
2412
2413
2414 We could make it even if oif is unknown,
2415 likely IPv6, but we do not.
2416 */
2417
2418 if (fl.fl4_src == 0)
2419 fl.fl4_src = inet_select_addr(dev_out, 0,
2420 RT_SCOPE_LINK);
2421 res.type = RTN_UNICAST;
2422 goto make_route;
2423 }
2424 if (dev_out)
2425 dev_put(dev_out);
2426 err = -ENETUNREACH;
2427 goto out;
2428 }
2429 free_res = 1;
2430
2431 if (res.type == RTN_LOCAL) {
2432 if (!fl.fl4_src)
2433 fl.fl4_src = fl.fl4_dst;
2434 if (dev_out)
2435 dev_put(dev_out);
b40afd0e 2436 dev_out = net->loopback_dev;
1da177e4
LT
2437 dev_hold(dev_out);
2438 fl.oif = dev_out->ifindex;
2439 if (res.fi)
2440 fib_info_put(res.fi);
2441 res.fi = NULL;
2442 flags |= RTCF_LOCAL;
2443 goto make_route;
2444 }
2445
2446#ifdef CONFIG_IP_ROUTE_MULTIPATH
2447 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2448 fib_select_multipath(&fl, &res);
2449 else
2450#endif
2451 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
b40afd0e 2452 fib_select_default(net, &fl, &res);
1da177e4
LT
2453
2454 if (!fl.fl4_src)
2455 fl.fl4_src = FIB_RES_PREFSRC(res);
2456
2457 if (dev_out)
2458 dev_put(dev_out);
2459 dev_out = FIB_RES_DEV(res);
2460 dev_hold(dev_out);
2461 fl.oif = dev_out->ifindex;
2462
2463
2464make_route:
2465 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2466
2467
2468 if (free_res)
2469 fib_res_put(&res);
2470 if (dev_out)
2471 dev_put(dev_out);
2472out: return err;
2473}
2474
611c183e
DL
2475int __ip_route_output_key(struct net *net, struct rtable **rp,
2476 const struct flowi *flp)
1da177e4
LT
2477{
2478 unsigned hash;
2479 struct rtable *rth;
2480
8c7bc840 2481 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
1da177e4
LT
2482
2483 rcu_read_lock_bh();
2484 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
093c2ca4 2485 rth = rcu_dereference(rth->u.dst.rt_next)) {
1da177e4
LT
2486 if (rth->fl.fl4_dst == flp->fl4_dst &&
2487 rth->fl.fl4_src == flp->fl4_src &&
2488 rth->fl.iif == 0 &&
2489 rth->fl.oif == flp->oif &&
47dcf0cb 2490 rth->fl.mark == flp->mark &&
1da177e4 2491 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
b5921910 2492 (IPTOS_RT_MASK | RTO_ONLINK)) &&
878628fb 2493 net_eq(dev_net(rth->u.dst.dev), net) &&
29e75252 2494 rth->rt_genid == atomic_read(&rt_genid)) {
03f49f34 2495 dst_use(&rth->u.dst, jiffies);
1da177e4
LT
2496 RT_CACHE_STAT_INC(out_hit);
2497 rcu_read_unlock_bh();
2498 *rp = rth;
2499 return 0;
2500 }
2501 RT_CACHE_STAT_INC(out_hlist_search);
2502 }
2503 rcu_read_unlock_bh();
2504
611c183e 2505 return ip_route_output_slow(net, rp, flp);
1da177e4
LT
2506}
2507
d8c97a94
ACM
2508EXPORT_SYMBOL_GPL(__ip_route_output_key);
2509
14e50e57
DM
2510static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2511{
2512}
2513
2514static struct dst_ops ipv4_dst_blackhole_ops = {
2515 .family = AF_INET,
2516 .protocol = __constant_htons(ETH_P_IP),
2517 .destroy = ipv4_dst_destroy,
2518 .check = ipv4_dst_check,
2519 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2520 .entry_size = sizeof(struct rtable),
e2422970 2521 .entries = ATOMIC_INIT(0),
14e50e57
DM
2522};
2523
2524
ce259990 2525static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
14e50e57
DM
2526{
2527 struct rtable *ort = *rp;
2528 struct rtable *rt = (struct rtable *)
2529 dst_alloc(&ipv4_dst_blackhole_ops);
2530
2531 if (rt) {
2532 struct dst_entry *new = &rt->u.dst;
2533
2534 atomic_set(&new->__refcnt, 1);
2535 new->__use = 1;
352e512c
HX
2536 new->input = dst_discard;
2537 new->output = dst_discard;
14e50e57
DM
2538 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2539
2540 new->dev = ort->u.dst.dev;
2541 if (new->dev)
2542 dev_hold(new->dev);
2543
2544 rt->fl = ort->fl;
2545
2546 rt->idev = ort->idev;
2547 if (rt->idev)
2548 in_dev_hold(rt->idev);
29e75252 2549 rt->rt_genid = atomic_read(&rt_genid);
14e50e57
DM
2550 rt->rt_flags = ort->rt_flags;
2551 rt->rt_type = ort->rt_type;
2552 rt->rt_dst = ort->rt_dst;
2553 rt->rt_src = ort->rt_src;
2554 rt->rt_iif = ort->rt_iif;
2555 rt->rt_gateway = ort->rt_gateway;
2556 rt->rt_spec_dst = ort->rt_spec_dst;
2557 rt->peer = ort->peer;
2558 if (rt->peer)
2559 atomic_inc(&rt->peer->refcnt);
2560
2561 dst_free(new);
2562 }
2563
2564 dst_release(&(*rp)->u.dst);
2565 *rp = rt;
2566 return (rt ? 0 : -ENOMEM);
2567}
2568
f1b050bf
DL
2569int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2570 struct sock *sk, int flags)
1da177e4
LT
2571{
2572 int err;
2573
f1b050bf 2574 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
1da177e4
LT
2575 return err;
2576
2577 if (flp->proto) {
2578 if (!flp->fl4_src)
2579 flp->fl4_src = (*rp)->rt_src;
2580 if (!flp->fl4_dst)
2581 flp->fl4_dst = (*rp)->rt_dst;
bb72845e
HX
2582 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2583 flags ? XFRM_LOOKUP_WAIT : 0);
14e50e57 2584 if (err == -EREMOTE)
ce259990 2585 err = ipv4_dst_blackhole(rp, flp);
14e50e57
DM
2586
2587 return err;
1da177e4
LT
2588 }
2589
2590 return 0;
2591}
2592
d8c97a94
ACM
2593EXPORT_SYMBOL_GPL(ip_route_output_flow);
2594
f206351a 2595int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
1da177e4 2596{
f206351a 2597 return ip_route_output_flow(net, rp, flp, NULL, 0);
1da177e4
LT
2598}
2599
2600static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
b6544c0b 2601 int nowait, unsigned int flags)
1da177e4 2602{
ee6b9673 2603 struct rtable *rt = skb->rtable;
1da177e4 2604 struct rtmsg *r;
be403ea1 2605 struct nlmsghdr *nlh;
e3703b3d
TG
2606 long expires;
2607 u32 id = 0, ts = 0, tsage = 0, error;
be403ea1
TG
2608
2609 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2610 if (nlh == NULL)
26932566 2611 return -EMSGSIZE;
be403ea1
TG
2612
2613 r = nlmsg_data(nlh);
1da177e4
LT
2614 r->rtm_family = AF_INET;
2615 r->rtm_dst_len = 32;
2616 r->rtm_src_len = 0;
2617 r->rtm_tos = rt->fl.fl4_tos;
2618 r->rtm_table = RT_TABLE_MAIN;
be403ea1 2619 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
1da177e4
LT
2620 r->rtm_type = rt->rt_type;
2621 r->rtm_scope = RT_SCOPE_UNIVERSE;
2622 r->rtm_protocol = RTPROT_UNSPEC;
2623 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2624 if (rt->rt_flags & RTCF_NOTIFY)
2625 r->rtm_flags |= RTM_F_NOTIFY;
be403ea1 2626
17fb2c64 2627 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
be403ea1 2628
1da177e4
LT
2629 if (rt->fl.fl4_src) {
2630 r->rtm_src_len = 32;
17fb2c64 2631 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
1da177e4
LT
2632 }
2633 if (rt->u.dst.dev)
be403ea1 2634 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
1da177e4
LT
2635#ifdef CONFIG_NET_CLS_ROUTE
2636 if (rt->u.dst.tclassid)
be403ea1 2637 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
1da177e4
LT
2638#endif
2639 if (rt->fl.iif)
17fb2c64 2640 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
1da177e4 2641 else if (rt->rt_src != rt->fl.fl4_src)
17fb2c64 2642 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
be403ea1 2643
1da177e4 2644 if (rt->rt_dst != rt->rt_gateway)
17fb2c64 2645 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
be403ea1 2646
1da177e4 2647 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
be403ea1
TG
2648 goto nla_put_failure;
2649
e3703b3d
TG
2650 error = rt->u.dst.error;
2651 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
1da177e4 2652 if (rt->peer) {
e3703b3d 2653 id = rt->peer->ip_id_count;
1da177e4 2654 if (rt->peer->tcp_ts_stamp) {
e3703b3d 2655 ts = rt->peer->tcp_ts;
9d729f72 2656 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
1da177e4
LT
2657 }
2658 }
be403ea1 2659
1da177e4
LT
2660 if (rt->fl.iif) {
2661#ifdef CONFIG_IP_MROUTE
e448515c 2662 __be32 dst = rt->rt_dst;
1da177e4 2663
f97c1e0c 2664 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
586f1211 2665 IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
1da177e4
LT
2666 int err = ipmr_get_route(skb, r, nowait);
2667 if (err <= 0) {
2668 if (!nowait) {
2669 if (err == 0)
2670 return 0;
be403ea1 2671 goto nla_put_failure;
1da177e4
LT
2672 } else {
2673 if (err == -EMSGSIZE)
be403ea1 2674 goto nla_put_failure;
e3703b3d 2675 error = err;
1da177e4
LT
2676 }
2677 }
2678 } else
2679#endif
be403ea1 2680 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
1da177e4
LT
2681 }
2682
e3703b3d
TG
2683 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2684 expires, error) < 0)
2685 goto nla_put_failure;
be403ea1
TG
2686
2687 return nlmsg_end(skb, nlh);
1da177e4 2688
be403ea1 2689nla_put_failure:
26932566
PM
2690 nlmsg_cancel(skb, nlh);
2691 return -EMSGSIZE;
1da177e4
LT
2692}
2693
63f3444f 2694static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1da177e4 2695{
3b1e0a65 2696 struct net *net = sock_net(in_skb->sk);
d889ce3b
TG
2697 struct rtmsg *rtm;
2698 struct nlattr *tb[RTA_MAX+1];
1da177e4 2699 struct rtable *rt = NULL;
9e12bb22
AV
2700 __be32 dst = 0;
2701 __be32 src = 0;
2702 u32 iif;
d889ce3b 2703 int err;
1da177e4
LT
2704 struct sk_buff *skb;
2705
d889ce3b
TG
2706 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2707 if (err < 0)
2708 goto errout;
2709
2710 rtm = nlmsg_data(nlh);
2711
1da177e4 2712 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
d889ce3b
TG
2713 if (skb == NULL) {
2714 err = -ENOBUFS;
2715 goto errout;
2716 }
1da177e4
LT
2717
2718 /* Reserve room for dummy headers, this skb can pass
2719 through good chunk of routing engine.
2720 */
459a98ed 2721 skb_reset_mac_header(skb);
c1d2bbe1 2722 skb_reset_network_header(skb);
d2c962b8
SH
2723
2724 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
eddc9ec5 2725 ip_hdr(skb)->protocol = IPPROTO_ICMP;
1da177e4
LT
2726 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2727
17fb2c64
AV
2728 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2729 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
d889ce3b 2730 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
1da177e4
LT
2731
2732 if (iif) {
d889ce3b
TG
2733 struct net_device *dev;
2734
1937504d 2735 dev = __dev_get_by_index(net, iif);
d889ce3b
TG
2736 if (dev == NULL) {
2737 err = -ENODEV;
2738 goto errout_free;
2739 }
2740
1da177e4
LT
2741 skb->protocol = htons(ETH_P_IP);
2742 skb->dev = dev;
2743 local_bh_disable();
2744 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2745 local_bh_enable();
d889ce3b 2746
ee6b9673 2747 rt = skb->rtable;
d889ce3b 2748 if (err == 0 && rt->u.dst.error)
1da177e4
LT
2749 err = -rt->u.dst.error;
2750 } else {
d889ce3b
TG
2751 struct flowi fl = {
2752 .nl_u = {
2753 .ip4_u = {
2754 .daddr = dst,
2755 .saddr = src,
2756 .tos = rtm->rtm_tos,
2757 },
2758 },
2759 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2760 };
1937504d 2761 err = ip_route_output_key(net, &rt, &fl);
1da177e4 2762 }
d889ce3b 2763
1da177e4 2764 if (err)
d889ce3b 2765 goto errout_free;
1da177e4 2766
ee6b9673 2767 skb->rtable = rt;
1da177e4
LT
2768 if (rtm->rtm_flags & RTM_F_NOTIFY)
2769 rt->rt_flags |= RTCF_NOTIFY;
2770
1da177e4 2771 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
1937504d 2772 RTM_NEWROUTE, 0, 0);
d889ce3b
TG
2773 if (err <= 0)
2774 goto errout_free;
1da177e4 2775
1937504d 2776 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
d889ce3b 2777errout:
2942e900 2778 return err;
1da177e4 2779
d889ce3b 2780errout_free:
1da177e4 2781 kfree_skb(skb);
d889ce3b 2782 goto errout;
1da177e4
LT
2783}
2784
2785int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2786{
2787 struct rtable *rt;
2788 int h, s_h;
2789 int idx, s_idx;
1937504d
DL
2790 struct net *net;
2791
3b1e0a65 2792 net = sock_net(skb->sk);
1da177e4
LT
2793
2794 s_h = cb->args[0];
d8c92830
ED
2795 if (s_h < 0)
2796 s_h = 0;
1da177e4 2797 s_idx = idx = cb->args[1];
d8c92830 2798 for (h = s_h; h <= rt_hash_mask; h++) {
1da177e4
LT
2799 rcu_read_lock_bh();
2800 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
093c2ca4 2801 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
878628fb 2802 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
1da177e4 2803 continue;
29e75252
ED
2804 if (rt->rt_genid != atomic_read(&rt_genid))
2805 continue;
1da177e4
LT
2806 skb->dst = dst_clone(&rt->u.dst);
2807 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
e905a9ed 2808 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
b6544c0b 2809 1, NLM_F_MULTI) <= 0) {
1da177e4
LT
2810 dst_release(xchg(&skb->dst, NULL));
2811 rcu_read_unlock_bh();
2812 goto done;
2813 }
2814 dst_release(xchg(&skb->dst, NULL));
2815 }
2816 rcu_read_unlock_bh();
d8c92830 2817 s_idx = 0;
1da177e4
LT
2818 }
2819
2820done:
2821 cb->args[0] = h;
2822 cb->args[1] = idx;
2823 return skb->len;
2824}
2825
2826void ip_rt_multicast_event(struct in_device *in_dev)
2827{
76e6ebfb 2828 rt_cache_flush(dev_net(in_dev->dev), 0);
1da177e4
LT
2829}
2830
2831#ifdef CONFIG_SYSCTL
1da177e4
LT
2832static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2833 struct file *filp, void __user *buffer,
2834 size_t *lenp, loff_t *ppos)
2835{
2836 if (write) {
639e104f 2837 int flush_delay;
39a23e75 2838 struct net *net;
639e104f
DL
2839 static DEFINE_MUTEX(flush_mutex);
2840
2841 mutex_lock(&flush_mutex);
2842 ctl->data = &flush_delay;
1da177e4 2843 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
639e104f
DL
2844 ctl->data = NULL;
2845 mutex_unlock(&flush_mutex);
2846
39a23e75
DL
2847 net = (struct net *)ctl->extra1;
2848 rt_cache_flush(net, flush_delay);
1da177e4 2849 return 0;
e905a9ed 2850 }
1da177e4
LT
2851
2852 return -EINVAL;
2853}
2854
2855static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2856 int __user *name,
2857 int nlen,
2858 void __user *oldval,
2859 size_t __user *oldlenp,
2860 void __user *newval,
1f29bcd7 2861 size_t newlen)
1da177e4
LT
2862{
2863 int delay;
39a23e75 2864 struct net *net;
1da177e4
LT
2865 if (newlen != sizeof(int))
2866 return -EINVAL;
2867 if (get_user(delay, (int __user *)newval))
e905a9ed 2868 return -EFAULT;
39a23e75
DL
2869 net = (struct net *)table->extra1;
2870 rt_cache_flush(net, delay);
1da177e4
LT
2871 return 0;
2872}
2873
2874ctl_table ipv4_route_table[] = {
1da177e4
LT
2875 {
2876 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2877 .procname = "gc_thresh",
2878 .data = &ipv4_dst_ops.gc_thresh,
2879 .maxlen = sizeof(int),
2880 .mode = 0644,
2881 .proc_handler = &proc_dointvec,
2882 },
2883 {
2884 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2885 .procname = "max_size",
2886 .data = &ip_rt_max_size,
2887 .maxlen = sizeof(int),
2888 .mode = 0644,
2889 .proc_handler = &proc_dointvec,
2890 },
2891 {
2892 /* Deprecated. Use gc_min_interval_ms */
e905a9ed 2893
1da177e4
LT
2894 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2895 .procname = "gc_min_interval",
2896 .data = &ip_rt_gc_min_interval,
2897 .maxlen = sizeof(int),
2898 .mode = 0644,
2899 .proc_handler = &proc_dointvec_jiffies,
2900 .strategy = &sysctl_jiffies,
2901 },
2902 {
2903 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2904 .procname = "gc_min_interval_ms",
2905 .data = &ip_rt_gc_min_interval,
2906 .maxlen = sizeof(int),
2907 .mode = 0644,
2908 .proc_handler = &proc_dointvec_ms_jiffies,
2909 .strategy = &sysctl_ms_jiffies,
2910 },
2911 {
2912 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2913 .procname = "gc_timeout",
2914 .data = &ip_rt_gc_timeout,
2915 .maxlen = sizeof(int),
2916 .mode = 0644,
2917 .proc_handler = &proc_dointvec_jiffies,
2918 .strategy = &sysctl_jiffies,
2919 },
2920 {
2921 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2922 .procname = "gc_interval",
2923 .data = &ip_rt_gc_interval,
2924 .maxlen = sizeof(int),
2925 .mode = 0644,
2926 .proc_handler = &proc_dointvec_jiffies,
2927 .strategy = &sysctl_jiffies,
2928 },
2929 {
2930 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2931 .procname = "redirect_load",
2932 .data = &ip_rt_redirect_load,
2933 .maxlen = sizeof(int),
2934 .mode = 0644,
2935 .proc_handler = &proc_dointvec,
2936 },
2937 {
2938 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2939 .procname = "redirect_number",
2940 .data = &ip_rt_redirect_number,
2941 .maxlen = sizeof(int),
2942 .mode = 0644,
2943 .proc_handler = &proc_dointvec,
2944 },
2945 {
2946 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2947 .procname = "redirect_silence",
2948 .data = &ip_rt_redirect_silence,
2949 .maxlen = sizeof(int),
2950 .mode = 0644,
2951 .proc_handler = &proc_dointvec,
2952 },
2953 {
2954 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2955 .procname = "error_cost",
2956 .data = &ip_rt_error_cost,
2957 .maxlen = sizeof(int),
2958 .mode = 0644,
2959 .proc_handler = &proc_dointvec,
2960 },
2961 {
2962 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2963 .procname = "error_burst",
2964 .data = &ip_rt_error_burst,
2965 .maxlen = sizeof(int),
2966 .mode = 0644,
2967 .proc_handler = &proc_dointvec,
2968 },
2969 {
2970 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2971 .procname = "gc_elasticity",
2972 .data = &ip_rt_gc_elasticity,
2973 .maxlen = sizeof(int),
2974 .mode = 0644,
2975 .proc_handler = &proc_dointvec,
2976 },
2977 {
2978 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2979 .procname = "mtu_expires",
2980 .data = &ip_rt_mtu_expires,
2981 .maxlen = sizeof(int),
2982 .mode = 0644,
2983 .proc_handler = &proc_dointvec_jiffies,
2984 .strategy = &sysctl_jiffies,
2985 },
2986 {
2987 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2988 .procname = "min_pmtu",
2989 .data = &ip_rt_min_pmtu,
2990 .maxlen = sizeof(int),
2991 .mode = 0644,
2992 .proc_handler = &proc_dointvec,
2993 },
2994 {
2995 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2996 .procname = "min_adv_mss",
2997 .data = &ip_rt_min_advmss,
2998 .maxlen = sizeof(int),
2999 .mode = 0644,
3000 .proc_handler = &proc_dointvec,
3001 },
3002 {
3003 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3004 .procname = "secret_interval",
3005 .data = &ip_rt_secret_interval,
3006 .maxlen = sizeof(int),
3007 .mode = 0644,
3008 .proc_handler = &proc_dointvec_jiffies,
3009 .strategy = &sysctl_jiffies,
3010 },
3011 { .ctl_name = 0 }
3012};
39a23e75
DL
3013
3014static __net_initdata struct ctl_path ipv4_route_path[] = {
3015 { .procname = "net", .ctl_name = CTL_NET, },
3016 { .procname = "ipv4", .ctl_name = NET_IPV4, },
3017 { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3018 { },
3019};
3020
3021
3022static struct ctl_table ipv4_route_flush_table[] = {
3023 {
3024 .ctl_name = NET_IPV4_ROUTE_FLUSH,
3025 .procname = "flush",
3026 .maxlen = sizeof(int),
3027 .mode = 0200,
3028 .proc_handler = &ipv4_sysctl_rtcache_flush,
3029 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
3030 },
3031 { .ctl_name = 0 },
3032};
3033
3034static __net_init int sysctl_route_net_init(struct net *net)
3035{
3036 struct ctl_table *tbl;
3037
3038 tbl = ipv4_route_flush_table;
3039 if (net != &init_net) {
3040 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3041 if (tbl == NULL)
3042 goto err_dup;
3043 }
3044 tbl[0].extra1 = net;
3045
3046 net->ipv4.route_hdr =
3047 register_net_sysctl_table(net, ipv4_route_path, tbl);
3048 if (net->ipv4.route_hdr == NULL)
3049 goto err_reg;
3050 return 0;
3051
3052err_reg:
3053 if (tbl != ipv4_route_flush_table)
3054 kfree(tbl);
3055err_dup:
3056 return -ENOMEM;
3057}
3058
3059static __net_exit void sysctl_route_net_exit(struct net *net)
3060{
3061 struct ctl_table *tbl;
3062
3063 tbl = net->ipv4.route_hdr->ctl_table_arg;
3064 unregister_net_sysctl_table(net->ipv4.route_hdr);
3065 BUG_ON(tbl == ipv4_route_flush_table);
3066 kfree(tbl);
3067}
3068
3069static __net_initdata struct pernet_operations sysctl_route_ops = {
3070 .init = sysctl_route_net_init,
3071 .exit = sysctl_route_net_exit,
3072};
1da177e4
LT
3073#endif
3074
9f5e97e5
DL
3075
3076static __net_init int rt_secret_timer_init(struct net *net)
3077{
3078 net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3079 net->ipv4.rt_secret_timer.data = (unsigned long)net;
3080 init_timer_deferrable(&net->ipv4.rt_secret_timer);
3081
3082 net->ipv4.rt_secret_timer.expires =
3083 jiffies + net_random() % ip_rt_secret_interval +
3084 ip_rt_secret_interval;
3085 add_timer(&net->ipv4.rt_secret_timer);
3086 return 0;
3087}
3088
3089static __net_exit void rt_secret_timer_exit(struct net *net)
3090{
3091 del_timer_sync(&net->ipv4.rt_secret_timer);
3092}
3093
3094static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3095 .init = rt_secret_timer_init,
3096 .exit = rt_secret_timer_exit,
3097};
3098
3099
#ifdef CONFIG_NET_CLS_ROUTE
/* Accounting array; allocated per-CPU (256 entries) in ip_rt_init(). */
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
3103
3104static __initdata unsigned long rhash_entries;
3105static int __init set_rhash_entries(char *str)
3106{
3107 if (!str)
3108 return 0;
3109 rhash_entries = simple_strtoul(str, &str, 0);
3110 return 1;
3111}
3112__setup("rhash_entries=", set_rhash_entries);
3113
3114int __init ip_rt_init(void)
3115{
424c4b70 3116 int rc = 0;
1da177e4 3117
29e75252
ED
3118 atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
3119 (jiffies ^ (jiffies >> 7))));
1da177e4
LT
3120
3121#ifdef CONFIG_NET_CLS_ROUTE
8dbde28d 3122 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
1da177e4
LT
3123 if (!ip_rt_acct)
3124 panic("IP: failed to allocate ip_rt_acct\n");
1da177e4
LT
3125#endif
3126
e5d679f3
AD
3127 ipv4_dst_ops.kmem_cachep =
3128 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 3129 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 3130
14e50e57
DM
3131 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3132
424c4b70
ED
3133 rt_hash_table = (struct rt_hash_bucket *)
3134 alloc_large_system_hash("IP route cache",
3135 sizeof(struct rt_hash_bucket),
3136 rhash_entries,
3137 (num_physpages >= 128 * 1024) ?
18955cfc 3138 15 : 17,
8d1502de 3139 0,
424c4b70
ED
3140 &rt_hash_log,
3141 &rt_hash_mask,
3142 0);
22c047cc
ED
3143 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3144 rt_hash_lock_init();
1da177e4
LT
3145
3146 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3147 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3148
1da177e4
LT
3149 devinet_init();
3150 ip_fib_init();
3151
1da177e4
LT
3152 /* All the timers, started at system startup tend
3153 to synchronize. Perturb it a bit.
3154 */
39c90ece
ED
3155 schedule_delayed_work(&expires_work,
3156 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
1da177e4 3157
9f5e97e5
DL
3158 if (register_pernet_subsys(&rt_secret_timer_ops))
3159 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
1da177e4 3160
73b38711 3161 if (ip_rt_proc_init())
107f1634 3162 printk(KERN_ERR "Unable to create route proc files\n");
1da177e4
LT
3163#ifdef CONFIG_XFRM
3164 xfrm_init();
3165 xfrm4_init();
3166#endif
63f3444f
TG
3167 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3168
39a23e75
DL
3169#ifdef CONFIG_SYSCTL
3170 register_pernet_subsys(&sysctl_route_ops);
3171#endif
1da177e4
LT
3172 return rc;
3173}
3174
3175EXPORT_SYMBOL(__ip_select_ident);
3176EXPORT_SYMBOL(ip_route_input);
3177EXPORT_SYMBOL(ip_route_output_key);