/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
	((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
static int rt_chain_length_max __read_mostly	= 20;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;

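/*
 * The defaults above are boot-time values; most are runtime-tunable
 * through the net.ipv4.route sysctl tree.  For illustration, assuming
 * HZ == 1000, ip_rt_redirect_load is 20 jiffies and
 * ip_rt_redirect_silence is (HZ/50) << 10 == 20480 jiffies, roughly
 * 20 seconds.  ip_rt_min_pmtu = 512 + 20 + 20 leaves room for a
 * 512-byte payload plus 20-byte IP and TCP headers.
 */
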
/*
 * Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);


static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.entries =		ATOMIC_INIT(0),
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
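
/*
 * This table is indexed with the four RFC 1349 TOS bits shifted right
 * by one, i.e. ip_tos2prio[IPTOS_TOS(tos) >> 1] (see rt_tos2priority()
 * in include/net/route.h), so each pair of entries covers one TOS
 * value with and without the low (ECN) bit set.
 */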

/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
	struct rtable	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table
 * of spinlocks.  The size of this table is a power of two and depends
 * on the number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
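
/*
 * Lock striping: many hash buckets share one of RT_HASH_LOCK_SZ locks,
 * picked by masking the bucket index.  For example, with
 * RT_HASH_LOCK_SZ == 256, buckets 5, 261 and 517 all map to
 * rt_hash_locks[5]; writers on different buckets rarely contend, while
 * the lock table stays small enough to be cache friendly.
 */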

static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)(__be32)(daddr),
			    (__force u32)(__be32)(saddr),
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}

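/*
 * The namespace generation counter is folded into every hash, so
 * bumping it (see rt_cache_invalidate() below) effectively re-keys the
 * whole cache at once: pre-existing entries are no longer found by
 * lookups, fail the rt_is_expired() test, and are reaped lazily by the
 * expiry and GC paths instead of being freed synchronously.
 */
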
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rt_hash_table[st->bucket].chain)
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->u.dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = r->u.dst.rt_next;
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rt_hash_table[st->bucket].chain);
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return rcu_dereference_bh(r);
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			   "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
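
/*
 * Every /proc/net/rt_cache record is padded to 127 characters via the
 * trailing "%*s" above, so all lines have a fixed width; the %n
 * conversion records how many characters the data itself consumed.
 */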

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_NET_CLS_ROUTE
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif
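
/*
 * rt_acct_proc_show() folds the per-CPU realm counters into a single
 * 256-entry array and emits it as raw binary, so a read of
 * /proc/net/rt_acct returns 256 * sizeof(struct ip_rt_acct) bytes,
 * one accumulated record per routing realm.
 */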

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_NET_CLS_ROUTE
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_NET_CLS_ROUTE
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_NET_CLS_ROUTE
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
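
/*
 * rt_intern_hash() below tracks the unreferenced entry with the
 * *lowest* score on an over-long chain and evicts it first, so a
 * recently used output route with an expiry (bits 31 and 30 set, small
 * age) outlives an old, refcount-free broadcast entry whose score is
 * near zero.
 */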

static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct flowi *fl1,
					const struct flowi *fl2)
{
	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
		(fl1->iif ^ fl2->iif)) == 0);
}

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
}

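/*
 * Both comparison helpers OR together the XOR of every field and test
 * the result against zero: one branch-free test that is only true when
 * all fields match.  compare_hash_inputs() checks just the fields that
 * feed rt_hash() (daddr, saddr, iif), while compare_keys() must match
 * the full flow, including mark, TOS and both interface indices.
 */
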
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to be rescheduled if necessary.
 */
static void rt_do_flush(int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;
	struct rtable * tail;

	for (i = 0; i <= rt_hash_mask; i++) {
		if (process_context && need_resched())
			cond_resched();
		rth = rt_hash_table[i].chain;
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));
#ifdef CONFIG_NET_NS
		{
		struct rtable ** prev, * p;

		rth = rt_hash_table[i].chain;

		/* defer releasing the head of the list after spin_unlock */
		for (tail = rth; tail; tail = tail->u.dst.rt_next)
			if (!rt_is_expired(tail))
				break;
		if (rth != tail)
			rt_hash_table[i].chain = tail;

		/* call rt_free on entries after the tail requiring flush */
		prev = &rt_hash_table[i].chain;
		for (p = *prev; p; p = next) {
			next = p->u.dst.rt_next;
			if (!rt_is_expired(p)) {
				prev = &p->u.dst.rt_next;
			} else {
				*prev = next;
				rt_free(p);
			}
		}
		}
#else
		rth = rt_hash_table[i].chain;
		rt_hash_table[i].chain = NULL;
		tail = NULL;
#endif
		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; rth != tail; rth = next) {
			next = rth->u.dst.rt_next;
			rt_free(rth);
		}
	}
}

/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to have an estimation of rt_chain_length_max:
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(&aux->fl, &rth->fl))
			return 0;
		aux = aux->u.dst.rt_next;
	}
	return ONE;
}

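/*
 * Fixed-point example (illustrative numbers): with FRACT_BITS == 3,
 * chain lengths are counted in units of ONE == 8.  A true mean length
 * of 2 with standard deviation 0.5 yields avg == 16 and sd == 4, so
 * rt_chain_length_max becomes (16 + 4*4) >> 3 == 4, i.e. mean + 4*SD
 * rounded down, but never below ip_rt_gc_elasticity.
 */
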
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (*rthp == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			prefetch(rth->u.dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->u.dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}

/*
 * rt_worker_func() is run in process context.
 * We call rt_check_expire() to scan part of the hash table.
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without handing out a recent rt_genid again.
 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}

/*
 * delay < 0  : invalidate cache (fast: entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(!in_softirq());
}

/* Flush previously invalidated entries from the cache */
void rt_cache_flush_batch(void)
{
	rt_do_flush(!in_softirq());
}

/*
 * We change rt_genid and let gc do the cleanup
 */
static void rt_secret_rebuild(unsigned long __net)
{
	struct net *net = (struct net *)__net;
	rt_cache_invalidate(net);
	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
}

static void rt_secret_rebuild_oneshot(struct net *net)
{
	del_timer_sync(&net->ipv4.rt_secret_timer);
	rt_cache_invalidate(net);
	if (ip_rt_secret_interval)
		mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit()) {
		printk(KERN_WARNING "Route hash chain too long!\n");
		printk(KERN_WARNING "Adjust your secret_interval!\n");
	}

	rt_secret_rebuild_oneshot(net);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   Current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expire is large enough to keep enough warm entries,
   and when load increases it reduces to limit cache size.
 */

static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate number of entries, which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire was reduced to zero. Otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}

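/*
 * Sizing example (illustrative): with 2^15 hash buckets (rt_hash_log ==
 * 15) and the default ip_rt_gc_elasticity of 8, the cache is considered
 * oversized once it holds more than 8 << 15 == 262144 entries; below
 * that threshold the goal is computed against the gc_thresh-based
 * "equilibrium" instead, so GC stays gentle.
 */
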
/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rth->u.dst.rt_next;
	}
	return length >> FRACT_BITS;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt,
			  struct rtable **rp, struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->u.dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note also the rt_free uses call_rcu.  We don't actually
		 * need rcu protection here, this is just our path to get
		 * on the route gc list.
		 */

		if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
			int err = arp_bind_neighbour(&rt->u.dst);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				rt_drop(rt);
				return err;
			}
		}

		rt_free(rt);
		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->u.dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->u.dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (rp)
				*rp = rth;
			else
				skb_dst_set(skb, &rth->u.dst);
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->u.dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->u.dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.dst.rt_next = rt_hash_table[hash].chain;

#if RT_CACHE_DEBUG >= 2
	if (rt->u.dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
		       hash, &rt->rt_dst);
		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
			printk(" . %pI4", &trt->rt_dst);
		printk("\n");
	}
#endif
	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (rp)
		*rp = rt;
	else
		skb_dst_set(skb, &rt->u.dst);
	return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
	static DEFINE_SPINLOCK(rt_peer_lock);
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}
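
/*
 * Two CPUs may race to bind a peer to the same rtable; both look the
 * peer up, but only the first to take rt_peer_lock installs its
 * reference.  The loser finds rt->peer already set, keeps its local
 * pointer non-NULL, and simply drops the extra reference via
 * inet_putpeer() after unlocking.
 */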

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If peer is attached to destination, it is never detached,
		   so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp, *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = *rthp) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->u.dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->u.dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	__be32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!rt_caching(net))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rthp=&rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    rt_is_expired(rth) ||
				    !net_eq(dev_net(rth->u.dst.dev), net)) {
					rthp = &rth->u.dst.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= -1;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
#ifdef CONFIG_XFRM
				rt->u.dst.xfrm		= NULL;
#endif
				rt->rt_genid		= rt_genid(net);
				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->u.dst;
				netevent.new = &rt->u.dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	in_dev_put(in_dev);
}

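/*
 * Redirect acceptance, in short: the new gateway must differ from the
 * old one, redirects must be enabled on the interface, the new gateway
 * must be a plausible unicast next hop (not multicast, broadcast or
 * zeronet, and on-link for non-shared media), and the cache must
 * currently be usable (rt_caching()); anything else is logged as
 * ignored when IN_DEV_LOG_MARTIANS is set and dropped.
 */
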
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   (rt->u.dst.expires &&
			    time_after_eq(jiffies, rt->u.dst.expires))) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif,
						rt_genid(dev_net(dst->dev)));
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
				&rt->rt_dst, rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->u.dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything,
	 * set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (rt->u.dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
				&rt->rt_src, rt->rt_iif,
				&rt->rt_dst, &rt->rt_gateway);
#endif
	}
}

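/*
 * Backoff example (illustrative, assuming HZ == 1000):
 * ip_rt_redirect_load is HZ/50 == 20 jiffies, and each further redirect
 * is delayed by ip_rt_redirect_load << rate_tokens, i.e. 40, 80, ...,
 * 5120 jiffies.  After ip_rt_redirect_number (9) unanswered redirects
 * we go silent until ip_rt_redirect_silence (20480 jiffies, ~20 s)
 * elapses with no redirect-worthy traffic.
 */
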
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
		case EINVAL:
		default:
			goto out;
		case EHOSTUNREACH:
			code = ICMP_HOST_UNREACH;
			break;
		case ENETUNREACH:
			code = ICMP_NET_UNREACH;
			IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
					IPSTATS_MIB_INNOROUTES);
			break;
		case EACCES:
			code = ICMP_PKT_FILTERED;
			break;
	}

	now = jiffies;
	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}

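/*
 * The unreachable ICMPs above are paced by a classic token bucket:
 * tokens accrue at one per jiffy, are capped at ip_rt_error_burst
 * (5 * HZ), and each ICMP costs ip_rt_error_cost (HZ).  Steady state
 * is therefore one error per second per destination, with bursts of
 * up to five allowed after a quiet spell.
 */
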
/*
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}

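/*
 * guess_mtu() returns the largest plateau strictly below the failing
 * MTU (the plateau-table strategy of RFC 1191).  For example,
 * old_mtu == 1500 yields 1492, old_mtu == 600 yields 576, and anything
 * at or below the smallest plateau falls back to the IPv4 minimum of 68.
 */
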
unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	int i, k;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	int  ikeys[2] = { dev->ifindex, 0 };
	__be32  skeys[2] = { iph->saddr, 0, };
	__be32 daddr = iph->daddr;
	unsigned short est_mtu = 0;

	for (k = 0; k < 2; k++) {
		for (i = 0; i < 2; i++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rcu_read_lock();
			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
			     rth = rcu_dereference(rth->u.dst.rt_next)) {
				unsigned short mtu = new_mtu;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->rt_dst != daddr ||
				    rth->rt_src != iph->saddr ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
				    !net_eq(dev_net(rth->u.dst.dev), net) ||
				    rt_is_expired(rth))
					continue;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= dst_mtu(&rth->u.dst) &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= dst_mtu(&rth->u.dst)) {
					if (mtu < dst_mtu(&rth->u.dst)) {
						dst_confirm(&rth->u.dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
			rcu_read_unlock();
		}
	}
	return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst_mtu(dst) > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
		}
		dst->metrics[RTAX_MTU-1] = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
	}
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	if (rt_is_expired((struct rtable *)dst))
		return NULL;
	return dst;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;
	struct in_device *idev = rt->idev;

	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}

	if (idev) {
		rt->idev = NULL;
		in_dev_put(idev);
	}
}

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	struct rtable *rt = (struct rtable *) dst;
	struct in_device *idev = rt->idev;
	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
		struct in_device *loopback_idev =
			in_dev_get(dev_net(dev)->loopback_dev);
		if (loopback_idev) {
			rt->idev = loopback_idev;
			in_dev_put(idev);
		}
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	__be32 src;
	struct fib_result res;

	if (rt->fl.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
		src = FIB_RES_PREFSRC(res);
		fib_res_put(&res);
	} else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;

	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
				       ip_rt_min_advmss);
	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}

static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = in_dev_get(dev);
	u32 itag = 0;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
					dev, &spec_dst, &itag, 0) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output = ip_rt_bug;
	rth->u.dst.obsolete = -1;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= init_net.loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst = spec_dst;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	if (our) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	in_dev_put(in_dev);
	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);

e_nobufs:
	in_dev_put(in_dev);
	return -ENOBUFS;

e_inval:
	in_dev_put(in_dev);
	return -EINVAL;
}

static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * RFC 1812 recommendation: if the source is martian,
		 * the only hint we have is the MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}

static int __mkroute_input(struct sk_buff *skb,
			   struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input_slow(). "
					 "Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag, skb->mark);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);
		err = -EINVAL;
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back out the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
	rth->rt_iif	=
	rth->fl.iif	= in_dev->dev->ifindex;
	rth->u.dst.dev	= (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_spec_dst = spec_dst;

	rth->u.dst.obsolete = -1;
	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;
	rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}

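/*
 * Wrapper around __mkroute_input(): picks a multipath nexthop when
 * several are configured, builds the cache entry and interns it into
 * the hash table under the current generation id.
 */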
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi *fl,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl->iif,
		       rt_genid(dev_net(rth->u.dst.dev)));
	return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
}

/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
				      } },
			    .mark = skb->mark,
			    .iif = dev->ifindex };
	unsigned flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	unsigned hash;
	__be32 spec_dst;
	int err = -EINVAL;
	int free_res = 0;
	struct net *net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I do not even know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
	    ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route the packet.
	 */
	if ((err = fib_lookup(net, &fl, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos,
					     net->loopback_dev->ifindex,
					     dev, &spec_dst, &itag, skb->mark);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
done:
	in_dev_put(in_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag, skb->mark);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output = ip_rt_bug;
	rth->u.dst.obsolete = -1;
	rth->rt_genid = rt_genid(net);

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= net->loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst = spec_dst;
	rth->u.dst.input = ip_local_deliver;
	rth->rt_flags 	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input = ip_error;
		rth->u.dst.error = -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
	err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
	goto done;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC 1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto done;

e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}

int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable *rth;
	unsigned hash;
	int iif = dev->ifindex;
	struct net *net;

	net = dev_net(dev);

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (((rth->fl.fl4_dst ^ daddr) |
		     (rth->fl.fl4_src ^ saddr) |
		     (rth->fl.iif ^ iif) |
		     rth->fl.oif |
		     (rth->fl.fl4_tos ^ tos)) == 0 &&
		    rth->fl.mark == skb->mark &&
		    net_eq(dev_net(rth->u.dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->u.dst, jiffies);
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb_dst_set(skb, &rth->u.dst);
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
	rcu_read_unlock();

skip_cache:
	/* Multicast recognition logic was moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicast
	   network acquires a lot of useless route cache entries, a sort of
	   SDR messages from all over the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev;

		rcu_read_lock();
		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr, saddr,
					      ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
			    ||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				rcu_read_unlock();
				return ip_route_input_mc(skb, daddr, saddr,
							 tos, dev, our);
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}

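/*
 * Output-side twin of __mkroute_input(): classifies the destination
 * (broadcast/multicast/unicast), allocates the cache entry, wires up
 * the input/output handlers and lets rt_set_nexthop() fill in the
 * metrics.
 */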
static int __mkroute_output(struct rtable **result,
			    struct fib_result *res,
			    const struct flowi *fl,
			    const struct flowi *oldflp,
			    struct net_device *dev_out,
			    unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);
	int err = 0;

	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
		return -EINVAL;

	if (fl->fl4_dst == htonl(0xFFFFFFFF))
		res->type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	/* get a working reference to the inet device */
	in_dev = in_dev_get(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		if (res->fi) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use the
		 * default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (res->fi && res->prefixlen < 4) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;

	rth->fl.fl4_dst	= oldflp->fl4_dst;
	rth->fl.fl4_tos	= tos;
	rth->fl.fl4_src	= oldflp->fl4_src;
	rth->fl.oif	= oldflp->oif;
	rth->fl.mark	= oldflp->mark;
	rth->rt_dst	= fl->fl4_dst;
	rth->rt_src	= fl->fl4_src;
	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be held by the routing
	   cache entry */
	rth->u.dst.dev	= dev_out;
	dev_hold(dev_out);
	rth->idev	= in_dev_get(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst = fl->fl4_src;

	rth->u.dst.output = ip_output;
	rth->u.dst.obsolete = -1;
	rth->rt_genid = rt_genid(dev_net(dev_out));

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;

	*result = rth;
cleanup:
	/* release the working reference to the inet device */
	in_dev_put(in_dev);

	return err;
}

static int ip_mkroute_output(struct rtable **rp,
			     struct fib_result *res,
			     const struct flowi *fl,
			     const struct flowi *oldflp,
			     struct net_device *dev_out,
			     unsigned flags)
{
	struct rtable *rth = NULL;
	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
	unsigned hash;
	if (err == 0) {
		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
			       rt_genid(dev_net(dev_out)));
		err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
	}

	return err;
}

/*
 * Major route resolver routine.
 */

static int ip_route_output_slow(struct net *net, struct rtable **rp,
				const struct flowi *oldflp)
{
	u32 tos	= RT_FL_TOS(oldflp);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = oldflp->fl4_dst,
					.saddr = oldflp->fl4_src,
					.tos = tos & IPTOS_RT_MASK,
					.scope = ((tos & RTO_ONLINK) ?
						  RT_SCOPE_LINK :
						  RT_SCOPE_UNIVERSE),
				      } },
			    .mark = oldflp->mark,
			    .iif = net->loopback_dev->ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned flags = 0;
	struct net_device *dev_out = NULL;
	int free_res = 0;
	int err;


	res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (ipv4_is_multicast(oldflp->fl4_src) ||
		    ipv4_is_lbcast(oldflp->fl4_src) ||
		    ipv4_is_zeronet(oldflp->fl4_src))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with the saddr
		      of another iface. --ANK
		 */

		if (oldflp->oif == 0 &&
		    (ipv4_is_multicast(oldflp->fl4_dst) ||
		     oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = ip_dev_find(net, oldflp->fl4_src);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are broken,
			   because we are not allowed to build a multicast path
			   with a loopback source addr (look, the routing cache
			   cannot know that ttl is zero, so that the packet
			   will not leave this host and the route is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = ip_dev_find(net, oldflp->fl4_src);
			if (dev_out == NULL)
				goto out;
			dev_put(dev_out);
			dev_out = NULL;
		}
	}


	if (oldflp->oif) {
		dev_out = dev_get_by_index(net, oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (__in_dev_get_rtnl(dev_out) == NULL) {
			dev_put(dev_out);
			goto out;	/* Wrong error code */
		}

		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (ipv4_is_multicast(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		if (dev_out)
			dev_put(dev_out);
		dev_out = net->loopback_dev;
		dev_hold(dev_out);
		fl.oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, &fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, routing tables are wrong. Assume
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if the destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send the packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		err = -ENETUNREACH;
		goto out;
	}
	free_res = 1;

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src)
			fl.fl4_src = fl.fl4_dst;
		if (dev_out)
			dev_put(dev_out);
		dev_out = net->loopback_dev;
		dev_hold(dev_out);
		fl.oif = dev_out->ifindex;
		if (res.fi)
			fib_info_put(res.fi);
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(net, &fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	if (dev_out)
		dev_put(dev_out);
	dev_out = FIB_RES_DEV(res);
	dev_hold(dev_out);
	fl.oif = dev_out->ifindex;


make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);


	if (free_res)
		fib_res_put(&res);
	if (dev_out)
		dev_put(dev_out);
out:	return err;
}

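/*
 * Output route lookup: first probe the cache under rcu_read_lock_bh()
 * (output entries are recognized by fl.iif == 0); on a miss, or when
 * caching is disabled, fall back to the full resolver above.
 */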
int __ip_route_output_key(struct net *net, struct rtable **rp,
			  const struct flowi *flp)
{
	unsigned hash;
	struct rtable *rth;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference_bh(rth->u.dst.rt_next)) {
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rth->fl.iif == 0 &&
		    rth->fl.oif == flp->oif &&
		    rth->fl.mark == flp->mark &&
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->u.dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->u.dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, rp, flp);
}

EXPORT_SYMBOL_GPL(__ip_route_output_key);

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			= AF_INET,
	.protocol		= cpu_to_be16(ETH_P_IP),
	.destroy		= ipv4_dst_destroy,
	.check			= ipv4_dst_check,
	.update_pmtu		= ipv4_rt_blackhole_update_pmtu,
	.entries		= ATOMIC_INIT(0),
};


static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->u.dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));

		new->dev = ort->u.dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->fl = ort->fl;

		rt->idev = ort->idev;
		if (rt->idev)
			in_dev_hold(rt->idev);
		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);

		dst_free(new);
	}

	dst_release(&(*rp)->u.dst);
	*rp = rt;
	return (rt ? 0 : -ENOMEM);
}

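/*
 * ip_route_output_flow() adds transformer (xfrm) resolution on top of
 * the plain lookup: for flows with a protocol it hands the result to
 * __xfrm_lookup(), and when that reports -EREMOTE the route is
 * replaced by a blackhole entry built above, whose input and output
 * handlers simply discard packets.
 */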
int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
			 struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
		return err;

	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
				    flags ? XFRM_LOOKUP_WAIT : 0);
		if (err == -EREMOTE)
			err = ipv4_dst_blackhole(net, rp, flp);

		return err;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(ip_route_output_flow);

int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}

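/*
 * Example (a minimal sketch, mirroring the flowi setup used by
 * inet_rtm_getroute() below; "dst" and "src" are hypothetical __be32
 * addresses in network byte order):
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
 *						 .saddr = src } } };
 *	struct rtable *rt;
 *	int err = ip_route_output_key(net, &rt, &fl);
 *	if (!err)
 *		ip_rt_put(rt);	(drop the cache reference when done)
 */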
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
#endif
	if (rt->fl.iif)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	error = rt->u.dst.error;
	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
	if (rt->peer) {
		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

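/*
 * RTM_GETROUTE handler (this is what "ip route get" ends up calling):
 * builds a dummy skb, resolves it through ip_route_input() or
 * ip_route_output_key() depending on whether an input interface was
 * given, and returns the result via rt_fill_info().
 */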
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		struct flowi fl = {
			.nl_u = {
				.ip4_u = {
					.daddr = dst,
					.saddr = src,
					.tos = rtm->rtm_tos,
				},
			},
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
		};
		err = ip_route_output_key(net, &rt, &fl);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->u.dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}

int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set(skb, dst_clone(&rt->u.dst));
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}

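/*
 * sysctl glue.  Writing a delay value to net.ipv4.route.flush triggers
 * rt_cache_flush() for the writer's namespace; the value itself is
 * only the flush delay and is never stored.
 */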
#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}

static void rt_secret_reschedule(int old)
{
	struct net *net;
	int new = ip_rt_secret_interval;
	int diff = new - old;

	if (!diff)
		return;

	rtnl_lock();
	for_each_net(net) {
		int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
		long time;

		if (!new)
			continue;

		if (deleted) {
			time = net->ipv4.rt_secret_timer.expires - jiffies;

			if (time <= 0 || (time += diff) <= 0)
				time = 0;
		} else
			time = new;

		mod_timer(&net->ipv4.rt_secret_timer, jiffies + time);
	}
	rtnl_unlock();
}

static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
					  void __user *buffer, size_t *lenp,
					  loff_t *ppos)
{
	int old = ip_rt_secret_interval;
	int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);

	rt_secret_reschedule(old);

	return ret;
}

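/*
 * Tunables exported under /proc/sys/net/ipv4/route/.  Most map
 * straight onto the ip_rt_* globals; secret_interval goes through
 * ipv4_sysctl_rt_secret_interval() so the per-namespace rebuild
 * timers can be rescheduled to match the new interval.
 */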
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "secret_interval",
		.data		= &ip_rt_secret_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= ipv4_sysctl_rt_secret_interval,
	},
	{ }
};

static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};

static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

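/*
 * Per-namespace init: seed the route generation counter and, if
 * ip_rt_secret_interval is non-zero, arm the deferrable timer whose
 * handler, rt_secret_rebuild(), periodically invalidates cached
 * routes.
 */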
static __net_init int rt_secret_timer_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid,
			(int) ((num_physpages ^ (num_physpages>>8)) ^
			(jiffies ^ (jiffies >> 7))));

	net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
	net->ipv4.rt_secret_timer.data = (unsigned long)net;
	init_timer_deferrable(&net->ipv4.rt_secret_timer);

	if (ip_rt_secret_interval) {
		net->ipv4.rt_secret_timer.expires =
			jiffies + net_random() % ip_rt_secret_interval +
			ip_rt_secret_interval;
		add_timer(&net->ipv4.rt_secret_timer);
	}
	return 0;
}

static __net_exit void rt_secret_timer_exit(struct net *net)
{
	del_timer_sync(&net->ipv4.rt_secret_timer);
}

static __net_initdata struct pernet_operations rt_secret_timer_ops = {
	.init = rt_secret_timer_init,
	.exit = rt_secret_timer_exit,
};

#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);

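/*
 * Boot-time initialization.  The route cache hash table is sized from
 * available memory unless overridden with the "rhash_entries=" boot
 * parameter above; gc_thresh and ip_rt_max_size are then derived from
 * the resulting table size.
 */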
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_NET_CLS_ROUTE
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* All the timers, started at system startup, tend
	   to synchronize. Perturb it a bit.
	 */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (register_pernet_subsys(&rt_secret_timer_ops))
		printk(KERN_ERR "Unable to setup rt_secret_timer\n");

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif

EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);