]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/ipv4/route.c
Revert "microblaze: Kill NET_SKB_PAD and NET_IP_ALIGN overrides."
[net-next-2.6.git] / net / ipv4 / route.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
e905a9ed 21 * Alan Cox : Super /proc >4K
1da177e4
LT
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
e905a9ed 39 *
1da177e4
LT
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
1da177e4
LT
65#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
1da177e4 71#include <linux/mm.h>
424c4b70 72#include <linux/bootmem.h>
1da177e4
LT
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
39c90ece 82#include <linux/workqueue.h>
1da177e4 83#include <linux/skbuff.h>
1da177e4
LT
84#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
5a0e3ad6 93#include <linux/slab.h>
352e512c 94#include <net/dst.h>
457c4cbc 95#include <net/net_namespace.h>
1da177e4
LT
96#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
8d71740c 106#include <net/netevent.h>
63f3444f 107#include <net/rtnetlink.h>
1da177e4
LT
108#ifdef CONFIG_SYSCTL
109#include <linux/sysctl.h>
110#endif
111
/*
 * Extract the routing-relevant TOS bits (IPTOS_RT_MASK plus the RTO_ONLINK
 * flag) from a flow key pointer.  The argument is parenthesized so that
 * expressions such as "p + 1" or "&key" can be passed safely.
 */
#define RT_FL_TOS(oldflp) \
	((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114
115#define IP_MAX_MTU 0xFFF0
116
117#define RT_GC_TIMEOUT (300*HZ)
118
1da177e4 119static int ip_rt_max_size;
817bc4db
SH
120static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
121static int ip_rt_gc_interval __read_mostly = 60 * HZ;
122static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
123static int ip_rt_redirect_number __read_mostly = 9;
124static int ip_rt_redirect_load __read_mostly = HZ / 50;
125static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126static int ip_rt_error_cost __read_mostly = HZ;
127static int ip_rt_error_burst __read_mostly = 5 * HZ;
128static int ip_rt_gc_elasticity __read_mostly = 8;
129static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
130static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131static int ip_rt_min_advmss __read_mostly = 256;
132static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
1080d709 133static int rt_chain_length_max __read_mostly = 20;
1da177e4 134
125bb8f5
ED
135static struct delayed_work expires_work;
136static unsigned long expires_ljiffies;
1da177e4
LT
137
138/*
139 * Interface to generic destination cache.
140 */
141
142static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
143static void ipv4_dst_destroy(struct dst_entry *dst);
144static void ipv4_dst_ifdown(struct dst_entry *dst,
145 struct net_device *dev, int how);
146static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
147static void ipv4_link_failure(struct sk_buff *skb);
148static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
569d3645 149static int rt_garbage_collect(struct dst_ops *ops);
1da177e4
LT
150
151
152static struct dst_ops ipv4_dst_ops = {
153 .family = AF_INET,
09640e63 154 .protocol = cpu_to_be16(ETH_P_IP),
1da177e4
LT
155 .gc = rt_garbage_collect,
156 .check = ipv4_dst_check,
157 .destroy = ipv4_dst_destroy,
158 .ifdown = ipv4_dst_ifdown,
159 .negative_advice = ipv4_negative_advice,
160 .link_failure = ipv4_link_failure,
161 .update_pmtu = ip_rt_update_pmtu,
1ac06e03 162 .local_out = __ip_local_out,
e2422970 163 .entries = ATOMIC_INIT(0),
1da177e4
LT
164};
165
166#define ECN_OR_COST(class) TC_PRIO_##class
167
4839c52b 168const __u8 ip_tos2prio[16] = {
1da177e4
LT
169 TC_PRIO_BESTEFFORT,
170 ECN_OR_COST(FILLER),
171 TC_PRIO_BESTEFFORT,
172 ECN_OR_COST(BESTEFFORT),
173 TC_PRIO_BULK,
174 ECN_OR_COST(BULK),
175 TC_PRIO_BULK,
176 ECN_OR_COST(BULK),
177 TC_PRIO_INTERACTIVE,
178 ECN_OR_COST(INTERACTIVE),
179 TC_PRIO_INTERACTIVE,
180 ECN_OR_COST(INTERACTIVE),
181 TC_PRIO_INTERACTIVE_BULK,
182 ECN_OR_COST(INTERACTIVE_BULK),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK)
185};
186
187
188/*
189 * Route cache.
190 */
191
192/* The locking scheme is rather straightforward:
193 *
194 * 1) Read-Copy Update protects the buckets of the central route hash.
195 * 2) Only writers remove entries, and they hold the lock
196 * as they look at rtable reference counts.
197 * 3) Only readers acquire references to rtable entries,
198 * they do so with atomic increments and with the
199 * lock held.
200 */
201
202struct rt_hash_bucket {
203 struct rtable *chain;
22c047cc 204};
1080d709 205
8a25d5de
IM
206#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
207 defined(CONFIG_PROVE_LOCKING)
22c047cc
ED
208/*
209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
210 * The size of this table is a power of two and depends on the number of CPUS.
62051200 211 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
22c047cc 212 */
62051200
IM
213#ifdef CONFIG_LOCKDEP
214# define RT_HASH_LOCK_SZ 256
22c047cc 215#else
62051200
IM
216# if NR_CPUS >= 32
217# define RT_HASH_LOCK_SZ 4096
218# elif NR_CPUS >= 16
219# define RT_HASH_LOCK_SZ 2048
220# elif NR_CPUS >= 8
221# define RT_HASH_LOCK_SZ 1024
222# elif NR_CPUS >= 4
223# define RT_HASH_LOCK_SZ 512
224# else
225# define RT_HASH_LOCK_SZ 256
226# endif
22c047cc
ED
227#endif
228
229static spinlock_t *rt_hash_locks;
/* Address of the spinlock guarding hash slot @slot (the lock table is shared
 * by several slots).  The whole expansion is parenthesized so the result can
 * be used in any expression context, e.g. followed by "->".
 */
# define rt_hash_lock_addr(slot) (&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])
1ff1cc20
PE
231
232static __init void rt_hash_lock_init(void)
233{
234 int i;
235
236 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
237 GFP_KERNEL);
238 if (!rt_hash_locks)
239 panic("IP: failed to allocate rt_hash_locks\n");
240
241 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
242 spin_lock_init(&rt_hash_locks[i]);
243}
22c047cc
ED
244#else
245# define rt_hash_lock_addr(slot) NULL
1ff1cc20
PE
246
/* No lock table exists in this configuration, so there is nothing to set up. */
static inline void rt_hash_lock_init(void)
{
}
22c047cc 250#endif
1da177e4 251
817bc4db
SH
252static struct rt_hash_bucket *rt_hash_table __read_mostly;
253static unsigned rt_hash_mask __read_mostly;
254static unsigned int rt_hash_log __read_mostly;
1da177e4 255
2f970d83 256static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
dbd2915c 257#define RT_CACHE_STAT_INC(field) \
bfe5d834 258 (__raw_get_cpu_var(rt_cache_stat).field++)
1da177e4 259
b00180de 260static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
0eae88f3 261 int genid)
1da177e4 262{
0eae88f3 263 return jhash_3words((__force u32)daddr, (__force u32)saddr,
b00180de 264 idx, genid)
29e75252 265 & rt_hash_mask;
1da177e4
LT
266}
267
e84f84f2
DL
268static inline int rt_genid(struct net *net)
269{
270 return atomic_read(&net->ipv4.rt_genid);
271}
272
1da177e4
LT
273#ifdef CONFIG_PROC_FS
274struct rt_cache_iter_state {
a75e936f 275 struct seq_net_private p;
1da177e4 276 int bucket;
29e75252 277 int genid;
1da177e4
LT
278};
279
1218854a 280static struct rtable *rt_cache_get_first(struct seq_file *seq)
1da177e4 281{
1218854a 282 struct rt_cache_iter_state *st = seq->private;
1da177e4 283 struct rtable *r = NULL;
1da177e4
LT
284
285 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
a6272665
ED
286 if (!rt_hash_table[st->bucket].chain)
287 continue;
1da177e4 288 rcu_read_lock_bh();
a898def2 289 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
29e75252 290 while (r) {
1218854a 291 if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
a75e936f 292 r->rt_genid == st->genid)
29e75252 293 return r;
a898def2 294 r = rcu_dereference_bh(r->u.dst.rt_next);
29e75252 295 }
1da177e4
LT
296 rcu_read_unlock_bh();
297 }
29e75252 298 return r;
1da177e4
LT
299}
300
1218854a 301static struct rtable *__rt_cache_get_next(struct seq_file *seq,
642d6318 302 struct rtable *r)
1da177e4 303{
1218854a 304 struct rt_cache_iter_state *st = seq->private;
a6272665 305
093c2ca4 306 r = r->u.dst.rt_next;
1da177e4
LT
307 while (!r) {
308 rcu_read_unlock_bh();
a6272665
ED
309 do {
310 if (--st->bucket < 0)
311 return NULL;
312 } while (!rt_hash_table[st->bucket].chain);
1da177e4
LT
313 rcu_read_lock_bh();
314 r = rt_hash_table[st->bucket].chain;
315 }
a898def2 316 return rcu_dereference_bh(r);
1da177e4
LT
317}
318
1218854a 319static struct rtable *rt_cache_get_next(struct seq_file *seq,
642d6318
DL
320 struct rtable *r)
321{
1218854a
YH
322 struct rt_cache_iter_state *st = seq->private;
323 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
324 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
a75e936f 325 continue;
642d6318
DL
326 if (r->rt_genid == st->genid)
327 break;
328 }
329 return r;
330}
331
1218854a 332static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
1da177e4 333{
1218854a 334 struct rtable *r = rt_cache_get_first(seq);
1da177e4
LT
335
336 if (r)
1218854a 337 while (pos && (r = rt_cache_get_next(seq, r)))
1da177e4
LT
338 --pos;
339 return pos ? NULL : r;
340}
341
342static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
343{
29e75252 344 struct rt_cache_iter_state *st = seq->private;
29e75252 345 if (*pos)
1218854a 346 return rt_cache_get_idx(seq, *pos - 1);
e84f84f2 347 st->genid = rt_genid(seq_file_net(seq));
29e75252 348 return SEQ_START_TOKEN;
1da177e4
LT
349}
350
351static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
352{
29e75252 353 struct rtable *r;
1da177e4
LT
354
355 if (v == SEQ_START_TOKEN)
1218854a 356 r = rt_cache_get_first(seq);
1da177e4 357 else
1218854a 358 r = rt_cache_get_next(seq, v);
1da177e4
LT
359 ++*pos;
360 return r;
361}
362
363static void rt_cache_seq_stop(struct seq_file *seq, void *v)
364{
365 if (v && v != SEQ_START_TOKEN)
366 rcu_read_unlock_bh();
367}
368
369static int rt_cache_seq_show(struct seq_file *seq, void *v)
370{
371 if (v == SEQ_START_TOKEN)
372 seq_printf(seq, "%-127s\n",
373 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
374 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
375 "HHUptod\tSpecDst");
376 else {
377 struct rtable *r = v;
5e659e4c 378 int len;
1da177e4 379
0eae88f3
ED
380 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
381 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
1da177e4 382 r->u.dst.dev ? r->u.dst.dev->name : "*",
0eae88f3
ED
383 (__force u32)r->rt_dst,
384 (__force u32)r->rt_gateway,
1da177e4 385 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
0eae88f3 386 r->u.dst.__use, 0, (__force u32)r->rt_src,
1da177e4
LT
387 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
388 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
389 dst_metric(&r->u.dst, RTAX_WINDOW),
390 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
391 dst_metric(&r->u.dst, RTAX_RTTVAR)),
392 r->fl.fl4_tos,
393 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
394 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
395 dev_queue_xmit) : 0,
5e659e4c
PE
396 r->rt_spec_dst, &len);
397
398 seq_printf(seq, "%*s\n", 127 - len, "");
e905a9ed
YH
399 }
400 return 0;
1da177e4
LT
401}
402
f690808e 403static const struct seq_operations rt_cache_seq_ops = {
1da177e4
LT
404 .start = rt_cache_seq_start,
405 .next = rt_cache_seq_next,
406 .stop = rt_cache_seq_stop,
407 .show = rt_cache_seq_show,
408};
409
410static int rt_cache_seq_open(struct inode *inode, struct file *file)
411{
a75e936f 412 return seq_open_net(inode, file, &rt_cache_seq_ops,
cf7732e4 413 sizeof(struct rt_cache_iter_state));
1da177e4
LT
414}
415
9a32144e 416static const struct file_operations rt_cache_seq_fops = {
1da177e4
LT
417 .owner = THIS_MODULE,
418 .open = rt_cache_seq_open,
419 .read = seq_read,
420 .llseek = seq_lseek,
a75e936f 421 .release = seq_release_net,
1da177e4
LT
422};
423
424
425static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
426{
427 int cpu;
428
429 if (*pos == 0)
430 return SEQ_START_TOKEN;
431
0f23174a 432 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
433 if (!cpu_possible(cpu))
434 continue;
435 *pos = cpu+1;
2f970d83 436 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
437 }
438 return NULL;
439}
440
441static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
442{
443 int cpu;
444
0f23174a 445 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
446 if (!cpu_possible(cpu))
447 continue;
448 *pos = cpu+1;
2f970d83 449 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
450 }
451 return NULL;
e905a9ed 452
1da177e4
LT
453}
454
/* seq_file ->stop for the per-CPU statistics: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
459
460static int rt_cpu_seq_show(struct seq_file *seq, void *v)
461{
462 struct rt_cache_stat *st = v;
463
464 if (v == SEQ_START_TOKEN) {
5bec0039 465 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
1da177e4
LT
466 return 0;
467 }
e905a9ed 468
1da177e4
LT
469 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
470 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
471 atomic_read(&ipv4_dst_ops.entries),
472 st->in_hit,
473 st->in_slow_tot,
474 st->in_slow_mc,
475 st->in_no_route,
476 st->in_brd,
477 st->in_martian_dst,
478 st->in_martian_src,
479
480 st->out_hit,
481 st->out_slow_tot,
e905a9ed 482 st->out_slow_mc,
1da177e4
LT
483
484 st->gc_total,
485 st->gc_ignored,
486 st->gc_goal_miss,
487 st->gc_dst_overflow,
488 st->in_hlist_search,
489 st->out_hlist_search
490 );
491 return 0;
492}
493
f690808e 494static const struct seq_operations rt_cpu_seq_ops = {
1da177e4
LT
495 .start = rt_cpu_seq_start,
496 .next = rt_cpu_seq_next,
497 .stop = rt_cpu_seq_stop,
498 .show = rt_cpu_seq_show,
499};
500
501
502static int rt_cpu_seq_open(struct inode *inode, struct file *file)
503{
504 return seq_open(file, &rt_cpu_seq_ops);
505}
506
9a32144e 507static const struct file_operations rt_cpu_seq_fops = {
1da177e4
LT
508 .owner = THIS_MODULE,
509 .open = rt_cpu_seq_open,
510 .read = seq_read,
511 .llseek = seq_lseek,
512 .release = seq_release,
513};
514
78c686e9 515#ifdef CONFIG_NET_CLS_ROUTE
a661c419 516static int rt_acct_proc_show(struct seq_file *m, void *v)
78c686e9 517{
a661c419
AD
518 struct ip_rt_acct *dst, *src;
519 unsigned int i, j;
520
521 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
522 if (!dst)
523 return -ENOMEM;
524
525 for_each_possible_cpu(i) {
526 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
527 for (j = 0; j < 256; j++) {
528 dst[j].o_bytes += src[j].o_bytes;
529 dst[j].o_packets += src[j].o_packets;
530 dst[j].i_bytes += src[j].i_bytes;
531 dst[j].i_packets += src[j].i_packets;
532 }
78c686e9
PE
533 }
534
a661c419
AD
535 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
536 kfree(dst);
537 return 0;
538}
78c686e9 539
a661c419
AD
540static int rt_acct_proc_open(struct inode *inode, struct file *file)
541{
542 return single_open(file, rt_acct_proc_show, NULL);
78c686e9 543}
a661c419
AD
544
545static const struct file_operations rt_acct_proc_fops = {
546 .owner = THIS_MODULE,
547 .open = rt_acct_proc_open,
548 .read = seq_read,
549 .llseek = seq_lseek,
550 .release = single_release,
551};
78c686e9 552#endif
107f1634 553
73b38711 554static int __net_init ip_rt_do_proc_init(struct net *net)
107f1634
PE
555{
556 struct proc_dir_entry *pde;
557
558 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
559 &rt_cache_seq_fops);
560 if (!pde)
561 goto err1;
562
77020720
WC
563 pde = proc_create("rt_cache", S_IRUGO,
564 net->proc_net_stat, &rt_cpu_seq_fops);
107f1634
PE
565 if (!pde)
566 goto err2;
567
107f1634 568#ifdef CONFIG_NET_CLS_ROUTE
a661c419 569 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
107f1634
PE
570 if (!pde)
571 goto err3;
572#endif
573 return 0;
574
575#ifdef CONFIG_NET_CLS_ROUTE
576err3:
577 remove_proc_entry("rt_cache", net->proc_net_stat);
578#endif
579err2:
580 remove_proc_entry("rt_cache", net->proc_net);
581err1:
582 return -ENOMEM;
583}
73b38711
DL
584
585static void __net_exit ip_rt_do_proc_exit(struct net *net)
586{
587 remove_proc_entry("rt_cache", net->proc_net_stat);
588 remove_proc_entry("rt_cache", net->proc_net);
0a931acf 589#ifdef CONFIG_NET_CLS_ROUTE
73b38711 590 remove_proc_entry("rt_acct", net->proc_net);
0a931acf 591#endif
73b38711
DL
592}
593
594static struct pernet_operations ip_rt_proc_ops __net_initdata = {
595 .init = ip_rt_do_proc_init,
596 .exit = ip_rt_do_proc_exit,
597};
598
599static int __init ip_rt_proc_init(void)
600{
601 return register_pernet_subsys(&ip_rt_proc_ops);
602}
603
107f1634 604#else
/* Without CONFIG_PROC_FS there is nothing to register; always succeeds. */
static inline int ip_rt_proc_init(void)
{
	const int ok = 0;

	return ok;
}
1da177e4 609#endif /* CONFIG_PROC_FS */
e905a9ed 610
5969f71d 611static inline void rt_free(struct rtable *rt)
1da177e4 612{
1da177e4
LT
613 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
614}
615
/* Drop the caller's reference on @rt, then queue it for RCU-bh freeing. */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	rt_free(rt);
}
621
5969f71d 622static inline int rt_fast_clean(struct rtable *rth)
1da177e4
LT
623{
624 /* Kill broadcast/multicast entries very aggresively, if they
625 collide in hash table with more useful entries */
626 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
093c2ca4 627 rth->fl.iif && rth->u.dst.rt_next;
1da177e4
LT
628}
629
5969f71d 630static inline int rt_valuable(struct rtable *rth)
1da177e4
LT
631{
632 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
633 rth->u.dst.expires;
634}
635
636static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
637{
638 unsigned long age;
639 int ret = 0;
640
641 if (atomic_read(&rth->u.dst.__refcnt))
642 goto out;
643
644 ret = 1;
645 if (rth->u.dst.expires &&
646 time_after_eq(jiffies, rth->u.dst.expires))
647 goto out;
648
649 age = jiffies - rth->u.dst.lastuse;
650 ret = 0;
651 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
652 (age <= tmo2 && rt_valuable(rth)))
653 goto out;
654 ret = 1;
655out: return ret;
656}
657
658/* Bits of score are:
659 * 31: very valuable
660 * 30: not quite useless
661 * 29..0: usage counter
662 */
663static inline u32 rt_score(struct rtable *rt)
664{
665 u32 score = jiffies - rt->u.dst.lastuse;
666
667 score = ~score & ~(3<<30);
668
669 if (rt_valuable(rt))
670 score |= (1<<31);
671
672 if (!rt->fl.iif ||
673 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
674 score |= (1<<30);
675
676 return score;
677}
678
1080d709
NH
679static inline bool rt_caching(const struct net *net)
680{
681 return net->ipv4.current_rt_cache_rebuild_count <=
682 net->ipv4.sysctl_rt_cache_rebuild_count;
683}
684
685static inline bool compare_hash_inputs(const struct flowi *fl1,
686 const struct flowi *fl2)
687{
0eae88f3
ED
688 return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
689 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
1080d709
NH
690 (fl1->iif ^ fl2->iif)) == 0);
691}
692
1da177e4
LT
693static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
694{
0eae88f3
ED
695 return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
696 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
47dcf0cb 697 (fl1->mark ^ fl2->mark) |
0eae88f3 698 (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
8238b218
DM
699 (fl1->oif ^ fl2->oif) |
700 (fl1->iif ^ fl2->iif)) == 0;
1da177e4
LT
701}
702
b5921910
DL
703static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
704{
09ad9bc7 705 return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
b5921910
DL
706}
707
e84f84f2
DL
708static inline int rt_is_expired(struct rtable *rth)
709{
710 return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
711}
712
beb659bd
ED
713/*
714 * Perform a full scan of hash table and free all entries.
715 * Can be called by a softirq or a process.
716 * In the latter case, we want to reschedule if necessary
717 */
718static void rt_do_flush(int process_context)
719{
720 unsigned int i;
721 struct rtable *rth, *next;
32cb5b4e 722 struct rtable * tail;
beb659bd
ED
723
724 for (i = 0; i <= rt_hash_mask; i++) {
725 if (process_context && need_resched())
726 cond_resched();
727 rth = rt_hash_table[i].chain;
728 if (!rth)
729 continue;
730
731 spin_lock_bh(rt_hash_lock_addr(i));
32cb5b4e
DL
732#ifdef CONFIG_NET_NS
733 {
734 struct rtable ** prev, * p;
735
736 rth = rt_hash_table[i].chain;
737
738 /* defer releasing the head of the list after spin_unlock */
739 for (tail = rth; tail; tail = tail->u.dst.rt_next)
740 if (!rt_is_expired(tail))
741 break;
742 if (rth != tail)
743 rt_hash_table[i].chain = tail;
744
745 /* call rt_free on entries after the tail requiring flush */
746 prev = &rt_hash_table[i].chain;
747 for (p = *prev; p; p = next) {
748 next = p->u.dst.rt_next;
749 if (!rt_is_expired(p)) {
750 prev = &p->u.dst.rt_next;
751 } else {
752 *prev = next;
753 rt_free(p);
754 }
755 }
756 }
757#else
beb659bd
ED
758 rth = rt_hash_table[i].chain;
759 rt_hash_table[i].chain = NULL;
32cb5b4e
DL
760 tail = NULL;
761#endif
beb659bd
ED
762 spin_unlock_bh(rt_hash_lock_addr(i));
763
32cb5b4e 764 for (; rth != tail; rth = next) {
beb659bd
ED
765 next = rth->u.dst.rt_next;
766 rt_free(rth);
767 }
768 }
769}
770
1080d709
NH
771/*
772 * While freeing expired entries, we compute average chain length
773 * and standard deviation, using fixed-point arithmetic.
774 * This gives an estimate for rt_chain_length_max:
775 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
776 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
777 */
778
779#define FRACT_BITS 3
780#define ONE (1UL << FRACT_BITS)
781
98376387
ED
782/*
783 * Given a hash chain and an item in this hash chain,
784 * find if a previous entry has the same hash_inputs
785 * (but differs on tos, mark or oif)
786 * Returns 0 if an alias is found.
787 * Returns ONE if rth has no alias before itself.
788 */
789static int has_noalias(const struct rtable *head, const struct rtable *rth)
790{
791 const struct rtable *aux = head;
792
793 while (aux != rth) {
794 if (compare_hash_inputs(&aux->fl, &rth->fl))
795 return 0;
796 aux = aux->u.dst.rt_next;
797 }
798 return ONE;
799}
800
beb659bd 801static void rt_check_expire(void)
1da177e4 802{
bb1d23b0
ED
803 static unsigned int rover;
804 unsigned int i = rover, goal;
98376387 805 struct rtable *rth, **rthp;
cf8da764 806 unsigned long samples = 0;
1080d709 807 unsigned long sum = 0, sum2 = 0;
125bb8f5 808 unsigned long delta;
bb1d23b0
ED
809 u64 mult;
810
125bb8f5
ED
811 delta = jiffies - expires_ljiffies;
812 expires_ljiffies = jiffies;
813 mult = ((u64)delta) << rt_hash_log;
bb1d23b0
ED
814 if (ip_rt_gc_timeout > 1)
815 do_div(mult, ip_rt_gc_timeout);
816 goal = (unsigned int)mult;
39c90ece
ED
817 if (goal > rt_hash_mask)
818 goal = rt_hash_mask + 1;
bb1d23b0 819 for (; goal > 0; goal--) {
1da177e4 820 unsigned long tmo = ip_rt_gc_timeout;
cf8da764 821 unsigned long length;
1da177e4
LT
822
823 i = (i + 1) & rt_hash_mask;
824 rthp = &rt_hash_table[i].chain;
825
d90bf5a9
ED
826 if (need_resched())
827 cond_resched();
828
1080d709
NH
829 samples++;
830
cfcabdcc 831 if (*rthp == NULL)
bb1d23b0 832 continue;
cf8da764 833 length = 0;
39c90ece 834 spin_lock_bh(rt_hash_lock_addr(i));
1da177e4 835 while ((rth = *rthp) != NULL) {
1ddbcb00 836 prefetch(rth->u.dst.rt_next);
e84f84f2 837 if (rt_is_expired(rth)) {
29e75252
ED
838 *rthp = rth->u.dst.rt_next;
839 rt_free(rth);
840 continue;
841 }
1da177e4
LT
842 if (rth->u.dst.expires) {
843 /* Entry is expired even if it is in use */
39c90ece 844 if (time_before_eq(jiffies, rth->u.dst.expires)) {
1ddbcb00 845nofree:
1da177e4 846 tmo >>= 1;
093c2ca4 847 rthp = &rth->u.dst.rt_next;
1080d709 848 /*
1ddbcb00 849 * We only count entries on
1080d709
NH
850 * a chain with equal hash inputs once
851 * so that entries for different QOS
852 * levels, and other non-hash input
853 * attributes don't unfairly skew
854 * the length computation
855 */
98376387 856 length += has_noalias(rt_hash_table[i].chain, rth);
1da177e4
LT
857 continue;
858 }
1ddbcb00
ED
859 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
860 goto nofree;
1da177e4
LT
861
862 /* Cleanup aged off entries. */
093c2ca4 863 *rthp = rth->u.dst.rt_next;
e905a9ed 864 rt_free(rth);
1da177e4 865 }
39c90ece 866 spin_unlock_bh(rt_hash_lock_addr(i));
1080d709
NH
867 sum += length;
868 sum2 += length*length;
869 }
870 if (samples) {
871 unsigned long avg = sum / samples;
872 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
873 rt_chain_length_max = max_t(unsigned long,
874 ip_rt_gc_elasticity,
875 (avg + 4*sd) >> FRACT_BITS);
1da177e4
LT
876 }
877 rover = i;
beb659bd
ED
878}
879
880/*
881 * rt_worker_func() is run in process context.
29e75252 882 * we call rt_check_expire() to scan part of the hash table
beb659bd
ED
883 */
884static void rt_worker_func(struct work_struct *work)
885{
29e75252 886 rt_check_expire();
39c90ece 887 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
1da177e4
LT
888}
889
29e75252
ED
890/*
891 * Perturbation of rt_genid by a small quantity [1..256]
892 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
893 * many times (2^24) without reusing a recent rt_genid.
894 * Jenkins hash is strong enough that little changes of rt_genid are OK.
1da177e4 895 */
86c657f6 896static void rt_cache_invalidate(struct net *net)
1da177e4 897{
29e75252 898 unsigned char shuffle;
1da177e4 899
29e75252 900 get_random_bytes(&shuffle, sizeof(shuffle));
e84f84f2 901 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
1da177e4
LT
902}
903
29e75252
ED
904/*
905 * delay < 0 : invalidate cache (fast : entries will be deleted later)
906 * delay >= 0 : invalidate & flush cache (can be long)
907 */
/*
 * Invalidate the route cache of @net.  With a negative @delay only the
 * generation id is changed; with @delay >= 0 the hash table is also
 * flushed right away.
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);

	if (delay < 0)
		return;

	rt_do_flush(!in_softirq());
}
914
a5ee1551
EB
915/* Flush previous cache invalidated entries from the cache */
/* Run the hash-table flush without changing any generation id. */
void rt_cache_flush_batch(void)
{
	int may_sleep = !in_softirq();

	rt_do_flush(may_sleep);
}
920
beb659bd 921/*
29e75252 922 * We change rt_genid and let gc do the cleanup
beb659bd 923 */
9f5e97e5 924static void rt_secret_rebuild(unsigned long __net)
1da177e4 925{
9f5e97e5 926 struct net *net = (struct net *)__net;
86c657f6 927 rt_cache_invalidate(net);
9f5e97e5 928 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
1da177e4
LT
929}
930
1080d709
NH
931static void rt_secret_rebuild_oneshot(struct net *net)
932{
933 del_timer_sync(&net->ipv4.rt_secret_timer);
934 rt_cache_invalidate(net);
858a18a6
VG
935 if (ip_rt_secret_interval)
936 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
1080d709
NH
937}
938
939static void rt_emergency_hash_rebuild(struct net *net)
940{
941 if (net_ratelimit()) {
942 printk(KERN_WARNING "Route hash chain too long!\n");
943 printk(KERN_WARNING "Adjust your secret_interval!\n");
944 }
945
946 rt_secret_rebuild_oneshot(net);
947}
948
1da177e4
LT
949/*
950 Short description of GC goals.
951
952 We want to build an algorithm that keeps the routing cache
953 at some equilibrium point, where the number of aged-off entries
954 is kept approximately equal to the number of newly generated ones.
955
956 The current expiration strength is the variable "expire".
957 We try to adjust it dynamically, so that when networking
958 is idle, expire is large enough to keep enough warm entries,
959 and when load increases, it shrinks to limit the cache size.
960 */
961
569d3645 962static int rt_garbage_collect(struct dst_ops *ops)
1da177e4
LT
963{
964 static unsigned long expire = RT_GC_TIMEOUT;
965 static unsigned long last_gc;
966 static int rover;
967 static int equilibrium;
968 struct rtable *rth, **rthp;
969 unsigned long now = jiffies;
970 int goal;
971
972 /*
973 * Garbage collection is pretty expensive,
974 * do not make it too frequently.
975 */
976
977 RT_CACHE_STAT_INC(gc_total);
978
979 if (now - last_gc < ip_rt_gc_min_interval &&
980 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
981 RT_CACHE_STAT_INC(gc_ignored);
982 goto out;
983 }
984
985 /* Calculate number of entries, which we want to expire now. */
986 goal = atomic_read(&ipv4_dst_ops.entries) -
987 (ip_rt_gc_elasticity << rt_hash_log);
988 if (goal <= 0) {
989 if (equilibrium < ipv4_dst_ops.gc_thresh)
990 equilibrium = ipv4_dst_ops.gc_thresh;
991 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
992 if (goal > 0) {
b790cedd 993 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1da177e4
LT
994 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
995 }
996 } else {
997 /* We are in dangerous area. Try to reduce cache really
998 * aggressively.
999 */
b790cedd 1000 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1da177e4
LT
1001 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
1002 }
1003
1004 if (now - last_gc >= ip_rt_gc_min_interval)
1005 last_gc = now;
1006
1007 if (goal <= 0) {
1008 equilibrium += goal;
1009 goto work_done;
1010 }
1011
1012 do {
1013 int i, k;
1014
1015 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1016 unsigned long tmo = expire;
1017
1018 k = (k + 1) & rt_hash_mask;
1019 rthp = &rt_hash_table[k].chain;
22c047cc 1020 spin_lock_bh(rt_hash_lock_addr(k));
1da177e4 1021 while ((rth = *rthp) != NULL) {
e84f84f2 1022 if (!rt_is_expired(rth) &&
29e75252 1023 !rt_may_expire(rth, tmo, expire)) {
1da177e4 1024 tmo >>= 1;
093c2ca4 1025 rthp = &rth->u.dst.rt_next;
1da177e4
LT
1026 continue;
1027 }
093c2ca4 1028 *rthp = rth->u.dst.rt_next;
1da177e4
LT
1029 rt_free(rth);
1030 goal--;
1da177e4 1031 }
22c047cc 1032 spin_unlock_bh(rt_hash_lock_addr(k));
1da177e4
LT
1033 if (goal <= 0)
1034 break;
1035 }
1036 rover = k;
1037
1038 if (goal <= 0)
1039 goto work_done;
1040
1041 /* Goal is not achieved. We stop process if:
1042
1043 - if expire reduced to zero. Otherwise, expire is halfed.
1044 - if table is not full.
1045 - if we are called from interrupt.
1046 - jiffies check is just fallback/debug loop breaker.
1047 We will not spin here for long time in any case.
1048 */
1049
1050 RT_CACHE_STAT_INC(gc_goal_miss);
1051
1052 if (expire == 0)
1053 break;
1054
1055 expire >>= 1;
1056#if RT_CACHE_DEBUG >= 2
1057 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1058 atomic_read(&ipv4_dst_ops.entries), goal, i);
1059#endif
1060
1061 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1062 goto out;
1063 } while (!in_softirq() && time_before_eq(jiffies, now));
1064
1065 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1066 goto out;
1067 if (net_ratelimit())
1068 printk(KERN_WARNING "dst cache overflow\n");
1069 RT_CACHE_STAT_INC(gc_dst_overflow);
1070 return 1;
1071
1072work_done:
1073 expire += ip_rt_gc_min_interval;
1074 if (expire > ip_rt_gc_timeout ||
1075 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1076 expire = ip_rt_gc_timeout;
1077#if RT_CACHE_DEBUG >= 2
1078 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1079 atomic_read(&ipv4_dst_ops.entries), goal, rover);
1080#endif
1081out: return 0;
1082}
1083
98376387
ED
1084/*
1085 * Returns number of entries in a hash chain that have different hash_inputs
1086 */
1087static int slow_chain_length(const struct rtable *head)
1088{
1089 int length = 0;
1090 const struct rtable *rth = head;
1091
1092 while (rth) {
1093 length += has_noalias(head, rth);
1094 rth = rth->u.dst.rt_next;
1095 }
1096 return length >> FRACT_BITS;
1097}
1098
/*
 * rt_intern_hash - insert route @rt into the cache bucket @hash.
 *
 * The resulting route is handed back through *@rp when @rp is non-NULL,
 * otherwise it is attached to @skb with skb_dst_set().  @ifindex is only
 * used to recompute @hash after an emergency rebuild.
 *
 * - When caching is disabled for rt's namespace, the neighbour is bound
 *   (unicast or output routes only), rt_free() is called on @rt and @rt
 *   itself is handed back without being hashed.
 * - Otherwise the chain is walked under the bucket lock.  Expired entries
 *   are unlinked.  An entry with matching keys and namespace is moved to
 *   the head of the chain and returned in place of @rt, which is dropped.
 * - While walking, the unreferenced entry with the lowest rt_score() is
 *   remembered and evicted if the chain is longer than
 *   ip_rt_gc_elasticity.
 * - With no eviction candidate and an over-long chain, an emergency hash
 *   rebuild is requested, the hash recomputed and the insert restarted.
 * - If arp_bind_neighbour() fails with -ENOBUFS and we are not in
 *   softirq, one aggressive garbage collection pass is tried before
 *   giving up.
 *
 * Returns 0 on success or a negative errno; on error @rt has been dropped.
 */
511c3f92 1099static int rt_intern_hash(unsigned hash, struct rtable *rt,
6a2bad70 1100 struct rtable **rp, struct sk_buff *skb, int ifindex)
1da177e4
LT
1101{
1102 struct rtable *rth, **rthp;
1103 unsigned long now;
1104 struct rtable *cand, **candp;
1105 u32 min_score;
1106 int chain_length;
/* One GC-and-retry attempt is allowed, but only outside softirq. */
1107 int attempts = !in_softirq();
1108
1109restart:
1110 chain_length = 0;
1111 min_score = ~(u32)0;
1112 cand = NULL;
1113 candp = NULL;
1114 now = jiffies;
1115
1080d709 1116 if (!rt_caching(dev_net(rt->u.dst.dev))) {
73e42897
NH
1117 /*
1118 * If we're not caching, just tell the caller we
1119 * were successful and don't touch the route. The
1120 * caller hold the sole reference to the cache entry, and
1121 * it will be released when the caller is done with it.
1122 * If we drop it here, the callers have no way to resolve routes
1123 * when we're not caching. Instead, just point *rp at rt, so
1124 * the caller gets a single use out of the route
b6280b47
NH
1125 * Note that we do rt_free on this new route entry, so that
1126 * once its refcount hits zero, we are still able to reap it
1127 * (Thanks Alexey)
1128 * Note also the rt_free uses call_rcu. We don't actually
1129 * need rcu protection here, this is just our path to get
1130 * on the route gc list.
73e42897 1131 */
b6280b47
NH
1132
1133 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1134 int err = arp_bind_neighbour(&rt->u.dst);
1135 if (err) {
1136 if (net_ratelimit())
1137 printk(KERN_WARNING
1138 "Neighbour table failure & not caching routes.\n");
1139 rt_drop(rt);
1140 return err;
1141 }
1142 }
1143
1144 rt_free(rt);
1145 goto skip_hashing;
1080d709
NH
1146 }
1147
1da177e4
LT
/* Caching is enabled: walk the bucket's chain under its lock. */
1148 rthp = &rt_hash_table[hash].chain;
1149
22c047cc 1150 spin_lock_bh(rt_hash_lock_addr(hash));
1da177e4 1151 while ((rth = *rthp) != NULL) {
e84f84f2 1152 if (rt_is_expired(rth)) {
29e75252
ED
1153 *rthp = rth->u.dst.rt_next;
1154 rt_free(rth);
1155 continue;
1156 }
b5921910 1157 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1da177e4 1158 /* Put it first */
093c2ca4 1159 *rthp = rth->u.dst.rt_next;
1da177e4
LT
1160 /*
1161 * Since lookup is lockfree, the deletion
1162 * must be visible to another weakly ordered CPU before
1163 * the insertion at the start of the hash chain.
1164 */
093c2ca4 1165 rcu_assign_pointer(rth->u.dst.rt_next,
1da177e4
LT
1166 rt_hash_table[hash].chain);
1167 /*
1168 * Since lookup is lockfree, the update writes
1169 * must be ordered for consistency on SMP.
1170 */
1171 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1172
03f49f34 1173 dst_use(&rth->u.dst, now);
22c047cc 1174 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1175
/* The existing entry wins; the new route is not needed. */
1176 rt_drop(rt);
511c3f92
ED
1177 if (rp)
1178 *rp = rth;
1179 else
adf30907 1180 skb_dst_set(skb, &rth->u.dst);
1da177e4
LT
1181 return 0;
1182 }
1183
/* Only unreferenced entries qualify as eviction candidates. */
1184 if (!atomic_read(&rth->u.dst.__refcnt)) {
1185 u32 score = rt_score(rth);
1186
1187 if (score <= min_score) {
1188 cand = rth;
1189 candp = rthp;
1190 min_score = score;
1191 }
1192 }
1193
1194 chain_length++;
1195
093c2ca4 1196 rthp = &rth->u.dst.rt_next;
1da177e4
LT
1197 }
1198
1199 if (cand) {
1200 /* ip_rt_gc_elasticity used to be average length of chain
1201 * length, when exceeded gc becomes really aggressive.
1202 *
1203 * The second limit is less certain. At the moment it allows
1204 * only 2 entries per bucket. We will see.
1205 */
1206 if (chain_length > ip_rt_gc_elasticity) {
093c2ca4 1207 *candp = cand->u.dst.rt_next;
1da177e4
LT
1208 rt_free(cand);
1209 }
1080d709 1210 } else {
98376387
ED
1211 if (chain_length > rt_chain_length_max &&
1212 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1080d709
NH
1213 struct net *net = dev_net(rt->u.dst.dev);
1214 int num = ++net->ipv4.current_rt_cache_rebuild_count;
b35ecb5d 1215 if (!rt_caching(net)) {
1080d709
NH
1216 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1217 rt->u.dst.dev->name, num);
1218 }
b35ecb5d 1219 rt_emergency_hash_rebuild(net);
6a2bad70
PE
1220 spin_unlock_bh(rt_hash_lock_addr(hash));
1221
/* Recompute the bucket with the current rt_genid() before retrying. */
1222 hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1223 ifindex, rt_genid(net));
1224 goto restart;
1080d709 1225 }
1da177e4
LT
1226 }
1227
1228 /* Try to bind route to arp only if it is output
1229 route or unicast forwarding path.
1230 */
1231 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1232 int err = arp_bind_neighbour(&rt->u.dst);
1233 if (err) {
22c047cc 1234 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1235
1236 if (err != -ENOBUFS) {
1237 rt_drop(rt);
1238 return err;
1239 }
1240
1241 /* Neighbour tables are full and nothing
1242 can be released. Try to shrink route cache,
1243 it is most likely it holds some neighbour records.
1244 */
1245 if (attempts-- > 0) {
/* Temporarily force the most aggressive GC settings. */
1246 int saved_elasticity = ip_rt_gc_elasticity;
1247 int saved_int = ip_rt_gc_min_interval;
1248 ip_rt_gc_elasticity = 1;
1249 ip_rt_gc_min_interval = 0;
569d3645 1250 rt_garbage_collect(&ipv4_dst_ops);
1da177e4
LT
1251 ip_rt_gc_min_interval = saved_int;
1252 ip_rt_gc_elasticity = saved_elasticity;
1253 goto restart;
1254 }
1255
1256 if (net_ratelimit())
1257 printk(KERN_WARNING "Neighbour table overflow.\n");
1258 rt_drop(rt);
1259 return -ENOBUFS;
1260 }
1261 }
1262
1ddbcb00 1263 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1080d709 1264
1da177e4 1265#if RT_CACHE_DEBUG >= 2
093c2ca4 1266 if (rt->u.dst.rt_next) {
1da177e4 1267 struct rtable *trt;
b6280b47
NH
1268 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1269 hash, &rt->rt_dst);
093c2ca4 1270 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
673d57e7 1271 printk(" . %pI4", &trt->rt_dst);
1da177e4
LT
1272 printk("\n");
1273 }
1274#endif
00269b54
ED
1275 /*
1276 * Since lookup is lockfree, we must make sure
1277 * previous writes to rt are committed to memory
1278 * before making rt visible to other CPUS.
1279 */
1ddbcb00 1280 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1080d709 1281
22c047cc 1282 spin_unlock_bh(rt_hash_lock_addr(hash));
73e42897 1283
b6280b47 1284skip_hashing:
511c3f92
ED
1285 if (rp)
1286 *rp = rt;
1287 else
adf30907 1288 skb_dst_set(skb, &rt->u.dst);
1da177e4
LT
1289 return 0;
1290}
1291
1292void rt_bind_peer(struct rtable *rt, int create)
1293{
1294 static DEFINE_SPINLOCK(rt_peer_lock);
1295 struct inet_peer *peer;
1296
1297 peer = inet_getpeer(rt->rt_dst, create);
1298
1299 spin_lock_bh(&rt_peer_lock);
1300 if (rt->peer == NULL) {
1301 rt->peer = peer;
1302 peer = NULL;
1303 }
1304 spin_unlock_bh(&rt_peer_lock);
1305 if (peer)
1306 inet_putpeer(peer);
1307}
1308
1309/*
1310 * Peer allocation may fail only in serious out-of-memory conditions. However
1311 * we still can generate some output.
1312 * Random ID selection looks a bit dangerous because we have no chances to
1313 * select ID being unique in a reasonable period of time.
1314 * But broken packet identifier may be better than no packet at all.
1315 */
1316static void ip_select_fb_ident(struct iphdr *iph)
1317{
1318 static DEFINE_SPINLOCK(ip_fb_id_lock);
1319 static u32 ip_fallback_id;
1320 u32 salt;
1321
1322 spin_lock_bh(&ip_fb_id_lock);
e448515c 1323 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1da177e4
LT
1324 iph->id = htons(salt & 0xFFFF);
1325 ip_fallback_id = salt;
1326 spin_unlock_bh(&ip_fb_id_lock);
1327}
1328
1329void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1330{
1331 struct rtable *rt = (struct rtable *) dst;
1332
1333 if (rt) {
1334 if (rt->peer == NULL)
1335 rt_bind_peer(rt, 1);
1336
1337 /* If peer is attached to destination, it is never detached,
1338 so that we need not to grab a lock to dereference it.
1339 */
1340 if (rt->peer) {
1341 iph->id = htons(inet_getid(rt->peer, more));
1342 return;
1343 }
1344 } else
e905a9ed 1345 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
9c2b3328 1346 __builtin_return_address(0));
1da177e4
LT
1347
1348 ip_select_fb_ident(iph);
1349}
1350
1351static void rt_del(unsigned hash, struct rtable *rt)
1352{
29e75252 1353 struct rtable **rthp, *aux;
1da177e4 1354
29e75252 1355 rthp = &rt_hash_table[hash].chain;
22c047cc 1356 spin_lock_bh(rt_hash_lock_addr(hash));
1da177e4 1357 ip_rt_put(rt);
29e75252 1358 while ((aux = *rthp) != NULL) {
e84f84f2 1359 if (aux == rt || rt_is_expired(aux)) {
29e75252
ED
1360 *rthp = aux->u.dst.rt_next;
1361 rt_free(aux);
1362 continue;
1da177e4 1363 }
29e75252
ED
1364 rthp = &aux->u.dst.rt_next;
1365 }
22c047cc 1366 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1367}
1368
f7655229
AV
/*
 * ip_rt_redirect - apply a redirect from @old_gw to @new_gw.
 *
 * @daddr/@saddr identify the redirected flow; @dev is the device the
 * redirect refers to.
 *
 * The redirect is rejected (and optionally logged) when the new gateway
 * equals the old one, redirects are not accepted on the device, the new
 * gateway is multicast/limited-broadcast/zeronet, route caching is off,
 * or the new gateway fails the on-link / unicast checks.
 *
 * Otherwise every cached output route for the flow is looked up under
 * all four combinations of {saddr, 0} x {dev->ifindex, 0}.  A matching
 * entry that currently goes through @old_gw on @dev is cloned, the clone
 * is pointed at @new_gw and flagged RTCF_REDIRECTED, netevent listeners
 * are notified, the old entry is deleted and the clone is hashed in.
 */
1369void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1370 __be32 saddr, struct net_device *dev)
1da177e4
LT
1371{
1372 int i, k;
1373 struct in_device *in_dev = in_dev_get(dev);
1374 struct rtable *rth, **rthp;
f7655229 1375 __be32 skeys[2] = { saddr, 0 };
1da177e4 1376 int ikeys[2] = { dev->ifindex, 0 };
8d71740c 1377 struct netevent_redirect netevent;
317805b8 1378 struct net *net;
1da177e4 1379
1da177e4
LT
1380 if (!in_dev)
1381 return;
1382
c346dca1 1383 net = dev_net(dev);
9d4fb27d
JP
1384 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1385 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1386 ipv4_is_zeronet(new_gw))
1da177e4
LT
1387 goto reject_redirect;
1388
1080d709
NH
1389 if (!rt_caching(net))
1390 goto reject_redirect;
1391
1da177e4
LT
1392 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1393 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1394 goto reject_redirect;
1395 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1396 goto reject_redirect;
1397 } else {
317805b8 1398 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1da177e4
LT
1399 goto reject_redirect;
1400 }
1401
1402 for (i = 0; i < 2; i++) {
1403 for (k = 0; k < 2; k++) {
b00180de 1404 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
e84f84f2 1405 rt_genid(net));
1da177e4
LT
1406
1407 rthp=&rt_hash_table[hash].chain;
1408
1409 rcu_read_lock();
1410 while ((rth = rcu_dereference(*rthp)) != NULL) {
1411 struct rtable *rt;
1412
/* Skip entries whose lookup key does not match this combination. */
1413 if (rth->fl.fl4_dst != daddr ||
1414 rth->fl.fl4_src != skeys[i] ||
1da177e4 1415 rth->fl.oif != ikeys[k] ||
29e75252 1416 rth->fl.iif != 0 ||
e84f84f2 1417 rt_is_expired(rth) ||
878628fb 1418 !net_eq(dev_net(rth->u.dst.dev), net)) {
093c2ca4 1419 rthp = &rth->u.dst.rt_next;
1da177e4
LT
1420 continue;
1421 }
1422
/* Key matched but the route is not the one being redirected: give up on this chain. */
1423 if (rth->rt_dst != daddr ||
1424 rth->rt_src != saddr ||
1425 rth->u.dst.error ||
1426 rth->rt_gateway != old_gw ||
1427 rth->u.dst.dev != dev)
1428 break;
1429
/* Pin the entry before leaving the RCU read-side section. */
1430 dst_hold(&rth->u.dst);
1431 rcu_read_unlock();
1432
1433 rt = dst_alloc(&ipv4_dst_ops);
1434 if (rt == NULL) {
1435 ip_rt_put(rth);
1436 in_dev_put(in_dev);
1437 return;
1438 }
1439
1440 /* Copy all the information. */
1441 *rt = *rth;
1da177e4
LT
/* Reset the per-entry state that must not be shared with the original. */
1442 rt->u.dst.__use = 1;
1443 atomic_set(&rt->u.dst.__refcnt, 1);
1444 rt->u.dst.child = NULL;
1445 if (rt->u.dst.dev)
1446 dev_hold(rt->u.dst.dev);
1447 if (rt->idev)
1448 in_dev_hold(rt->idev);
d11a4dc1 1449 rt->u.dst.obsolete = -1;
1da177e4
LT
1450 rt->u.dst.lastuse = jiffies;
1451 rt->u.dst.path = &rt->u.dst;
1452 rt->u.dst.neighbour = NULL;
1453 rt->u.dst.hh = NULL;
def8b4fa 1454#ifdef CONFIG_XFRM
1da177e4 1455 rt->u.dst.xfrm = NULL;
def8b4fa 1456#endif
e84f84f2 1457 rt->rt_genid = rt_genid(net);
1da177e4
LT
1458 rt->rt_flags |= RTCF_REDIRECTED;
1459
1460 /* Gateway is different ... */
1461 rt->rt_gateway = new_gw;
1462
1463 /* Redirect received -> path was valid */
1464 dst_confirm(&rth->u.dst);
1465
1466 if (rt->peer)
1467 atomic_inc(&rt->peer->refcnt);
1468
/* The new gateway must resolve to a neighbour in a NUD_VALID state; otherwise
 * kick resolution (if a neighbour exists) and discard the clone. */
1469 if (arp_bind_neighbour(&rt->u.dst) ||
1470 !(rt->u.dst.neighbour->nud_state &
1471 NUD_VALID)) {
1472 if (rt->u.dst.neighbour)
1473 neigh_event_send(rt->u.dst.neighbour, NULL);
1474 ip_rt_put(rth);
1475 rt_drop(rt);
1476 goto do_next;
1477 }
e905a9ed 1478
8d71740c
TT
1479 netevent.old = &rth->u.dst;
1480 netevent.new = &rt->u.dst;
e905a9ed
YH
1481 call_netevent_notifiers(NETEVENT_REDIRECT,
1482 &netevent);
1da177e4
LT
1483
/* rt_del() also releases the reference taken by dst_hold() above. */
1484 rt_del(hash, rth);
6a2bad70 1485 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1da177e4
LT
1486 ip_rt_put(rt);
1487 goto do_next;
1488 }
1489 rcu_read_unlock();
1490 do_next:
1491 ;
1492 }
1493 }
1494 in_dev_put(in_dev);
1495 return;
1496
1497reject_redirect:
1498#ifdef CONFIG_IP_ROUTE_VERBOSE
1499 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
673d57e7
HH
1500 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1501 " Advised path = %pI4 -> %pI4\n",
1502 &old_gw, dev->name, &new_gw,
1503 &saddr, &daddr);
1da177e4
LT
1504#endif
1505 in_dev_put(in_dev);
1506}
1507
1508static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1509{
ee6b9673 1510 struct rtable *rt = (struct rtable *)dst;
1da177e4
LT
1511 struct dst_entry *ret = dst;
1512
1513 if (rt) {
d11a4dc1 1514 if (dst->obsolete > 0) {
1da177e4
LT
1515 ip_rt_put(rt);
1516 ret = NULL;
1517 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
5e016cbf
GR
1518 (rt->u.dst.expires &&
1519 time_after_eq(jiffies, rt->u.dst.expires))) {
8c7bc840 1520 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
b00180de 1521 rt->fl.oif,
e84f84f2 1522 rt_genid(dev_net(dst->dev)));
1da177e4 1523#if RT_CACHE_DEBUG >= 1
673d57e7
HH
1524 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1525 &rt->rt_dst, rt->fl.fl4_tos);
1da177e4
LT
1526#endif
1527 rt_del(hash, rt);
1528 ret = NULL;
1529 }
1530 }
1531 return ret;
1532}
1533
1534/*
1535 * Algorithm:
1536 * 1. The first ip_rt_redirect_number redirects are sent
1537 * with exponential backoff, then we stop sending them at all,
1538 * assuming that the host ignores our redirects.
1539 * 2. If we did not see packets requiring redirects
1540 * during ip_rt_redirect_silence, we assume that the host
1541 * forgot redirected route and start to send redirects again.
1542 *
1543 * This algorithm is much cheaper and more intelligent than dumb load limiting
1544 * in icmp.c.
1545 *
1546 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1547 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1548 */
1549
1550void ip_rt_send_redirect(struct sk_buff *skb)
1551{
511c3f92 1552 struct rtable *rt = skb_rtable(skb);
30038fc6
ED
1553 struct in_device *in_dev;
1554 int log_martians;
1da177e4 1555
30038fc6
ED
1556 rcu_read_lock();
1557 in_dev = __in_dev_get_rcu(rt->u.dst.dev);
1558 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1559 rcu_read_unlock();
1da177e4 1560 return;
30038fc6
ED
1561 }
1562 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1563 rcu_read_unlock();
1da177e4
LT
1564
1565 /* No redirected packets during ip_rt_redirect_silence;
1566 * reset the algorithm.
1567 */
1568 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1569 rt->u.dst.rate_tokens = 0;
1570
1571 /* Too many ignored redirects; do not send anything
1572 * set u.dst.rate_last to the last seen redirected packet.
1573 */
1574 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1575 rt->u.dst.rate_last = jiffies;
30038fc6 1576 return;
1da177e4
LT
1577 }
1578
1579 /* Check for load limit; set rate_last to the latest sent
1580 * redirect.
1581 */
14fb8a76
LY
1582 if (rt->u.dst.rate_tokens == 0 ||
1583 time_after(jiffies,
1da177e4
LT
1584 (rt->u.dst.rate_last +
1585 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1586 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1587 rt->u.dst.rate_last = jiffies;
1588 ++rt->u.dst.rate_tokens;
1589#ifdef CONFIG_IP_ROUTE_VERBOSE
30038fc6 1590 if (log_martians &&
1da177e4
LT
1591 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1592 net_ratelimit())
673d57e7
HH
1593 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1594 &rt->rt_src, rt->rt_iif,
1595 &rt->rt_dst, &rt->rt_gateway);
1da177e4
LT
1596#endif
1597 }
1da177e4
LT
1598}
1599
1600static int ip_error(struct sk_buff *skb)
1601{
511c3f92 1602 struct rtable *rt = skb_rtable(skb);
1da177e4
LT
1603 unsigned long now;
1604 int code;
1605
1606 switch (rt->u.dst.error) {
1607 case EINVAL:
1608 default:
1609 goto out;
1610 case EHOSTUNREACH:
1611 code = ICMP_HOST_UNREACH;
1612 break;
1613 case ENETUNREACH:
1614 code = ICMP_NET_UNREACH;
7c73a6fa
PE
1615 IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1616 IPSTATS_MIB_INNOROUTES);
1da177e4
LT
1617 break;
1618 case EACCES:
1619 code = ICMP_PKT_FILTERED;
1620 break;
1621 }
1622
1623 now = jiffies;
1624 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1625 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1626 rt->u.dst.rate_tokens = ip_rt_error_burst;
1627 rt->u.dst.rate_last = now;
1628 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1629 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1630 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1631 }
1632
1633out: kfree_skb(skb);
1634 return 0;
e905a9ed 1635}
1da177e4
LT
1636
/*
 *	MTU plateau table, in descending order.  The last two values are
 *	not from the RFC but are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau value strictly below @old_mtu, or 68 when
 * @old_mtu is not above any plateau.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	while (plateau < end) {
		if (*plateau < old_mtu)
			return *plateau;
		plateau++;
	}
	return 68;
}
1654
/*
 * ip_rt_frag_needed - lower the cached path MTU for the flow of @iph.
 *
 * @iph is the header whose saddr/daddr/tot_len/ihl are read here;
 * @new_mtu is the proposed MTU; @dev supplies one of the two output
 * interface keys.
 *
 * Cached output routes for the flow are searched under all combinations
 * of {dev->ifindex, 0} x {iph->saddr, 0}.  For each usable entry (MTU
 * metric not locked, not expired, same namespace) the MTU to apply is
 * @new_mtu, or a guess from the plateau table when @new_mtu is below 68
 * or not smaller than the original packet length.  A value below the
 * entry's current MTU is stored (clamped to ip_rt_min_pmtu, in which
 * case the metric is also locked) and the entry is given an expiry.
 *
 * Returns the MTU accepted for the last matching entry, or @new_mtu when
 * that value is zero (no entry accepted an MTU).
 */
b5921910 1655unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
0010e465
TT
1656 unsigned short new_mtu,
1657 struct net_device *dev)
1da177e4 1658{
0010e465 1659 int i, k;
1da177e4
LT
1660 unsigned short old_mtu = ntohs(iph->tot_len);
1661 struct rtable *rth;
0010e465 1662 int ikeys[2] = { dev->ifindex, 0 };
e448515c
AV
1663 __be32 skeys[2] = { iph->saddr, 0, };
1664 __be32 daddr = iph->daddr;
1da177e4
LT
1665 unsigned short est_mtu = 0;
1666
0010e465
TT
1667 for (k = 0; k < 2; k++) {
1668 for (i = 0; i < 2; i++) {
b00180de 1669 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
e84f84f2 1670 rt_genid(net));
0010e465
TT
1671
1672 rcu_read_lock();
1673 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1674 rth = rcu_dereference(rth->u.dst.rt_next)) {
1da177e4
LT
1675 unsigned short mtu = new_mtu;
1676
0010e465
TT
1677 if (rth->fl.fl4_dst != daddr ||
1678 rth->fl.fl4_src != skeys[i] ||
1679 rth->rt_dst != daddr ||
1680 rth->rt_src != iph->saddr ||
1681 rth->fl.oif != ikeys[k] ||
1682 rth->fl.iif != 0 ||
1683 dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1684 !net_eq(dev_net(rth->u.dst.dev), net) ||
6c3b8fc6 1685 rt_is_expired(rth))
0010e465
TT
1686 continue;
1687
/* The reported MTU is unusable: fall back to the plateau table. */
1da177e4
LT
1688 if (new_mtu < 68 || new_mtu >= old_mtu) {
1689
1690 /* BSD 4.2 compatibility hack :-( */
/* Note: old_mtu is function-scoped, so this adjustment persists
 * into later iterations. */
1691 if (mtu == 0 &&
6d273f8d 1692 old_mtu >= dst_mtu(&rth->u.dst) &&
1da177e4
LT
1693 old_mtu >= 68 + (iph->ihl << 2))
1694 old_mtu -= iph->ihl << 2;
1695
1696 mtu = guess_mtu(old_mtu);
1697 }
6d273f8d
RR
1698 if (mtu <= dst_mtu(&rth->u.dst)) {
1699 if (mtu < dst_mtu(&rth->u.dst)) {
1da177e4
LT
1700 dst_confirm(&rth->u.dst);
1701 if (mtu < ip_rt_min_pmtu) {
1702 mtu = ip_rt_min_pmtu;
1703 rth->u.dst.metrics[RTAX_LOCK-1] |=
1704 (1 << RTAX_MTU);
1705 }
1706 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1707 dst_set_expires(&rth->u.dst,
1708 ip_rt_mtu_expires);
1709 }
1710 est_mtu = mtu;
1711 }
1712 }
0010e465 1713 rcu_read_unlock();
1da177e4 1714 }
1da177e4
LT
1715 }
1716 return est_mtu ? : new_mtu;
1717}
1718
1719static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1720{
6d273f8d 1721 if (dst_mtu(dst) > mtu && mtu >= 68 &&
1da177e4
LT
1722 !(dst_metric_locked(dst, RTAX_MTU))) {
1723 if (mtu < ip_rt_min_pmtu) {
1724 mtu = ip_rt_min_pmtu;
1725 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1726 }
1727 dst->metrics[RTAX_MTU-1] = mtu;
1728 dst_set_expires(dst, ip_rt_mtu_expires);
8d71740c 1729 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1da177e4
LT
1730 }
1731}
1732
1733static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1734{
d11a4dc1
TT
1735 if (rt_is_expired((struct rtable *)dst))
1736 return NULL;
1737 return dst;
1da177e4
LT
1738}
1739
1740static void ipv4_dst_destroy(struct dst_entry *dst)
1741{
1742 struct rtable *rt = (struct rtable *) dst;
1743 struct inet_peer *peer = rt->peer;
1744 struct in_device *idev = rt->idev;
1745
1746 if (peer) {
1747 rt->peer = NULL;
1748 inet_putpeer(peer);
1749 }
1750
1751 if (idev) {
1752 rt->idev = NULL;
1753 in_dev_put(idev);
1754 }
1755}
1756
1757static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1758 int how)
1759{
1760 struct rtable *rt = (struct rtable *) dst;
1761 struct in_device *idev = rt->idev;
c346dca1 1762 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
5a3e55d6 1763 struct in_device *loopback_idev =
c346dca1 1764 in_dev_get(dev_net(dev)->loopback_dev);
1da177e4
LT
1765 if (loopback_idev) {
1766 rt->idev = loopback_idev;
1767 in_dev_put(idev);
1768 }
1769 }
1770}
1771
1772static void ipv4_link_failure(struct sk_buff *skb)
1773{
1774 struct rtable *rt;
1775
1776 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1777
511c3f92 1778 rt = skb_rtable(skb);
1da177e4
LT
1779 if (rt)
1780 dst_set_expires(&rt->u.dst, 0);
1781}
1782
1783static int ip_rt_bug(struct sk_buff *skb)
1784{
673d57e7
HH
1785 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1786 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1da177e4
LT
1787 skb->dev ? skb->dev->name : "?");
1788 kfree_skb(skb);
1789 return 0;
1790}
1791
1792/*
1793 We do not cache source address of outgoing interface,
1794 because it is used only by IP RR, TS and SRR options,
1795 so that it out of fast path.
1796
1797 BTW remember: "addr" is allowed to be not aligned
1798 in IP options!
1799 */
1800
1801void ip_rt_get_source(u8 *addr, struct rtable *rt)
1802{
a61ced5d 1803 __be32 src;
1da177e4
LT
1804 struct fib_result res;
1805
1806 if (rt->fl.iif == 0)
1807 src = rt->rt_src;
c346dca1 1808 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1da177e4
LT
1809 src = FIB_RES_PREFSRC(res);
1810 fib_res_put(&res);
1811 } else
1812 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1813 RT_SCOPE_UNIVERSE);
1814 memcpy(addr, &src, 4);
1815}
1816
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Fill in the halves of rt's tclassid that are still zero from the
 * corresponding halves of @tag: first the low 16 bits, then the high 16.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	static const u32 halves[] = { 0xFFFF, 0xFFFF0000 };
	int n;

	for (n = 0; n < ARRAY_SIZE(halves); n++) {
		if (!(rt->u.dst.tclassid & halves[n]))
			rt->u.dst.tclassid |= tag & halves[n];
	}
}
#endif
1826
1827static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1828{
1829 struct fib_info *fi = res->fi;
1830
1831 if (fi) {
1832 if (FIB_RES_GW(*res) &&
1833 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1834 rt->rt_gateway = FIB_RES_GW(*res);
1835 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1836 sizeof(rt->u.dst.metrics));
1837 if (fi->fib_mtu == 0) {
1838 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
0bbeafd0 1839 if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1da177e4
LT
1840 rt->rt_gateway != rt->rt_dst &&
1841 rt->u.dst.dev->mtu > 576)
1842 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1843 }
1844#ifdef CONFIG_NET_CLS_ROUTE
1845 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1846#endif
1847 } else
1848 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1849
5ffc02a1 1850 if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1da177e4 1851 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
6d273f8d 1852 if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1da177e4 1853 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
5ffc02a1 1854 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1da177e4
LT
1855 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1856 ip_rt_min_advmss);
5ffc02a1 1857 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1da177e4
LT
1858 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1859
1860#ifdef CONFIG_NET_CLS_ROUTE
1861#ifdef CONFIG_IP_MULTIPLE_TABLES
1862 set_class_tag(rt, fib_rules_tclass(res));
1863#endif
1864 set_class_tag(rt, itag);
1865#endif
e905a9ed 1866 rt->rt_type = res->type;
1da177e4
LT
1867}
1868
9e12bb22 1869static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1870 u8 tos, struct net_device *dev, int our)
1871{
1872 unsigned hash;
1873 struct rtable *rth;
a61ced5d 1874 __be32 spec_dst;
1da177e4
LT
1875 struct in_device *in_dev = in_dev_get(dev);
1876 u32 itag = 0;
1877
1878 /* Primary sanity checks. */
1879
1880 if (in_dev == NULL)
1881 return -EINVAL;
1882
1e637c74 1883 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
f97c1e0c 1884 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1da177e4
LT
1885 goto e_inval;
1886
f97c1e0c
JP
1887 if (ipv4_is_zeronet(saddr)) {
1888 if (!ipv4_is_local_multicast(daddr))
1da177e4
LT
1889 goto e_inval;
1890 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1891 } else if (fib_validate_source(saddr, 0, tos, 0,
b0c110ca 1892 dev, &spec_dst, &itag, 0) < 0)
1da177e4
LT
1893 goto e_inval;
1894
1895 rth = dst_alloc(&ipv4_dst_ops);
1896 if (!rth)
1897 goto e_nobufs;
1898
d11a4dc1
TT
1899 rth->u.dst.output = ip_rt_bug;
1900 rth->u.dst.obsolete = -1;
1da177e4
LT
1901
1902 atomic_set(&rth->u.dst.__refcnt, 1);
1903 rth->u.dst.flags= DST_HOST;
42f811b8 1904 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4
LT
1905 rth->u.dst.flags |= DST_NOPOLICY;
1906 rth->fl.fl4_dst = daddr;
1907 rth->rt_dst = daddr;
1908 rth->fl.fl4_tos = tos;
47dcf0cb 1909 rth->fl.mark = skb->mark;
1da177e4
LT
1910 rth->fl.fl4_src = saddr;
1911 rth->rt_src = saddr;
1912#ifdef CONFIG_NET_CLS_ROUTE
1913 rth->u.dst.tclassid = itag;
1914#endif
1915 rth->rt_iif =
1916 rth->fl.iif = dev->ifindex;
2774c7ab 1917 rth->u.dst.dev = init_net.loopback_dev;
1da177e4
LT
1918 dev_hold(rth->u.dst.dev);
1919 rth->idev = in_dev_get(rth->u.dst.dev);
1920 rth->fl.oif = 0;
1921 rth->rt_gateway = daddr;
1922 rth->rt_spec_dst= spec_dst;
e84f84f2 1923 rth->rt_genid = rt_genid(dev_net(dev));
1da177e4 1924 rth->rt_flags = RTCF_MULTICAST;
29e75252 1925 rth->rt_type = RTN_MULTICAST;
1da177e4
LT
1926 if (our) {
1927 rth->u.dst.input= ip_local_deliver;
1928 rth->rt_flags |= RTCF_LOCAL;
1929 }
1930
1931#ifdef CONFIG_IP_MROUTE
f97c1e0c 1932 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1da177e4
LT
1933 rth->u.dst.input = ip_mr_input;
1934#endif
1935 RT_CACHE_STAT_INC(in_slow_mc);
1936
1937 in_dev_put(in_dev);
e84f84f2 1938 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
6a2bad70 1939 return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1da177e4
LT
1940
1941e_nobufs:
1942 in_dev_put(in_dev);
1943 return -ENOBUFS;
1944
1945e_inval:
1946 in_dev_put(in_dev);
1947 return -EINVAL;
1948}
1949
1950
1951static void ip_handle_martian_source(struct net_device *dev,
1952 struct in_device *in_dev,
1953 struct sk_buff *skb,
9e12bb22
AV
1954 __be32 daddr,
1955 __be32 saddr)
1da177e4
LT
1956{
1957 RT_CACHE_STAT_INC(in_martian_src);
1958#ifdef CONFIG_IP_ROUTE_VERBOSE
1959 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1960 /*
1961 * RFC1812 recommendation, if source is martian,
1962 * the only hint is MAC header.
1963 */
673d57e7
HH
1964 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1965 &daddr, &saddr, dev->name);
98e399f8 1966 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1da177e4 1967 int i;
98e399f8 1968 const unsigned char *p = skb_mac_header(skb);
1da177e4
LT
1969 printk(KERN_WARNING "ll header: ");
1970 for (i = 0; i < dev->hard_header_len; i++, p++) {
1971 printk("%02x", *p);
1972 if (i < (dev->hard_header_len - 1))
1973 printk(":");
1974 }
1975 printk("\n");
1976 }
1977 }
1978#endif
1979}
1980
5969f71d
SH
/*
 * __mkroute_input - allocate and fill a cache entry for a forwarded
 * input packet.
 *
 * @skb:    packet being routed (protocol and mark are read)
 * @res:    FIB lookup result selecting the output device / next hop
 * @in_dev: in_device of the receiving interface
 * @daddr, @saddr, @tos: flow key
 * @result: receives the new route on success
 *
 * The source is validated with fib_validate_source(); a negative result
 * is reported as a martian source.  A positive result sets
 * RTCF_DIRECTSRC and, if the packet would leave through the interface it
 * came in on, RTCF_DOREDIRECT.  Non-IP packets that would go back out
 * the receiving interface are refused unless IN_DEV_PROXY_ARP_PVLAN is
 * set on it.
 *
 * Returns 0 with *@result set, -EINVAL, or -ENOBUFS.  The route is not
 * hashed here.
 */
1981static int __mkroute_input(struct sk_buff *skb,
1982 struct fib_result *res,
1983 struct in_device *in_dev,
1984 __be32 daddr, __be32 saddr, u32 tos,
1985 struct rtable **result)
1da177e4
LT
1986{
1987
1988 struct rtable *rth;
1989 int err;
1990 struct in_device *out_dev;
1991 unsigned flags = 0;
d9c9df8c
AV
1992 __be32 spec_dst;
1993 u32 itag;
1da177e4
LT
1994
1995 /* get a working reference to the output device */
1996 out_dev = in_dev_get(FIB_RES_DEV(*res));
1997 if (out_dev == NULL) {
1998 if (net_ratelimit())
1999 printk(KERN_CRIT "Bug in ip_route_input" \
2000 "_slow(). Please, report\n");
2001 return -EINVAL;
2002 }
2003
2004
e905a9ed 2005 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
b0c110ca 2006 in_dev->dev, &spec_dst, &itag, skb->mark);
1da177e4 2007 if (err < 0) {
e905a9ed 2008 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1da177e4 2009 saddr);
e905a9ed 2010
1da177e4
LT
2011 err = -EINVAL;
2012 goto cleanup;
2013 }
2014
/* From here on err is only used as a flag: non-zero means the positive
 * result of fib_validate_source(). */
2015 if (err)
2016 flags |= RTCF_DIRECTSRC;
2017
51b77cae 2018 if (out_dev == in_dev && err &&
1da177e4
LT
2019 (IN_DEV_SHARED_MEDIA(out_dev) ||
2020 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2021 flags |= RTCF_DOREDIRECT;
2022
2023 if (skb->protocol != htons(ETH_P_IP)) {
2024 /* Not IP (i.e. ARP). Do not create route, if it is
2025 * invalid for proxy arp. DNAT routes are always valid.
65324144
JDB
2026 *
2027 * Proxy arp feature have been extended to allow, ARP
2028 * replies back to the same interface, to support
2029 * Private VLAN switch technologies. See arp.c.
1da177e4 2030 */
65324144
JDB
2031 if (out_dev == in_dev &&
2032 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1da177e4
LT
2033 err = -EINVAL;
2034 goto cleanup;
2035 }
2036 }
2037
2038
2039 rth = dst_alloc(&ipv4_dst_ops);
2040 if (!rth) {
2041 err = -ENOBUFS;
2042 goto cleanup;
2043 }
2044
ce723d8e 2045 atomic_set(&rth->u.dst.__refcnt, 1);
1da177e4 2046 rth->u.dst.flags= DST_HOST;
42f811b8 2047 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4 2048 rth->u.dst.flags |= DST_NOPOLICY;
42f811b8 2049 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1da177e4
LT
2050 rth->u.dst.flags |= DST_NOXFRM;
2051 rth->fl.fl4_dst = daddr;
2052 rth->rt_dst = daddr;
2053 rth->fl.fl4_tos = tos;
47dcf0cb 2054 rth->fl.mark = skb->mark;
1da177e4
LT
2055 rth->fl.fl4_src = saddr;
2056 rth->rt_src = saddr;
2057 rth->rt_gateway = daddr;
2058 rth->rt_iif =
2059 rth->fl.iif = in_dev->dev->ifindex;
/* The route takes its own references on the output device and its in_device. */
2060 rth->u.dst.dev = (out_dev)->dev;
2061 dev_hold(rth->u.dst.dev);
2062 rth->idev = in_dev_get(rth->u.dst.dev);
2063 rth->fl.oif = 0;
2064 rth->rt_spec_dst= spec_dst;
2065
d11a4dc1 2066 rth->u.dst.obsolete = -1;
1da177e4
LT
2067 rth->u.dst.input = ip_forward;
2068 rth->u.dst.output = ip_output;
e84f84f2 2069 rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
1da177e4
LT
2070
2071 rt_set_nexthop(rth, res, itag);
2072
2073 rth->rt_flags = flags;
2074
2075 *result = rth;
2076 err = 0;
2077 cleanup:
2078 /* release the working reference to the output device */
2079 in_dev_put(out_dev);
2080 return err;
e905a9ed 2081}
1da177e4 2082
5969f71d
SH
2083static int ip_mkroute_input(struct sk_buff *skb,
2084 struct fib_result *res,
2085 const struct flowi *fl,
2086 struct in_device *in_dev,
2087 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 2088{
7abaa27c 2089 struct rtable* rth = NULL;
1da177e4
LT
2090 int err;
2091 unsigned hash;
2092
2093#ifdef CONFIG_IP_ROUTE_MULTIPATH
2094 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2095 fib_select_multipath(fl, res);
2096#endif
2097
2098 /* create a routing cache entry */
2099 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2100 if (err)
2101 return err;
1da177e4
LT
2102
2103 /* put it into the cache */
e84f84f2
DL
2104 hash = rt_hash(daddr, saddr, fl->iif,
2105 rt_genid(dev_net(rth->u.dst.dev)));
6a2bad70 2106 return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
1da177e4
LT
2107}
2108
1da177e4
LT
2109/*
2110 * NOTE. We drop all the packets that has local source
2111 * addresses, because every properly looped back packet
2112 * must have correct destination already attached by output routine.
2113 *
2114 * Such approach solves two big problems:
2115 * 1. Not simplex devices are handled properly.
2116 * 2. IP spoofing attempts are filtered with 100% of guarantee.
2117 */
2118
9e12bb22 2119static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
2120 u8 tos, struct net_device *dev)
2121{
2122 struct fib_result res;
2123 struct in_device *in_dev = in_dev_get(dev);
2124 struct flowi fl = { .nl_u = { .ip4_u =
2125 { .daddr = daddr,
2126 .saddr = saddr,
2127 .tos = tos,
2128 .scope = RT_SCOPE_UNIVERSE,
1da177e4 2129 } },
47dcf0cb 2130 .mark = skb->mark,
1da177e4
LT
2131 .iif = dev->ifindex };
2132 unsigned flags = 0;
2133 u32 itag = 0;
2134 struct rtable * rth;
2135 unsigned hash;
9e12bb22 2136 __be32 spec_dst;
1da177e4
LT
2137 int err = -EINVAL;
2138 int free_res = 0;
c346dca1 2139 struct net * net = dev_net(dev);
1da177e4
LT
2140
2141 /* IP on this device is disabled. */
2142
2143 if (!in_dev)
2144 goto out;
2145
2146 /* Check for the most weird martians, which can be not detected
2147 by fib_lookup.
2148 */
2149
1e637c74 2150 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
f97c1e0c 2151 ipv4_is_loopback(saddr))
1da177e4
LT
2152 goto martian_source;
2153
e448515c 2154 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1da177e4
LT
2155 goto brd_input;
2156
2157 /* Accept zero addresses only to limited broadcast;
2158 * I even do not know to fix it or not. Waiting for complains :-)
2159 */
f97c1e0c 2160 if (ipv4_is_zeronet(saddr))
1da177e4
LT
2161 goto martian_source;
2162
1e637c74 2163 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
f97c1e0c 2164 ipv4_is_loopback(daddr))
1da177e4
LT
2165 goto martian_destination;
2166
2167 /*
2168 * Now we are ready to route packet.
2169 */
84a885f4 2170 if ((err = fib_lookup(net, &fl, &res)) != 0) {
1da177e4 2171 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 2172 goto e_hostunreach;
1da177e4
LT
2173 goto no_route;
2174 }
2175 free_res = 1;
2176
2177 RT_CACHE_STAT_INC(in_slow_tot);
2178
2179 if (res.type == RTN_BROADCAST)
2180 goto brd_input;
2181
2182 if (res.type == RTN_LOCAL) {
2183 int result;
2184 result = fib_validate_source(saddr, daddr, tos,
84a885f4 2185 net->loopback_dev->ifindex,
b0c110ca 2186 dev, &spec_dst, &itag, skb->mark);
1da177e4
LT
2187 if (result < 0)
2188 goto martian_source;
2189 if (result)
2190 flags |= RTCF_DIRECTSRC;
2191 spec_dst = daddr;
2192 goto local_input;
2193 }
2194
2195 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 2196 goto e_hostunreach;
1da177e4
LT
2197 if (res.type != RTN_UNICAST)
2198 goto martian_destination;
2199
2200 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1da177e4
LT
2201done:
2202 in_dev_put(in_dev);
2203 if (free_res)
2204 fib_res_put(&res);
2205out: return err;
2206
2207brd_input:
2208 if (skb->protocol != htons(ETH_P_IP))
2209 goto e_inval;
2210
f97c1e0c 2211 if (ipv4_is_zeronet(saddr))
1da177e4
LT
2212 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2213 else {
2214 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
b0c110ca 2215 &itag, skb->mark);
1da177e4
LT
2216 if (err < 0)
2217 goto martian_source;
2218 if (err)
2219 flags |= RTCF_DIRECTSRC;
2220 }
2221 flags |= RTCF_BROADCAST;
2222 res.type = RTN_BROADCAST;
2223 RT_CACHE_STAT_INC(in_brd);
2224
2225local_input:
2226 rth = dst_alloc(&ipv4_dst_ops);
2227 if (!rth)
2228 goto e_nobufs;
2229
2230 rth->u.dst.output= ip_rt_bug;
d11a4dc1 2231 rth->u.dst.obsolete = -1;
e84f84f2 2232 rth->rt_genid = rt_genid(net);
1da177e4
LT
2233
2234 atomic_set(&rth->u.dst.__refcnt, 1);
2235 rth->u.dst.flags= DST_HOST;
42f811b8 2236 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4
LT
2237 rth->u.dst.flags |= DST_NOPOLICY;
2238 rth->fl.fl4_dst = daddr;
2239 rth->rt_dst = daddr;
2240 rth->fl.fl4_tos = tos;
47dcf0cb 2241 rth->fl.mark = skb->mark;
1da177e4
LT
2242 rth->fl.fl4_src = saddr;
2243 rth->rt_src = saddr;
2244#ifdef CONFIG_NET_CLS_ROUTE
2245 rth->u.dst.tclassid = itag;
2246#endif
2247 rth->rt_iif =
2248 rth->fl.iif = dev->ifindex;
84a885f4 2249 rth->u.dst.dev = net->loopback_dev;
1da177e4
LT
2250 dev_hold(rth->u.dst.dev);
2251 rth->idev = in_dev_get(rth->u.dst.dev);
2252 rth->rt_gateway = daddr;
2253 rth->rt_spec_dst= spec_dst;
2254 rth->u.dst.input= ip_local_deliver;
2255 rth->rt_flags = flags|RTCF_LOCAL;
2256 if (res.type == RTN_UNREACHABLE) {
2257 rth->u.dst.input= ip_error;
2258 rth->u.dst.error= -err;
2259 rth->rt_flags &= ~RTCF_LOCAL;
2260 }
2261 rth->rt_type = res.type;
e84f84f2 2262 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
6a2bad70 2263 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
1da177e4
LT
2264 goto done;
2265
2266no_route:
2267 RT_CACHE_STAT_INC(in_no_route);
2268 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2269 res.type = RTN_UNREACHABLE;
7f53878d
MC
2270 if (err == -ESRCH)
2271 err = -ENETUNREACH;
1da177e4
LT
2272 goto local_input;
2273
2274 /*
2275 * Do not cache martian addresses: they should be logged (RFC1812)
2276 */
2277martian_destination:
2278 RT_CACHE_STAT_INC(in_martian_dst);
2279#ifdef CONFIG_IP_ROUTE_VERBOSE
2280 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
673d57e7
HH
2281 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2282 &daddr, &saddr, dev->name);
1da177e4 2283#endif
2c2910a4
DE
2284
2285e_hostunreach:
e905a9ed
YH
2286 err = -EHOSTUNREACH;
2287 goto done;
2c2910a4 2288
1da177e4
LT
2289e_inval:
2290 err = -EINVAL;
2291 goto done;
2292
2293e_nobufs:
2294 err = -ENOBUFS;
2295 goto done;
2296
2297martian_source:
2298 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2299 goto e_inval;
2300}
2301
9e12bb22 2302int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
2303 u8 tos, struct net_device *dev)
2304{
2305 struct rtable * rth;
2306 unsigned hash;
2307 int iif = dev->ifindex;
b5921910 2308 struct net *net;
1da177e4 2309
c346dca1 2310 net = dev_net(dev);
1080d709
NH
2311
2312 if (!rt_caching(net))
2313 goto skip_cache;
2314
1da177e4 2315 tos &= IPTOS_RT_MASK;
e84f84f2 2316 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
1da177e4
LT
2317
2318 rcu_read_lock();
2319 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
093c2ca4 2320 rth = rcu_dereference(rth->u.dst.rt_next)) {
0eae88f3
ED
2321 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2322 ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
c0b8c32b
SH
2323 (rth->fl.iif ^ iif) |
2324 rth->fl.oif |
2325 (rth->fl.fl4_tos ^ tos)) == 0 &&
47dcf0cb 2326 rth->fl.mark == skb->mark &&
878628fb 2327 net_eq(dev_net(rth->u.dst.dev), net) &&
e84f84f2 2328 !rt_is_expired(rth)) {
03f49f34 2329 dst_use(&rth->u.dst, jiffies);
1da177e4
LT
2330 RT_CACHE_STAT_INC(in_hit);
2331 rcu_read_unlock();
adf30907 2332 skb_dst_set(skb, &rth->u.dst);
1da177e4
LT
2333 return 0;
2334 }
2335 RT_CACHE_STAT_INC(in_hlist_search);
2336 }
2337 rcu_read_unlock();
2338
1080d709 2339skip_cache:
1da177e4
LT
2340 /* Multicast recognition logic is moved from route cache to here.
2341 The problem was that too many Ethernet cards have broken/missing
2342 hardware multicast filters :-( As result the host on multicasting
2343 network acquires a lot of useless route cache entries, sort of
2344 SDR messages from all the world. Now we try to get rid of them.
2345 Really, provided software IP multicast filter is organized
2346 reasonably (at least, hashed), it does not result in a slowdown
2347 comparing with route cache reject entries.
2348 Note, that multicast routers are not affected, because
2349 route cache entry is created eventually.
2350 */
f97c1e0c 2351 if (ipv4_is_multicast(daddr)) {
1da177e4
LT
2352 struct in_device *in_dev;
2353
2354 rcu_read_lock();
e5ed6399 2355 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
1da177e4 2356 int our = ip_check_mc(in_dev, daddr, saddr,
eddc9ec5 2357 ip_hdr(skb)->protocol);
1da177e4
LT
2358 if (our
2359#ifdef CONFIG_IP_MROUTE
9d4fb27d
JP
2360 ||
2361 (!ipv4_is_local_multicast(daddr) &&
2362 IN_DEV_MFORWARD(in_dev))
1da177e4 2363#endif
9d4fb27d 2364 ) {
1da177e4
LT
2365 rcu_read_unlock();
2366 return ip_route_input_mc(skb, daddr, saddr,
2367 tos, dev, our);
2368 }
2369 }
2370 rcu_read_unlock();
2371 return -EINVAL;
2372 }
2373 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2374}
2375
5969f71d
SH
2376static int __mkroute_output(struct rtable **result,
2377 struct fib_result *res,
2378 const struct flowi *fl,
2379 const struct flowi *oldflp,
2380 struct net_device *dev_out,
2381 unsigned flags)
1da177e4
LT
2382{
2383 struct rtable *rth;
2384 struct in_device *in_dev;
2385 u32 tos = RT_FL_TOS(oldflp);
2386 int err = 0;
2387
f97c1e0c 2388 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
1da177e4
LT
2389 return -EINVAL;
2390
e448515c 2391 if (fl->fl4_dst == htonl(0xFFFFFFFF))
1da177e4 2392 res->type = RTN_BROADCAST;
f97c1e0c 2393 else if (ipv4_is_multicast(fl->fl4_dst))
1da177e4 2394 res->type = RTN_MULTICAST;
1e637c74 2395 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
1da177e4
LT
2396 return -EINVAL;
2397
2398 if (dev_out->flags & IFF_LOOPBACK)
2399 flags |= RTCF_LOCAL;
2400
2401 /* get work reference to inet device */
2402 in_dev = in_dev_get(dev_out);
2403 if (!in_dev)
2404 return -EINVAL;
2405
2406 if (res->type == RTN_BROADCAST) {
2407 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2408 if (res->fi) {
2409 fib_info_put(res->fi);
2410 res->fi = NULL;
2411 }
2412 } else if (res->type == RTN_MULTICAST) {
2413 flags |= RTCF_MULTICAST|RTCF_LOCAL;
e905a9ed 2414 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
1da177e4
LT
2415 oldflp->proto))
2416 flags &= ~RTCF_LOCAL;
2417 /* If multicast route do not exist use
2418 default one, but do not gateway in this case.
2419 Yes, it is hack.
2420 */
2421 if (res->fi && res->prefixlen < 4) {
2422 fib_info_put(res->fi);
2423 res->fi = NULL;
2424 }
2425 }
2426
2427
2428 rth = dst_alloc(&ipv4_dst_ops);
2429 if (!rth) {
2430 err = -ENOBUFS;
2431 goto cleanup;
e905a9ed 2432 }
1da177e4 2433
ce723d8e 2434 atomic_set(&rth->u.dst.__refcnt, 1);
1da177e4 2435 rth->u.dst.flags= DST_HOST;
42f811b8 2436 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
1da177e4 2437 rth->u.dst.flags |= DST_NOXFRM;
42f811b8 2438 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4
LT
2439 rth->u.dst.flags |= DST_NOPOLICY;
2440
2441 rth->fl.fl4_dst = oldflp->fl4_dst;
2442 rth->fl.fl4_tos = tos;
2443 rth->fl.fl4_src = oldflp->fl4_src;
2444 rth->fl.oif = oldflp->oif;
47dcf0cb 2445 rth->fl.mark = oldflp->mark;
1da177e4
LT
2446 rth->rt_dst = fl->fl4_dst;
2447 rth->rt_src = fl->fl4_src;
2448 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
e905a9ed 2449 /* get references to the devices that are to be hold by the routing
1da177e4
LT
2450 cache entry */
2451 rth->u.dst.dev = dev_out;
2452 dev_hold(dev_out);
2453 rth->idev = in_dev_get(dev_out);
2454 rth->rt_gateway = fl->fl4_dst;
2455 rth->rt_spec_dst= fl->fl4_src;
2456
2457 rth->u.dst.output=ip_output;
d11a4dc1 2458 rth->u.dst.obsolete = -1;
e84f84f2 2459 rth->rt_genid = rt_genid(dev_net(dev_out));
1da177e4
LT
2460
2461 RT_CACHE_STAT_INC(out_slow_tot);
2462
2463 if (flags & RTCF_LOCAL) {
2464 rth->u.dst.input = ip_local_deliver;
2465 rth->rt_spec_dst = fl->fl4_dst;
2466 }
2467 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2468 rth->rt_spec_dst = fl->fl4_src;
e905a9ed 2469 if (flags & RTCF_LOCAL &&
1da177e4
LT
2470 !(dev_out->flags & IFF_LOOPBACK)) {
2471 rth->u.dst.output = ip_mc_output;
2472 RT_CACHE_STAT_INC(out_slow_mc);
2473 }
2474#ifdef CONFIG_IP_MROUTE
2475 if (res->type == RTN_MULTICAST) {
2476 if (IN_DEV_MFORWARD(in_dev) &&
f97c1e0c 2477 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
1da177e4
LT
2478 rth->u.dst.input = ip_mr_input;
2479 rth->u.dst.output = ip_mc_output;
2480 }
2481 }
2482#endif
2483 }
2484
2485 rt_set_nexthop(rth, res, 0);
2486
2487 rth->rt_flags = flags;
2488
2489 *result = rth;
2490 cleanup:
2491 /* release work reference to inet device */
2492 in_dev_put(in_dev);
2493
2494 return err;
2495}
2496
5969f71d
SH
2497static int ip_mkroute_output(struct rtable **rp,
2498 struct fib_result *res,
2499 const struct flowi *fl,
2500 const struct flowi *oldflp,
2501 struct net_device *dev_out,
2502 unsigned flags)
1da177e4 2503{
7abaa27c 2504 struct rtable *rth = NULL;
1da177e4
LT
2505 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2506 unsigned hash;
2507 if (err == 0) {
b00180de 2508 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
e84f84f2 2509 rt_genid(dev_net(dev_out)));
6a2bad70 2510 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
1da177e4 2511 }
e905a9ed 2512
1da177e4
LT
2513 return err;
2514}
2515
1da177e4
LT
2516/*
2517 * Major route resolver routine.
2518 */
2519
b40afd0e
DL
2520static int ip_route_output_slow(struct net *net, struct rtable **rp,
2521 const struct flowi *oldflp)
1da177e4
LT
2522{
2523 u32 tos = RT_FL_TOS(oldflp);
2524 struct flowi fl = { .nl_u = { .ip4_u =
2525 { .daddr = oldflp->fl4_dst,
2526 .saddr = oldflp->fl4_src,
2527 .tos = tos & IPTOS_RT_MASK,
2528 .scope = ((tos & RTO_ONLINK) ?
2529 RT_SCOPE_LINK :
2530 RT_SCOPE_UNIVERSE),
1da177e4 2531 } },
47dcf0cb 2532 .mark = oldflp->mark,
b40afd0e 2533 .iif = net->loopback_dev->ifindex,
1da177e4
LT
2534 .oif = oldflp->oif };
2535 struct fib_result res;
2536 unsigned flags = 0;
2537 struct net_device *dev_out = NULL;
2538 int free_res = 0;
2539 int err;
2540
2541
2542 res.fi = NULL;
2543#ifdef CONFIG_IP_MULTIPLE_TABLES
2544 res.r = NULL;
2545#endif
2546
2547 if (oldflp->fl4_src) {
2548 err = -EINVAL;
f97c1e0c 2549 if (ipv4_is_multicast(oldflp->fl4_src) ||
1e637c74 2550 ipv4_is_lbcast(oldflp->fl4_src) ||
f97c1e0c 2551 ipv4_is_zeronet(oldflp->fl4_src))
1da177e4
LT
2552 goto out;
2553
1da177e4
LT
2554 /* I removed check for oif == dev_out->oif here.
2555 It was wrong for two reasons:
1ab35276
DL
2556 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2557 is assigned to multiple interfaces.
1da177e4
LT
2558 2. Moreover, we are allowed to send packets with saddr
2559 of another iface. --ANK
2560 */
2561
9d4fb27d
JP
2562 if (oldflp->oif == 0 &&
2563 (ipv4_is_multicast(oldflp->fl4_dst) ||
2564 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
a210d01a
JA
2565 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2566 dev_out = ip_dev_find(net, oldflp->fl4_src);
2567 if (dev_out == NULL)
2568 goto out;
2569
1da177e4
LT
2570 /* Special hack: user can direct multicasts
2571 and limited broadcast via necessary interface
2572 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2573 This hack is not just for fun, it allows
2574 vic,vat and friends to work.
2575 They bind socket to loopback, set ttl to zero
2576 and expect that it will work.
2577 From the viewpoint of routing cache they are broken,
2578 because we are not allowed to build multicast path
2579 with loopback source addr (look, routing cache
2580 cannot know, that ttl is zero, so that packet
2581 will not leave this host and route is valid).
2582 Luckily, this hack is good workaround.
2583 */
2584
2585 fl.oif = dev_out->ifindex;
2586 goto make_route;
2587 }
a210d01a
JA
2588
2589 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2590 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2591 dev_out = ip_dev_find(net, oldflp->fl4_src);
2592 if (dev_out == NULL)
2593 goto out;
1da177e4 2594 dev_put(dev_out);
a210d01a
JA
2595 dev_out = NULL;
2596 }
1da177e4
LT
2597 }
2598
2599
2600 if (oldflp->oif) {
b40afd0e 2601 dev_out = dev_get_by_index(net, oldflp->oif);
1da177e4
LT
2602 err = -ENODEV;
2603 if (dev_out == NULL)
2604 goto out;
e5ed6399
HX
2605
2606 /* RACE: Check return value of inet_select_addr instead. */
2607 if (__in_dev_get_rtnl(dev_out) == NULL) {
1da177e4
LT
2608 dev_put(dev_out);
2609 goto out; /* Wrong error code */
2610 }
2611
f97c1e0c
JP
2612 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2613 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
1da177e4
LT
2614 if (!fl.fl4_src)
2615 fl.fl4_src = inet_select_addr(dev_out, 0,
2616 RT_SCOPE_LINK);
2617 goto make_route;
2618 }
2619 if (!fl.fl4_src) {
f97c1e0c 2620 if (ipv4_is_multicast(oldflp->fl4_dst))
1da177e4
LT
2621 fl.fl4_src = inet_select_addr(dev_out, 0,
2622 fl.fl4_scope);
2623 else if (!oldflp->fl4_dst)
2624 fl.fl4_src = inet_select_addr(dev_out, 0,
2625 RT_SCOPE_HOST);
2626 }
2627 }
2628
2629 if (!fl.fl4_dst) {
2630 fl.fl4_dst = fl.fl4_src;
2631 if (!fl.fl4_dst)
2632 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2633 if (dev_out)
2634 dev_put(dev_out);
b40afd0e 2635 dev_out = net->loopback_dev;
1da177e4 2636 dev_hold(dev_out);
b40afd0e 2637 fl.oif = net->loopback_dev->ifindex;
1da177e4
LT
2638 res.type = RTN_LOCAL;
2639 flags |= RTCF_LOCAL;
2640 goto make_route;
2641 }
2642
b40afd0e 2643 if (fib_lookup(net, &fl, &res)) {
1da177e4
LT
2644 res.fi = NULL;
2645 if (oldflp->oif) {
2646 /* Apparently, routing tables are wrong. Assume,
2647 that the destination is on link.
2648
2649 WHY? DW.
2650 Because we are allowed to send to iface
2651 even if it has NO routes and NO assigned
2652 addresses. When oif is specified, routing
2653 tables are looked up with only one purpose:
2654 to catch if destination is gatewayed, rather than
2655 direct. Moreover, if MSG_DONTROUTE is set,
2656 we send packet, ignoring both routing tables
2657 and ifaddr state. --ANK
2658
2659
2660 We could make it even if oif is unknown,
2661 likely IPv6, but we do not.
2662 */
2663
2664 if (fl.fl4_src == 0)
2665 fl.fl4_src = inet_select_addr(dev_out, 0,
2666 RT_SCOPE_LINK);
2667 res.type = RTN_UNICAST;
2668 goto make_route;
2669 }
2670 if (dev_out)
2671 dev_put(dev_out);
2672 err = -ENETUNREACH;
2673 goto out;
2674 }
2675 free_res = 1;
2676
2677 if (res.type == RTN_LOCAL) {
2678 if (!fl.fl4_src)
2679 fl.fl4_src = fl.fl4_dst;
2680 if (dev_out)
2681 dev_put(dev_out);
b40afd0e 2682 dev_out = net->loopback_dev;
1da177e4
LT
2683 dev_hold(dev_out);
2684 fl.oif = dev_out->ifindex;
2685 if (res.fi)
2686 fib_info_put(res.fi);
2687 res.fi = NULL;
2688 flags |= RTCF_LOCAL;
2689 goto make_route;
2690 }
2691
2692#ifdef CONFIG_IP_ROUTE_MULTIPATH
2693 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2694 fib_select_multipath(&fl, &res);
2695 else
2696#endif
2697 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
b40afd0e 2698 fib_select_default(net, &fl, &res);
1da177e4
LT
2699
2700 if (!fl.fl4_src)
2701 fl.fl4_src = FIB_RES_PREFSRC(res);
2702
2703 if (dev_out)
2704 dev_put(dev_out);
2705 dev_out = FIB_RES_DEV(res);
2706 dev_hold(dev_out);
2707 fl.oif = dev_out->ifindex;
2708
2709
2710make_route:
2711 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2712
2713
2714 if (free_res)
2715 fib_res_put(&res);
2716 if (dev_out)
2717 dev_put(dev_out);
2718out: return err;
2719}
2720
611c183e
DL
2721int __ip_route_output_key(struct net *net, struct rtable **rp,
2722 const struct flowi *flp)
1da177e4
LT
2723{
2724 unsigned hash;
2725 struct rtable *rth;
2726
1080d709
NH
2727 if (!rt_caching(net))
2728 goto slow_output;
2729
e84f84f2 2730 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
1da177e4
LT
2731
2732 rcu_read_lock_bh();
a898def2
PM
2733 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2734 rth = rcu_dereference_bh(rth->u.dst.rt_next)) {
1da177e4
LT
2735 if (rth->fl.fl4_dst == flp->fl4_dst &&
2736 rth->fl.fl4_src == flp->fl4_src &&
2737 rth->fl.iif == 0 &&
2738 rth->fl.oif == flp->oif &&
47dcf0cb 2739 rth->fl.mark == flp->mark &&
1da177e4 2740 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
b5921910 2741 (IPTOS_RT_MASK | RTO_ONLINK)) &&
878628fb 2742 net_eq(dev_net(rth->u.dst.dev), net) &&
e84f84f2 2743 !rt_is_expired(rth)) {
03f49f34 2744 dst_use(&rth->u.dst, jiffies);
1da177e4
LT
2745 RT_CACHE_STAT_INC(out_hit);
2746 rcu_read_unlock_bh();
2747 *rp = rth;
2748 return 0;
2749 }
2750 RT_CACHE_STAT_INC(out_hlist_search);
2751 }
2752 rcu_read_unlock_bh();
2753
1080d709 2754slow_output:
611c183e 2755 return ip_route_output_slow(net, rp, flp);
1da177e4
LT
2756}
2757
d8c97a94
ACM
2758EXPORT_SYMBOL_GPL(__ip_route_output_key);
2759
14e50e57
DM
2760static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2761{
2762}
2763
2764static struct dst_ops ipv4_dst_blackhole_ops = {
2765 .family = AF_INET,
09640e63 2766 .protocol = cpu_to_be16(ETH_P_IP),
14e50e57
DM
2767 .destroy = ipv4_dst_destroy,
2768 .check = ipv4_dst_check,
2769 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
e2422970 2770 .entries = ATOMIC_INIT(0),
14e50e57
DM
2771};
2772
2773
e84f84f2 2774static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
14e50e57
DM
2775{
2776 struct rtable *ort = *rp;
2777 struct rtable *rt = (struct rtable *)
2778 dst_alloc(&ipv4_dst_blackhole_ops);
2779
2780 if (rt) {
2781 struct dst_entry *new = &rt->u.dst;
2782
2783 atomic_set(&new->__refcnt, 1);
2784 new->__use = 1;
352e512c
HX
2785 new->input = dst_discard;
2786 new->output = dst_discard;
14e50e57
DM
2787 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2788
2789 new->dev = ort->u.dst.dev;
2790 if (new->dev)
2791 dev_hold(new->dev);
2792
2793 rt->fl = ort->fl;
2794
2795 rt->idev = ort->idev;
2796 if (rt->idev)
2797 in_dev_hold(rt->idev);
e84f84f2 2798 rt->rt_genid = rt_genid(net);
14e50e57
DM
2799 rt->rt_flags = ort->rt_flags;
2800 rt->rt_type = ort->rt_type;
2801 rt->rt_dst = ort->rt_dst;
2802 rt->rt_src = ort->rt_src;
2803 rt->rt_iif = ort->rt_iif;
2804 rt->rt_gateway = ort->rt_gateway;
2805 rt->rt_spec_dst = ort->rt_spec_dst;
2806 rt->peer = ort->peer;
2807 if (rt->peer)
2808 atomic_inc(&rt->peer->refcnt);
2809
2810 dst_free(new);
2811 }
2812
2813 dst_release(&(*rp)->u.dst);
2814 *rp = rt;
2815 return (rt ? 0 : -ENOMEM);
2816}
2817
f1b050bf
DL
2818int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2819 struct sock *sk, int flags)
1da177e4
LT
2820{
2821 int err;
2822
f1b050bf 2823 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
1da177e4
LT
2824 return err;
2825
2826 if (flp->proto) {
2827 if (!flp->fl4_src)
2828 flp->fl4_src = (*rp)->rt_src;
2829 if (!flp->fl4_dst)
2830 flp->fl4_dst = (*rp)->rt_dst;
52479b62 2831 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
bb72845e 2832 flags ? XFRM_LOOKUP_WAIT : 0);
14e50e57 2833 if (err == -EREMOTE)
e84f84f2 2834 err = ipv4_dst_blackhole(net, rp, flp);
14e50e57
DM
2835
2836 return err;
1da177e4
LT
2837 }
2838
2839 return 0;
2840}
2841
d8c97a94
ACM
2842EXPORT_SYMBOL_GPL(ip_route_output_flow);
2843
f206351a 2844int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
1da177e4 2845{
f206351a 2846 return ip_route_output_flow(net, rp, flp, NULL, 0);
1da177e4
LT
2847}
2848
4feb88e5
BT
2849static int rt_fill_info(struct net *net,
2850 struct sk_buff *skb, u32 pid, u32 seq, int event,
b6544c0b 2851 int nowait, unsigned int flags)
1da177e4 2852{
511c3f92 2853 struct rtable *rt = skb_rtable(skb);
1da177e4 2854 struct rtmsg *r;
be403ea1 2855 struct nlmsghdr *nlh;
e3703b3d
TG
2856 long expires;
2857 u32 id = 0, ts = 0, tsage = 0, error;
be403ea1
TG
2858
2859 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2860 if (nlh == NULL)
26932566 2861 return -EMSGSIZE;
be403ea1
TG
2862
2863 r = nlmsg_data(nlh);
1da177e4
LT
2864 r->rtm_family = AF_INET;
2865 r->rtm_dst_len = 32;
2866 r->rtm_src_len = 0;
2867 r->rtm_tos = rt->fl.fl4_tos;
2868 r->rtm_table = RT_TABLE_MAIN;
be403ea1 2869 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
1da177e4
LT
2870 r->rtm_type = rt->rt_type;
2871 r->rtm_scope = RT_SCOPE_UNIVERSE;
2872 r->rtm_protocol = RTPROT_UNSPEC;
2873 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2874 if (rt->rt_flags & RTCF_NOTIFY)
2875 r->rtm_flags |= RTM_F_NOTIFY;
be403ea1 2876
17fb2c64 2877 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
be403ea1 2878
1da177e4
LT
2879 if (rt->fl.fl4_src) {
2880 r->rtm_src_len = 32;
17fb2c64 2881 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
1da177e4
LT
2882 }
2883 if (rt->u.dst.dev)
be403ea1 2884 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
1da177e4
LT
2885#ifdef CONFIG_NET_CLS_ROUTE
2886 if (rt->u.dst.tclassid)
be403ea1 2887 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
1da177e4
LT
2888#endif
2889 if (rt->fl.iif)
17fb2c64 2890 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
1da177e4 2891 else if (rt->rt_src != rt->fl.fl4_src)
17fb2c64 2892 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
be403ea1 2893
1da177e4 2894 if (rt->rt_dst != rt->rt_gateway)
17fb2c64 2895 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
be403ea1 2896
1da177e4 2897 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
be403ea1
TG
2898 goto nla_put_failure;
2899
e3703b3d
TG
2900 error = rt->u.dst.error;
2901 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
1da177e4 2902 if (rt->peer) {
2c1409a0 2903 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
1da177e4 2904 if (rt->peer->tcp_ts_stamp) {
e3703b3d 2905 ts = rt->peer->tcp_ts;
9d729f72 2906 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
1da177e4
LT
2907 }
2908 }
be403ea1 2909
1da177e4
LT
2910 if (rt->fl.iif) {
2911#ifdef CONFIG_IP_MROUTE
e448515c 2912 __be32 dst = rt->rt_dst;
1da177e4 2913
f97c1e0c 2914 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
4feb88e5
BT
2915 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2916 int err = ipmr_get_route(net, skb, r, nowait);
1da177e4
LT
2917 if (err <= 0) {
2918 if (!nowait) {
2919 if (err == 0)
2920 return 0;
be403ea1 2921 goto nla_put_failure;
1da177e4
LT
2922 } else {
2923 if (err == -EMSGSIZE)
be403ea1 2924 goto nla_put_failure;
e3703b3d 2925 error = err;
1da177e4
LT
2926 }
2927 }
2928 } else
2929#endif
be403ea1 2930 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
1da177e4
LT
2931 }
2932
e3703b3d
TG
2933 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2934 expires, error) < 0)
2935 goto nla_put_failure;
be403ea1
TG
2936
2937 return nlmsg_end(skb, nlh);
1da177e4 2938
be403ea1 2939nla_put_failure:
26932566
PM
2940 nlmsg_cancel(skb, nlh);
2941 return -EMSGSIZE;
1da177e4
LT
2942}
2943
63f3444f 2944static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1da177e4 2945{
3b1e0a65 2946 struct net *net = sock_net(in_skb->sk);
d889ce3b
TG
2947 struct rtmsg *rtm;
2948 struct nlattr *tb[RTA_MAX+1];
1da177e4 2949 struct rtable *rt = NULL;
9e12bb22
AV
2950 __be32 dst = 0;
2951 __be32 src = 0;
2952 u32 iif;
d889ce3b 2953 int err;
1da177e4
LT
2954 struct sk_buff *skb;
2955
d889ce3b
TG
2956 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2957 if (err < 0)
2958 goto errout;
2959
2960 rtm = nlmsg_data(nlh);
2961
1da177e4 2962 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
d889ce3b
TG
2963 if (skb == NULL) {
2964 err = -ENOBUFS;
2965 goto errout;
2966 }
1da177e4
LT
2967
2968 /* Reserve room for dummy headers, this skb can pass
2969 through good chunk of routing engine.
2970 */
459a98ed 2971 skb_reset_mac_header(skb);
c1d2bbe1 2972 skb_reset_network_header(skb);
d2c962b8
SH
2973
2974 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
eddc9ec5 2975 ip_hdr(skb)->protocol = IPPROTO_ICMP;
1da177e4
LT
2976 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2977
17fb2c64
AV
2978 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2979 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
d889ce3b 2980 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
1da177e4
LT
2981
2982 if (iif) {
d889ce3b
TG
2983 struct net_device *dev;
2984
1937504d 2985 dev = __dev_get_by_index(net, iif);
d889ce3b
TG
2986 if (dev == NULL) {
2987 err = -ENODEV;
2988 goto errout_free;
2989 }
2990
1da177e4
LT
2991 skb->protocol = htons(ETH_P_IP);
2992 skb->dev = dev;
2993 local_bh_disable();
2994 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2995 local_bh_enable();
d889ce3b 2996
511c3f92 2997 rt = skb_rtable(skb);
d889ce3b 2998 if (err == 0 && rt->u.dst.error)
1da177e4
LT
2999 err = -rt->u.dst.error;
3000 } else {
d889ce3b
TG
3001 struct flowi fl = {
3002 .nl_u = {
3003 .ip4_u = {
3004 .daddr = dst,
3005 .saddr = src,
3006 .tos = rtm->rtm_tos,
3007 },
3008 },
3009 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3010 };
1937504d 3011 err = ip_route_output_key(net, &rt, &fl);
1da177e4 3012 }
d889ce3b 3013
1da177e4 3014 if (err)
d889ce3b 3015 goto errout_free;
1da177e4 3016
adf30907 3017 skb_dst_set(skb, &rt->u.dst);
1da177e4
LT
3018 if (rtm->rtm_flags & RTM_F_NOTIFY)
3019 rt->rt_flags |= RTCF_NOTIFY;
3020
4feb88e5 3021 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
1937504d 3022 RTM_NEWROUTE, 0, 0);
d889ce3b
TG
3023 if (err <= 0)
3024 goto errout_free;
1da177e4 3025
1937504d 3026 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
d889ce3b 3027errout:
2942e900 3028 return err;
1da177e4 3029
d889ce3b 3030errout_free:
1da177e4 3031 kfree_skb(skb);
d889ce3b 3032 goto errout;
1da177e4
LT
3033}
3034
3035int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3036{
3037 struct rtable *rt;
3038 int h, s_h;
3039 int idx, s_idx;
1937504d
DL
3040 struct net *net;
3041
3b1e0a65 3042 net = sock_net(skb->sk);
1da177e4
LT
3043
3044 s_h = cb->args[0];
d8c92830
ED
3045 if (s_h < 0)
3046 s_h = 0;
1da177e4 3047 s_idx = idx = cb->args[1];
a6272665
ED
3048 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3049 if (!rt_hash_table[h].chain)
3050 continue;
1da177e4 3051 rcu_read_lock_bh();
a898def2
PM
3052 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3053 rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) {
878628fb 3054 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
1da177e4 3055 continue;
e84f84f2 3056 if (rt_is_expired(rt))
29e75252 3057 continue;
adf30907 3058 skb_dst_set(skb, dst_clone(&rt->u.dst));
4feb88e5 3059 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
e905a9ed 3060 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
b6544c0b 3061 1, NLM_F_MULTI) <= 0) {
adf30907 3062 skb_dst_drop(skb);
1da177e4
LT
3063 rcu_read_unlock_bh();
3064 goto done;
3065 }
adf30907 3066 skb_dst_drop(skb);
1da177e4
LT
3067 }
3068 rcu_read_unlock_bh();
3069 }
3070
3071done:
3072 cb->args[0] = h;
3073 cb->args[1] = idx;
3074 return skb->len;
3075}
3076
3077void ip_rt_multicast_event(struct in_device *in_dev)
3078{
76e6ebfb 3079 rt_cache_flush(dev_net(in_dev->dev), 0);
1da177e4
LT
3080}
3081
3082#ifdef CONFIG_SYSCTL
81c684d1 3083static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
8d65af78 3084 void __user *buffer,
1da177e4
LT
3085 size_t *lenp, loff_t *ppos)
3086{
3087 if (write) {
639e104f 3088 int flush_delay;
81c684d1 3089 ctl_table ctl;
39a23e75 3090 struct net *net;
639e104f 3091
81c684d1
DL
3092 memcpy(&ctl, __ctl, sizeof(ctl));
3093 ctl.data = &flush_delay;
8d65af78 3094 proc_dointvec(&ctl, write, buffer, lenp, ppos);
639e104f 3095
81c684d1 3096 net = (struct net *)__ctl->extra1;
39a23e75 3097 rt_cache_flush(net, flush_delay);
1da177e4 3098 return 0;
e905a9ed 3099 }
1da177e4
LT
3100
3101 return -EINVAL;
3102}
3103
c6153b5b
HX
3104static void rt_secret_reschedule(int old)
3105{
3106 struct net *net;
3107 int new = ip_rt_secret_interval;
3108 int diff = new - old;
3109
3110 if (!diff)
3111 return;
3112
3113 rtnl_lock();
3114 for_each_net(net) {
3115 int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
858a18a6 3116 long time;
c6153b5b
HX
3117
3118 if (!new)
3119 continue;
3120
3121 if (deleted) {
858a18a6 3122 time = net->ipv4.rt_secret_timer.expires - jiffies;
c6153b5b
HX
3123
3124 if (time <= 0 || (time += diff) <= 0)
3125 time = 0;
c6153b5b 3126 } else
858a18a6 3127 time = new;
c6153b5b 3128
858a18a6 3129 mod_timer(&net->ipv4.rt_secret_timer, jiffies + time);
c6153b5b
HX
3130 }
3131 rtnl_unlock();
3132}
3133
3134static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
c6153b5b
HX
3135 void __user *buffer, size_t *lenp,
3136 loff_t *ppos)
3137{
3138 int old = ip_rt_secret_interval;
8d65af78 3139 int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
c6153b5b
HX
3140
3141 rt_secret_reschedule(old);
3142
3143 return ret;
3144}
3145
eeb61f71 3146static ctl_table ipv4_route_table[] = {
1da177e4 3147 {
1da177e4
LT
3148 .procname = "gc_thresh",
3149 .data = &ipv4_dst_ops.gc_thresh,
3150 .maxlen = sizeof(int),
3151 .mode = 0644,
6d9f239a 3152 .proc_handler = proc_dointvec,
1da177e4
LT
3153 },
3154 {
1da177e4
LT
3155 .procname = "max_size",
3156 .data = &ip_rt_max_size,
3157 .maxlen = sizeof(int),
3158 .mode = 0644,
6d9f239a 3159 .proc_handler = proc_dointvec,
1da177e4
LT
3160 },
3161 {
3162 /* Deprecated. Use gc_min_interval_ms */
e905a9ed 3163
1da177e4
LT
3164 .procname = "gc_min_interval",
3165 .data = &ip_rt_gc_min_interval,
3166 .maxlen = sizeof(int),
3167 .mode = 0644,
6d9f239a 3168 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
3169 },
3170 {
1da177e4
LT
3171 .procname = "gc_min_interval_ms",
3172 .data = &ip_rt_gc_min_interval,
3173 .maxlen = sizeof(int),
3174 .mode = 0644,
6d9f239a 3175 .proc_handler = proc_dointvec_ms_jiffies,
1da177e4
LT
3176 },
3177 {
1da177e4
LT
3178 .procname = "gc_timeout",
3179 .data = &ip_rt_gc_timeout,
3180 .maxlen = sizeof(int),
3181 .mode = 0644,
6d9f239a 3182 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
3183 },
3184 {
1da177e4
LT
3185 .procname = "gc_interval",
3186 .data = &ip_rt_gc_interval,
3187 .maxlen = sizeof(int),
3188 .mode = 0644,
6d9f239a 3189 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
3190 },
3191 {
1da177e4
LT
3192 .procname = "redirect_load",
3193 .data = &ip_rt_redirect_load,
3194 .maxlen = sizeof(int),
3195 .mode = 0644,
6d9f239a 3196 .proc_handler = proc_dointvec,
1da177e4
LT
3197 },
3198 {
1da177e4
LT
3199 .procname = "redirect_number",
3200 .data = &ip_rt_redirect_number,
3201 .maxlen = sizeof(int),
3202 .mode = 0644,
6d9f239a 3203 .proc_handler = proc_dointvec,
1da177e4
LT
3204 },
3205 {
1da177e4
LT
3206 .procname = "redirect_silence",
3207 .data = &ip_rt_redirect_silence,
3208 .maxlen = sizeof(int),
3209 .mode = 0644,
6d9f239a 3210 .proc_handler = proc_dointvec,
1da177e4
LT
3211 },
3212 {
1da177e4
LT
3213 .procname = "error_cost",
3214 .data = &ip_rt_error_cost,
3215 .maxlen = sizeof(int),
3216 .mode = 0644,
6d9f239a 3217 .proc_handler = proc_dointvec,
1da177e4
LT
3218 },
3219 {
1da177e4
LT
3220 .procname = "error_burst",
3221 .data = &ip_rt_error_burst,
3222 .maxlen = sizeof(int),
3223 .mode = 0644,
6d9f239a 3224 .proc_handler = proc_dointvec,
1da177e4
LT
3225 },
3226 {
1da177e4
LT
3227 .procname = "gc_elasticity",
3228 .data = &ip_rt_gc_elasticity,
3229 .maxlen = sizeof(int),
3230 .mode = 0644,
6d9f239a 3231 .proc_handler = proc_dointvec,
1da177e4
LT
3232 },
3233 {
1da177e4
LT
3234 .procname = "mtu_expires",
3235 .data = &ip_rt_mtu_expires,
3236 .maxlen = sizeof(int),
3237 .mode = 0644,
6d9f239a 3238 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
3239 },
3240 {
1da177e4
LT
3241 .procname = "min_pmtu",
3242 .data = &ip_rt_min_pmtu,
3243 .maxlen = sizeof(int),
3244 .mode = 0644,
6d9f239a 3245 .proc_handler = proc_dointvec,
1da177e4
LT
3246 },
3247 {
1da177e4
LT
3248 .procname = "min_adv_mss",
3249 .data = &ip_rt_min_advmss,
3250 .maxlen = sizeof(int),
3251 .mode = 0644,
6d9f239a 3252 .proc_handler = proc_dointvec,
1da177e4
LT
3253 },
3254 {
1da177e4
LT
3255 .procname = "secret_interval",
3256 .data = &ip_rt_secret_interval,
3257 .maxlen = sizeof(int),
3258 .mode = 0644,
6d9f239a 3259 .proc_handler = ipv4_sysctl_rt_secret_interval,
1da177e4 3260 },
f8572d8f 3261 { }
1da177e4 3262};
39a23e75 3263
2f4520d3
AV
3264static struct ctl_table empty[1];
3265
3266static struct ctl_table ipv4_skeleton[] =
3267{
f8572d8f 3268 { .procname = "route",
d994af0d 3269 .mode = 0555, .child = ipv4_route_table},
f8572d8f 3270 { .procname = "neigh",
d994af0d 3271 .mode = 0555, .child = empty},
2f4520d3
AV
3272 { }
3273};
3274
3275static __net_initdata struct ctl_path ipv4_path[] = {
f8572d8f
EB
3276 { .procname = "net", },
3277 { .procname = "ipv4", },
39a23e75
DL
3278 { },
3279};
3280
39a23e75
DL
3281static struct ctl_table ipv4_route_flush_table[] = {
3282 {
39a23e75
DL
3283 .procname = "flush",
3284 .maxlen = sizeof(int),
3285 .mode = 0200,
6d9f239a 3286 .proc_handler = ipv4_sysctl_rtcache_flush,
39a23e75 3287 },
f8572d8f 3288 { },
39a23e75
DL
3289};
3290
2f4520d3 3291static __net_initdata struct ctl_path ipv4_route_path[] = {
f8572d8f
EB
3292 { .procname = "net", },
3293 { .procname = "ipv4", },
3294 { .procname = "route", },
2f4520d3
AV
3295 { },
3296};
3297
39a23e75
DL
3298static __net_init int sysctl_route_net_init(struct net *net)
3299{
3300 struct ctl_table *tbl;
3301
3302 tbl = ipv4_route_flush_table;
09ad9bc7 3303 if (!net_eq(net, &init_net)) {
39a23e75
DL
3304 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3305 if (tbl == NULL)
3306 goto err_dup;
3307 }
3308 tbl[0].extra1 = net;
3309
3310 net->ipv4.route_hdr =
3311 register_net_sysctl_table(net, ipv4_route_path, tbl);
3312 if (net->ipv4.route_hdr == NULL)
3313 goto err_reg;
3314 return 0;
3315
3316err_reg:
3317 if (tbl != ipv4_route_flush_table)
3318 kfree(tbl);
3319err_dup:
3320 return -ENOMEM;
3321}
3322
3323static __net_exit void sysctl_route_net_exit(struct net *net)
3324{
3325 struct ctl_table *tbl;
3326
3327 tbl = net->ipv4.route_hdr->ctl_table_arg;
3328 unregister_net_sysctl_table(net->ipv4.route_hdr);
3329 BUG_ON(tbl == ipv4_route_flush_table);
3330 kfree(tbl);
3331}
3332
3333static __net_initdata struct pernet_operations sysctl_route_ops = {
3334 .init = sysctl_route_net_init,
3335 .exit = sysctl_route_net_exit,
3336};
1da177e4
LT
3337#endif
3338
9f5e97e5
DL
3339
3340static __net_init int rt_secret_timer_init(struct net *net)
3341{
e84f84f2
DL
3342 atomic_set(&net->ipv4.rt_genid,
3343 (int) ((num_physpages ^ (num_physpages>>8)) ^
3344 (jiffies ^ (jiffies >> 7))));
3345
9f5e97e5
DL
3346 net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3347 net->ipv4.rt_secret_timer.data = (unsigned long)net;
3348 init_timer_deferrable(&net->ipv4.rt_secret_timer);
3349
c6153b5b
HX
3350 if (ip_rt_secret_interval) {
3351 net->ipv4.rt_secret_timer.expires =
3352 jiffies + net_random() % ip_rt_secret_interval +
3353 ip_rt_secret_interval;
3354 add_timer(&net->ipv4.rt_secret_timer);
3355 }
9f5e97e5
DL
3356 return 0;
3357}
3358
3359static __net_exit void rt_secret_timer_exit(struct net *net)
3360{
3361 del_timer_sync(&net->ipv4.rt_secret_timer);
3362}
3363
3364static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3365 .init = rt_secret_timer_init,
3366 .exit = rt_secret_timer_exit,
3367};
3368
3369
#ifdef CONFIG_NET_CLS_ROUTE
/* Per-cpu accounting area; allocated in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
3373
3374static __initdata unsigned long rhash_entries;
3375static int __init set_rhash_entries(char *str)
3376{
3377 if (!str)
3378 return 0;
3379 rhash_entries = simple_strtoul(str, &str, 0);
3380 return 1;
3381}
3382__setup("rhash_entries=", set_rhash_entries);
3383
3384int __init ip_rt_init(void)
3385{
424c4b70 3386 int rc = 0;
1da177e4 3387
1da177e4 3388#ifdef CONFIG_NET_CLS_ROUTE
0dcec8c2 3389 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
1da177e4
LT
3390 if (!ip_rt_acct)
3391 panic("IP: failed to allocate ip_rt_acct\n");
1da177e4
LT
3392#endif
3393
e5d679f3
AD
3394 ipv4_dst_ops.kmem_cachep =
3395 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 3396 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 3397
14e50e57
DM
3398 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3399
424c4b70
ED
3400 rt_hash_table = (struct rt_hash_bucket *)
3401 alloc_large_system_hash("IP route cache",
3402 sizeof(struct rt_hash_bucket),
3403 rhash_entries,
4481374c 3404 (totalram_pages >= 128 * 1024) ?
18955cfc 3405 15 : 17,
8d1502de 3406 0,
424c4b70
ED
3407 &rt_hash_log,
3408 &rt_hash_mask,
c9503e0f 3409 rhash_entries ? 0 : 512 * 1024);
22c047cc
ED
3410 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3411 rt_hash_lock_init();
1da177e4
LT
3412
3413 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3414 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3415
1da177e4
LT
3416 devinet_init();
3417 ip_fib_init();
3418
1da177e4
LT
3419 /* All the timers, started at system startup tend
3420 to synchronize. Perturb it a bit.
3421 */
125bb8f5
ED
3422 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3423 expires_ljiffies = jiffies;
39c90ece
ED
3424 schedule_delayed_work(&expires_work,
3425 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
1da177e4 3426
9f5e97e5
DL
3427 if (register_pernet_subsys(&rt_secret_timer_ops))
3428 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
1da177e4 3429
73b38711 3430 if (ip_rt_proc_init())
107f1634 3431 printk(KERN_ERR "Unable to create route proc files\n");
1da177e4
LT
3432#ifdef CONFIG_XFRM
3433 xfrm_init();
a33bc5c1 3434 xfrm4_init(ip_rt_max_size);
1da177e4 3435#endif
63f3444f
TG
3436 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3437
39a23e75
DL
3438#ifdef CONFIG_SYSCTL
3439 register_pernet_subsys(&sysctl_route_ops);
3440#endif
1da177e4
LT
3441 return rc;
3442}
3443
#ifdef CONFIG_SYSCTL
/*
 * Register the static sysctl skeleton (ipv4_skeleton, i.e. the "route"
 * and "neigh" directories) under the net/ipv4 path.
 *
 * This exists as a separate init hook only as a workaround for the
 * current ipv4 initialisation order; once that order is cleaned up
 * this function is meant to go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif
eeb61f71 3454
1da177e4
LT
3455EXPORT_SYMBOL(__ip_select_ident);
3456EXPORT_SYMBOL(ip_route_input);
3457EXPORT_SYMBOL(ip_route_output_key);