/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
	((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT	(300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
static int rt_chain_length_max __read_mostly	= 20;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);
static void rt_emergency_hash_rebuild(struct net *net);


static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.entries =		ATOMIC_INIT(0),
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
	struct rtable	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table
 * of spinlocks.  The size of this table is a power of two and depends on
 * the number of CPUs.  (On lockdep we have a quite big spinlock_t, so keep
 * the size down there.)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)

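/*
 * Note: the hash key mixes in the per-namespace generation id
 * (rt_genid), so bumping the genid effectively invalidates all cached
 * entries; stale ones are reaped lazily via rt_is_expired() checks.
 */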
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)(__be32)(daddr),
			    (__force u32)(__be32)(saddr),
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rt_hash_table[st->bucket].chain)
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference(r->u.dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = r->u.dst.rt_next;
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rt_hash_table[st->bucket].chain);
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return rcu_dereference(r);
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_NET_CLS_ROUTE
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	unsigned int i;

	if ((offset & 3) || (length & 3))
		return -EIO;

	if (offset >= sizeof(struct ip_rt_acct) * 256) {
		*eof = 1;
		return 0;
	}

	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
		length = sizeof(struct ip_rt_acct) * 256 - offset;
		*eof = 1;
	}

	offset /= sizeof(u32);

	if (length > 0) {
		u32 *dst = (u32 *) buffer;

		*start = buffer;
		memset(dst, 0, length);

		for_each_possible_cpu(i) {
			unsigned int j;
			u32 *src;

			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
			for (j = 0; j < length/4; j++)
				dst[j] += src[j];
		}
	}
	return length;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_NET_CLS_ROUTE
	pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
			ip_rt_acct_read, NULL);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_NET_CLS_ROUTE
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
	remove_proc_entry("rt_acct", net->proc_net);
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

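/*
 * Decide whether an entry may be evicted: entries still referenced
 * never expire, entries whose hard expiry (u.dst.expires) has passed
 * always do, and otherwise an entry survives while younger than tmo1
 * (tmo2 for "valuable" entries); rt_fast_clean() candidates forfeit
 * the tmo1 grace period.
 */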
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}

static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct flowi *fl1,
					const struct flowi *fl2)
{
	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
		(fl1->iif ^ fl2->iif)) == 0);
}

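/*
 * compare_hash_inputs() above and compare_keys() below use the same
 * branch-free idiom: XOR each pair of fields and OR the results
 * together; the whole expression is zero only when every field
 * matches.
 */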
static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
}

/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to be rescheduled if necessary.
 */
static void rt_do_flush(int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;
	struct rtable * tail;

	for (i = 0; i <= rt_hash_mask; i++) {
		if (process_context && need_resched())
			cond_resched();
		rth = rt_hash_table[i].chain;
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));
#ifdef CONFIG_NET_NS
		{
		struct rtable ** prev, * p;

		rth = rt_hash_table[i].chain;

		/* defer releasing the head of the list after spin_unlock */
		for (tail = rth; tail; tail = tail->u.dst.rt_next)
			if (!rt_is_expired(tail))
				break;
		if (rth != tail)
			rt_hash_table[i].chain = tail;

		/* call rt_free on entries after the tail requiring flush */
		prev = &rt_hash_table[i].chain;
		for (p = *prev; p; p = next) {
			next = p->u.dst.rt_next;
			if (!rt_is_expired(p)) {
				prev = &p->u.dst.rt_next;
			} else {
				*prev = next;
				rt_free(p);
			}
		}
		}
#else
		rth = rt_hash_table[i].chain;
		rt_hash_table[i].chain = NULL;
		tail = NULL;
#endif
		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; rth != tail; rth = next) {
			next = rth->u.dst.rt_next;
			rt_free(rth);
		}
	}
}

/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to get an estimate for rt_chain_length_max:
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

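/*
 * Example: with FRACT_BITS = 3 a chain contributes ONE == 8 per
 * counted entry, so the running sums below carry eighths of an entry
 * until the final ">> FRACT_BITS" in rt_check_expire().
 */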
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, *aux, **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (*rthp == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			prefetch(rth->u.dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->u.dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					for (aux = rt_hash_table[i].chain;;) {
						if (aux == rth) {
							length += ONE;
							break;
						}
						if (compare_hash_inputs(&aux->fl, &rth->fl))
							break;
						aux = aux->u.dst.rt_next;
					}
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}

/*
 * rt_worker_func() is run in process context.
 * We call rt_check_expire() to scan part of the hash table.
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}

/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(!in_softirq());
}

/*
 * We change rt_genid and let gc do the cleanup
 */
static void rt_secret_rebuild(unsigned long __net)
{
	struct net *net = (struct net *)__net;
	rt_cache_invalidate(net);
	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
}

static void rt_secret_rebuild_oneshot(struct net *net)
{
	del_timer_sync(&net->ipv4.rt_secret_timer);
	rt_cache_invalidate(net);
	if (ip_rt_secret_interval) {
		net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
		add_timer(&net->ipv4.rt_secret_timer);
	}
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit()) {
		printk(KERN_WARNING "Route hash chain too long!\n");
		printk(KERN_WARNING "Adjust your secret_interval!\n");
	}

	rt_secret_rebuild_oneshot(net);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to newly generated ones.

   Current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expire is large enough to keep enough warm entries,
   and when load increases it reduces to limit cache size.
 */

static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not run it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate number of entries, which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce the cache
		 * really aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire has been reduced to zero; otherwise it is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}

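/*
 * Insert rt into the chain for "hash", or reuse an existing entry with
 * identical keys if one is found.  The resulting route is handed back
 * either through *rp or attached to the skb's dst.
 */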
static int rt_intern_hash(unsigned hash, struct rtable *rt,
			  struct rtable **rp, struct sk_buff *skb)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->u.dst.dev))) {
		rt_drop(rt);
		return 0;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->u.dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->u.dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (rp)
				*rp = rth;
			else
				skb_dst_set(skb, &rth->u.dst);
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max) {
			struct net *net = dev_net(rt->u.dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(dev_net(rt->u.dst.dev))) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->u.dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.dst.rt_next = rt_hash_table[hash].chain;

#if RT_CACHE_DEBUG >= 2
	if (rt->u.dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %pI4", hash, &rt->rt_dst);
		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
			printk(" . %pI4", &trt->rt_dst);
		printk("\n");
	}
#endif
	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));
	if (rp)
		*rp = rt;
	else
		skb_dst_set(skb, &rt->u.dst);
	return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
	static DEFINE_SPINLOCK(rt_peer_lock);
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If a peer is attached to the destination, it is never
		   detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp, *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = *rthp) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->u.dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->u.dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

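/*
 * Process an ICMP redirect from old_gw to new_gw for daddr arriving on
 * dev: after the new gateway passes sanity checks, each matching cache
 * entry is cloned with rt_gateway updated, its neighbour is validated,
 * and the stale entry is removed via rt_del().
 */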
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	__be32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
	    || ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!rt_caching(net))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rthp=&rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    rt_is_expired(rth) ||
				    !net_eq(dev_net(rth->u.dst.dev), net)) {
					rthp = &rth->u.dst.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= 0;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
#ifdef CONFIG_XFRM
				rt->u.dst.xfrm		= NULL;
#endif
				rt->rt_genid		= rt_genid(net);
				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway = new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->u.dst;
				netevent.new = &rt->u.dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt, NULL))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif,
						rt_genid(dev_net(dst->dev)));
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
				&rt->rt_dst, rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

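/*
 * With the default settings this means, roughly, that the n-th
 * consecutive redirect is delayed by ip_rt_redirect_load << n jiffies
 * (HZ/50, doubling each time), and sending stops entirely once
 * ip_rt_redirect_number (9) redirects have been ignored.
 */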
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

	if (!in_dev)
		return;

	if (!IN_DEV_TX_REDIRECTS(in_dev))
		goto out;

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything;
	 * set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		goto out;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (rt->u.dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
				&rt->rt_src, rt->rt_iif,
				&rt->rt_dst, &rt->rt_gateway);
#endif
	}
out:
	in_dev_put(in_dev);
}

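/*
 * Translate a dst error into an ICMP unreachable code and send it,
 * rate limited by a token bucket: tokens accrue with elapsed jiffies
 * up to ip_rt_error_burst, and each ICMP sent costs ip_rt_error_cost.
 */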
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
		case EINVAL:
		default:
			goto out;
		case EHOSTUNREACH:
			code = ICMP_HOST_UNREACH;
			break;
		case ENETUNREACH:
			code = ICMP_NET_UNREACH;
			IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
					IPSTATS_MIB_INNOROUTES);
			break;
		case EACCES:
			code = ICMP_PKT_FILTERED;
			break;
	}

	now = jiffies;
	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}

/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}

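/*
 * Example: guess_mtu(1500) returns 1492, the first plateau strictly
 * below the old MTU; 68 is the minimum IPv4 MTU fallback.  It is used
 * by ip_rt_frag_needed() below when the ICMP carries no usable MTU.
 */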
unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	int i, k;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	int  ikeys[2] = { dev->ifindex, 0 };
	__be32  skeys[2] = { iph->saddr, 0, };
	__be32  daddr = iph->daddr;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (k = 0; k < 2; k++) {
		for (i = 0; i < 2; i++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rcu_read_lock();
			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
			     rth = rcu_dereference(rth->u.dst.rt_next)) {
				unsigned short mtu = new_mtu;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->rt_dst != daddr ||
				    rth->rt_src != iph->saddr ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
				    !net_eq(dev_net(rth->u.dst.dev), net) ||
				    rt_is_expired(rth))
					continue;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= dst_mtu(&rth->u.dst) &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= dst_mtu(&rth->u.dst)) {
					if (mtu < dst_mtu(&rth->u.dst)) {
						dst_confirm(&rth->u.dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
			rcu_read_unlock();
		}
	}
	return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst_mtu(dst) > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
		}
		dst->metrics[RTAX_MTU-1] = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
	}
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;
	struct in_device *idev = rt->idev;

	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}

	if (idev) {
		rt->idev = NULL;
		in_dev_put(idev);
	}
}

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	struct rtable *rt = (struct rtable *) dst;
	struct in_device *idev = rt->idev;
	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
		struct in_device *loopback_idev =
			in_dev_get(dev_net(dev)->loopback_dev);
		if (loopback_idev) {
			rt->idev = loopback_idev;
			in_dev_put(idev);
		}
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	__be32 src;
	struct fib_result res;

	if (rt->fl.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
		src = FIB_RES_PREFSRC(res);
		fib_res_put(&res);
	} else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif

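/*
 * Fill in a new cache entry's metrics from the FIB result.  The
 * advertised MSS defaults to the device MTU minus 40 bytes (20-byte
 * IPv4 header plus 20-byte TCP header) and is clamped between
 * ip_rt_min_advmss and 65535 - 40.
 */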
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;

	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
				       ip_rt_min_advmss);
	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}

9e12bb22 1802static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1803 u8 tos, struct net_device *dev, int our)
1804{
1805 unsigned hash;
1806 struct rtable *rth;
a61ced5d 1807 __be32 spec_dst;
1da177e4
LT
1808 struct in_device *in_dev = in_dev_get(dev);
1809 u32 itag = 0;
1810
1811 /* Primary sanity checks. */
1812
1813 if (in_dev == NULL)
1814 return -EINVAL;
1815
1e637c74 1816 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
f97c1e0c 1817 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1da177e4
LT
1818 goto e_inval;
1819
f97c1e0c
JP
1820 if (ipv4_is_zeronet(saddr)) {
1821 if (!ipv4_is_local_multicast(daddr))
1da177e4
LT
1822 goto e_inval;
1823 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1824 } else if (fib_validate_source(saddr, 0, tos, 0,
1825 dev, &spec_dst, &itag) < 0)
1826 goto e_inval;
1827
1828 rth = dst_alloc(&ipv4_dst_ops);
1829 if (!rth)
1830 goto e_nobufs;
1831
1832 rth->u.dst.output= ip_rt_bug;
1833
1834 atomic_set(&rth->u.dst.__refcnt, 1);
1835 rth->u.dst.flags= DST_HOST;
42f811b8 1836 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1da177e4
LT
1837 rth->u.dst.flags |= DST_NOPOLICY;
1838 rth->fl.fl4_dst = daddr;
1839 rth->rt_dst = daddr;
1840 rth->fl.fl4_tos = tos;
47dcf0cb 1841 rth->fl.mark = skb->mark;
1da177e4
LT
1842 rth->fl.fl4_src = saddr;
1843 rth->rt_src = saddr;
1844#ifdef CONFIG_NET_CLS_ROUTE
1845 rth->u.dst.tclassid = itag;
1846#endif
1847 rth->rt_iif =
1848 rth->fl.iif = dev->ifindex;
2774c7ab 1849 rth->u.dst.dev = init_net.loopback_dev;
1da177e4
LT
1850 dev_hold(rth->u.dst.dev);
1851 rth->idev = in_dev_get(rth->u.dst.dev);
1852 rth->fl.oif = 0;
1853 rth->rt_gateway = daddr;
1854 rth->rt_spec_dst= spec_dst;
e84f84f2 1855 rth->rt_genid = rt_genid(dev_net(dev));
1da177e4 1856 rth->rt_flags = RTCF_MULTICAST;
29e75252 1857 rth->rt_type = RTN_MULTICAST;
1da177e4
LT
1858 if (our) {
1859 rth->u.dst.input= ip_local_deliver;
1860 rth->rt_flags |= RTCF_LOCAL;
1861 }
1862
1863#ifdef CONFIG_IP_MROUTE
f97c1e0c 1864 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1da177e4
LT
1865 rth->u.dst.input = ip_mr_input;
1866#endif
1867 RT_CACHE_STAT_INC(in_slow_mc);
1868
1869 in_dev_put(in_dev);
e84f84f2 1870 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
511c3f92 1871 return rt_intern_hash(hash, rth, NULL, skb);
1da177e4
LT
1872
1873e_nobufs:
1874 in_dev_put(in_dev);
1875 return -ENOBUFS;
1876
1877e_inval:
1878 in_dev_put(in_dev);
1879 return -EINVAL;
1880}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * RFC1812 recommendation: if the source is martian,
		 * the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
		       &daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
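
/*
 * Example of the resulting log output (illustrative values; the 14
 * link-layer bytes on Ethernet are dst MAC, src MAC and ethertype):
 *
 *	martian source 10.0.0.2 from 127.0.0.1, on dev eth0
 *	ll header: 00:16:3e:11:22:33:00:16:3e:44:55:66:08:00
 */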

static int __mkroute_input(struct sk_buff *skb,
			   struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input_slow(). "
					 "Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);
		err = -EINVAL;
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy ARP. DNAT routes are always valid.
		 */
		if (out_dev == in_dev) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst = daddr;
	rth->fl.fl4_tos = tos;
	rth->fl.mark = skb->mark;
	rth->fl.fl4_src = saddr;
	rth->rt_src = saddr;
	rth->rt_gateway = daddr;
	rth->rt_iif =
	rth->fl.iif = in_dev->dev->ifindex;
	rth->u.dst.dev = out_dev->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev = in_dev_get(rth->u.dst.dev);
	rth->fl.oif = 0;
	rth->rt_spec_dst = spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;
	rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi *fl,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl->iif,
		       rt_genid(dev_net(rth->u.dst.dev)));
	return rt_intern_hash(hash, rth, NULL, skb);
}

/*
 * NOTE. We drop all packets that have a local source
 * address, because every properly looped-back packet
 * must already have the correct destination attached by the output routine.
 *
 * This approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with a 100% guarantee.
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
				      } },
			    .mark = skb->mark,
			    .iif = dev->ifindex };
	unsigned flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	unsigned hash;
	__be32 spec_dst;
	int err = -EINVAL;
	int free_res = 0;
	struct net *net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only for the limited broadcast;
	 * I am not yet sure whether to fix this. Waiting for complaints. :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
	    ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 * Now we are ready to route the packet.
	 */
	if ((err = fib_lookup(net, &fl, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos,
					     net->loopback_dev->ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
done:
	in_dev_put(in_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output = ip_rt_bug;
	rth->rt_genid = rt_genid(net);

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst = daddr;
	rth->fl.fl4_tos = tos;
	rth->fl.mark = skb->mark;
	rth->fl.fl4_src = saddr;
	rth->rt_src = saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif =
	rth->fl.iif = dev->ifindex;
	rth->u.dst.dev = net->loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev = in_dev_get(rth->u.dst.dev);
	rth->rt_gateway = daddr;
	rth->rt_spec_dst = spec_dst;
	rth->u.dst.input = ip_local_deliver;
	rth->rt_flags = flags | RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input = ip_error;
		rth->u.dst.error = -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}
	rth->rt_type = res.type;
	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
	err = rt_intern_hash(hash, rth, NULL, skb);
	goto done;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 * Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
		       &daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto done;

e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}

int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable *rth;
	unsigned hash;
	int iif = dev->ifindex;
	struct net *net;

	net = dev_net(dev);

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
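		/*
		 * Compare all flow key fields in one pass: XOR each
		 * cached field with the lookup key and OR the results,
		 * so the sum is zero only on an exact match (oif must
		 * be zero for an input route, so it is OR-ed in as-is).
		 */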
		if (((rth->fl.fl4_dst ^ daddr) |
		     (rth->fl.fl4_src ^ saddr) |
		     (rth->fl.iif ^ iif) |
		     rth->fl.oif |
		     (rth->fl.fl4_tos ^ tos)) == 0 &&
		    rth->fl.mark == skb->mark &&
		    net_eq(dev_net(rth->u.dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->u.dst, jiffies);
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb_dst_set(skb, &rth->u.dst);
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
	rcu_read_unlock();

skip_cache:
	/* Multicast recognition logic was moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicast
	   network acquires a lot of useless route cache entries, e.g. for
	   SDR messages from all over the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), this does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev;

		rcu_read_lock();
		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr, saddr,
					      ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
			    || (!ipv4_is_local_multicast(daddr) &&
				IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				rcu_read_unlock();
				return ip_route_input_mc(skb, daddr, saddr,
							 tos, dev, our);
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}

static int __mkroute_output(struct rtable **result,
			    struct fib_result *res,
			    const struct flowi *fl,
			    const struct flowi *oldflp,
			    struct net_device *dev_out,
			    unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);
	int err = 0;

	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
		return -EINVAL;

	if (fl->fl4_dst == htonl(0xFFFFFFFF))
		res->type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	/* get a working reference to the inet device */
	in_dev = in_dev_get(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		if (res->fi) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use
		   the default one, but do not gateway in this case.
		   Yes, it is a hack.
		 */
		if (res->fi && res->prefixlen < 4) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;

	rth->fl.fl4_dst = oldflp->fl4_dst;
	rth->fl.fl4_tos = tos;
	rth->fl.fl4_src = oldflp->fl4_src;
	rth->fl.oif = oldflp->oif;
	rth->fl.mark = oldflp->mark;
	rth->rt_dst = fl->fl4_dst;
	rth->rt_src = fl->fl4_src;
	rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be held by the
	   routing cache entry */
	rth->u.dst.dev = dev_out;
	dev_hold(dev_out);
	rth->idev = in_dev_get(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst = fl->fl4_src;

	rth->u.dst.output = ip_output;
	rth->rt_genid = rt_genid(dev_net(dev_out));

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;

	*result = rth;
cleanup:
	/* release the working reference to the inet device */
	in_dev_put(in_dev);

	return err;
}

static int ip_mkroute_output(struct rtable **rp,
			     struct fib_result *res,
			     const struct flowi *fl,
			     const struct flowi *oldflp,
			     struct net_device *dev_out,
			     unsigned flags)
{
	struct rtable *rth = NULL;
	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
	unsigned hash;
	if (err == 0) {
		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
			       rt_genid(dev_net(dev_out)));
		err = rt_intern_hash(hash, rth, rp, NULL);
	}

	return err;
}

/*
 * Major route resolver routine.
 */

static int ip_route_output_slow(struct net *net, struct rtable **rp,
				const struct flowi *oldflp)
{
	u32 tos = RT_FL_TOS(oldflp);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = oldflp->fl4_dst,
					.saddr = oldflp->fl4_src,
					.tos = tos & IPTOS_RT_MASK,
					.scope = ((tos & RTO_ONLINK) ?
						  RT_SCOPE_LINK :
						  RT_SCOPE_UNIVERSE),
				      } },
			    .mark = oldflp->mark,
			    .iif = net->loopback_dev->ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned flags = 0;
	struct net_device *dev_out = NULL;
	int free_res = 0;
	int err;


	res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (ipv4_is_multicast(oldflp->fl4_src) ||
		    ipv4_is_lbcast(oldflp->fl4_src) ||
		    ipv4_is_zeronet(oldflp->fl4_src))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface,
		      if saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with the
		      saddr of another iface. --ANK
		 */

		if (oldflp->oif == 0
		    && (ipv4_is_multicast(oldflp->fl4_dst) ||
			oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = ip_dev_find(net, oldflp->fl4_src);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source addr (look,
			   the routing cache cannot know that ttl is zero,
			   so the packet will not leave this host and the
			   route is valid).
			   Luckily, this hack is a good workaround.
			 */
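
			/*
			 * Illustrative userspace sketch (not part of this
			 * file): the conventional way to pin the outgoing
			 * interface for multicast is IP_MULTICAST_IF, e.g.
			 *
			 *	struct in_addr ifaddr = { .s_addr = src };
			 *	setsockopt(fd, IPPROTO_IP, IP_MULTICAST_IF,
			 *		   &ifaddr, sizeof(ifaddr));
			 *
			 * The hack above spares such applications that
			 * step: binding to a local source address alone
			 * selects the interface owning it.
			 */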

			fl.oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = ip_dev_find(net, oldflp->fl4_src);
			if (dev_out == NULL)
				goto out;
			dev_put(dev_out);
			dev_out = NULL;
		}
	}


	if (oldflp->oif) {
		dev_out = dev_get_by_index(net, oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;

		/* RACE: Check the return value of inet_select_addr instead. */
		if (__in_dev_get_rtnl(dev_out) == NULL) {
			dev_put(dev_out);
			goto out;	/* Wrong error code */
		}

		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (ipv4_is_multicast(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		if (dev_out)
			dev_put(dev_out);
		dev_out = net->loopback_dev;
		dev_hold(dev_out);
		fl.oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, &fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, the routing tables are wrong. Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, the routing
			   tables are looked up with only one purpose:
			   to catch if the destination is gatewayed, rather
			   than direct. Moreover, if MSG_DONTROUTE is set,
			   we send the packet, ignoring both the routing
			   tables and the ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		err = -ENETUNREACH;
		goto out;
	}
	free_res = 1;

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src)
			fl.fl4_src = fl.fl4_dst;
		if (dev_out)
			dev_put(dev_out);
		dev_out = net->loopback_dev;
		dev_hold(dev_out);
		fl.oif = dev_out->ifindex;
		if (res.fi)
			fib_info_put(res.fi);
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(net, &fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	if (dev_out)
		dev_put(dev_out);
	dev_out = FIB_RES_DEV(res);
	dev_hold(dev_out);
	fl.oif = dev_out->ifindex;


make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);


	if (free_res)
		fib_res_put(&res);
	if (dev_out)
		dev_put(dev_out);
out:	return err;
}

int __ip_route_output_key(struct net *net, struct rtable **rp,
			  const struct flowi *flp)
{
	unsigned hash;
	struct rtable *rth;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rth->fl.iif == 0 &&
		    rth->fl.oif == flp->oif &&
		    rth->fl.mark == flp->mark &&
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
		      (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->u.dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->u.dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, rp, flp);
}

EXPORT_SYMBOL_GPL(__ip_route_output_key);

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family = AF_INET,
	.protocol = cpu_to_be16(ETH_P_IP),
	.destroy = ipv4_dst_destroy,
	.check = ipv4_dst_check,
	.update_pmtu = ipv4_rt_blackhole_update_pmtu,
	.entries = ATOMIC_INIT(0),
};


static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->u.dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));

		new->dev = ort->u.dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->fl = ort->fl;

		rt->idev = ort->idev;
		if (rt->idev)
			in_dev_hold(rt->idev);
		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);

		dst_free(new);
	}

	dst_release(&(*rp)->u.dst);
	*rp = rt;
	return (rt ? 0 : -ENOMEM);
}
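
/*
 * Note: the blackhole dst built above silently discards everything
 * (dst_discard on both input and output). ip_route_output_flow()
 * below substitutes it when __xfrm_lookup() returns -EREMOTE, i.e.
 * typically while an asynchronous xfrm/IPsec resolution is still
 * pending, so callers get a valid but traffic-eating route instead
 * of a hard error.
 */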

int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
			 struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
		return err;

	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
				    flags ? XFRM_LOOKUP_WAIT : 0);
		if (err == -EREMOTE)
			err = ipv4_dst_blackhole(net, rp, flp);

		return err;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(ip_route_output_flow);

int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}
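
/*
 * Typical in-kernel usage (an illustrative sketch, not a call site in
 * this file):
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst } } };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(net, &rt, &fl))
 *		return -EHOSTUNREACH;
 *	... use rt->u.dst, then drop the reference with ip_rt_put(rt);
 */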

static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;
	r->rtm_src_len = 0;
	r->rtm_tos = rt->fl.fl4_tos;
	r->rtm_table = RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
#endif
	if (rt->fl.iif)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	error = rt->u.dst.error;
	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
	if (rt->peer) {
		id = rt->peer->ip_id_count;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol = htons(ETH_P_IP);
		skb->dev = dev;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		struct flowi fl = {
			.nl_u = {
				.ip4_u = {
					.daddr = dst,
					.saddr = src,
					.tos = rtm->rtm_tos,
				},
			},
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
		};
		err = ip_route_output_key(net, &rt, &fl);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->u.dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
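
/*
 * Illustrative userspace sketch (not part of this file): the handler
 * above services the same request "ip route get" sends, e.g.
 *
 *	struct {
 *		struct nlmsghdr nlh;
 *		struct rtmsg rtm;
 *		char buf[64];
 *	} req = {
 *		.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)),
 *		.nlh.nlmsg_type = RTM_GETROUTE,
 *		.nlh.nlmsg_flags = NLM_F_REQUEST,
 *		.rtm.rtm_family = AF_INET,
 *	};
 *
 * followed by an RTA_DST attribute carrying the destination address,
 * sent over a NETLINK_ROUTE socket; the RTM_NEWROUTE reply read back
 * is the message built by rt_fill_info() above.
 */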

int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set(skb, dst_clone(&rt->u.dst));
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}

#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
				     struct file *filp, void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
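
/*
 * Illustrative userspace sketch (not part of this file): the handler
 * above runs when a delay value is written to the per-netns flush
 * file registered below, e.g.
 *
 *	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);
 *	if (fd >= 0) {
 *		write(fd, "0", 1);	(a delay of 0 flushes at once)
 *		close(fd);
 *	}
 */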

static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
					      void __user *oldval,
					      size_t __user *oldlenp,
					      void __user *newval,
					      size_t newlen)
{
	int delay;
	struct net *net;
	if (newlen != sizeof(int))
		return -EINVAL;
	if (get_user(delay, (int __user *)newval))
		return -EFAULT;
	net = (struct net *)table->extra1;
	rt_cache_flush(net, delay);
	return 0;
}

static void rt_secret_reschedule(int old)
{
	struct net *net;
	int new = ip_rt_secret_interval;
	int diff = new - old;

	if (!diff)
		return;

	rtnl_lock();
	for_each_net(net) {
		int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);

		if (!new)
			continue;

		if (deleted) {
			long time = net->ipv4.rt_secret_timer.expires - jiffies;

			if (time <= 0 || (time += diff) <= 0)
				time = 0;

			net->ipv4.rt_secret_timer.expires = time;
		} else
			net->ipv4.rt_secret_timer.expires = new;

		net->ipv4.rt_secret_timer.expires += jiffies;
		add_timer(&net->ipv4.rt_secret_timer);
	}
	rtnl_unlock();
}

static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
					  struct file *filp,
					  void __user *buffer, size_t *lenp,
					  loff_t *ppos)
{
	int old = ip_rt_secret_interval;
	int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);

	rt_secret_reschedule(old);

	return ret;
}

static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
						   void __user *oldval,
						   size_t __user *oldlenp,
						   void __user *newval,
						   size_t newlen)
{
	int old = ip_rt_secret_interval;
	int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);

	rt_secret_reschedule(old);

	return ret;
}

static ctl_table ipv4_route_table[] = {
	{
		.ctl_name = NET_IPV4_ROUTE_GC_THRESH,
		.procname = "gc_thresh",
		.data = &ipv4_dst_ops.gc_thresh,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
		.procname = "max_size",
		.data = &ip_rt_max_size,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
		.procname = "gc_min_interval",
		.data = &ip_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
		.strategy = sysctl_jiffies,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
		.procname = "gc_min_interval_ms",
		.data = &ip_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_ms_jiffies,
		.strategy = sysctl_ms_jiffies,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
		.procname = "gc_timeout",
		.data = &ip_rt_gc_timeout,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
		.strategy = sysctl_jiffies,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
		.procname = "gc_interval",
		.data = &ip_rt_gc_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
		.strategy = sysctl_jiffies,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
		.procname = "redirect_load",
		.data = &ip_rt_redirect_load,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
		.procname = "redirect_number",
		.data = &ip_rt_redirect_number,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
		.procname = "redirect_silence",
		.data = &ip_rt_redirect_silence,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_ERROR_COST,
		.procname = "error_cost",
		.data = &ip_rt_error_cost,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
		.procname = "error_burst",
		.data = &ip_rt_error_burst,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
		.procname = "gc_elasticity",
		.data = &ip_rt_gc_elasticity,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
		.procname = "mtu_expires",
		.data = &ip_rt_mtu_expires,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
		.strategy = sysctl_jiffies,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
		.procname = "min_pmtu",
		.data = &ip_rt_min_pmtu,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
		.procname = "min_adv_mss",
		.data = &ip_rt_min_advmss,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
		.procname = "secret_interval",
		.data = &ip_rt_secret_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = ipv4_sysctl_rt_secret_interval,
		.strategy = ipv4_sysctl_rt_secret_interval_strategy,
	},
	{ .ctl_name = 0 }
};
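
/*
 * Each entry above appears under /proc/sys/net/ipv4/route/<procname>,
 * e.g. /proc/sys/net/ipv4/route/gc_thresh; all are mode 0644, i.e.
 * world-readable and writable by root.
 */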

static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE,
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
	  .mode = 0555, .child = empty},
	{ }
};

static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", .ctl_name = CTL_NET, },
	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
	{ },
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.ctl_name = NET_IPV4_ROUTE_FLUSH,
		.procname = "flush",
		.maxlen = sizeof(int),
		.mode = 0200,
		.proc_handler = ipv4_sysctl_rtcache_flush,
		.strategy = ipv4_sysctl_rtcache_flush_strategy,
	},
	{ .ctl_name = 0 },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", .ctl_name = CTL_NET, },
	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
	{ },
};

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (net != &init_net) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif


static __net_init int rt_secret_timer_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid,
		   (int) ((num_physpages ^ (num_physpages>>8)) ^
			  (jiffies ^ (jiffies >> 7))));

	net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
	net->ipv4.rt_secret_timer.data = (unsigned long)net;
	init_timer_deferrable(&net->ipv4.rt_secret_timer);

	if (ip_rt_secret_interval) {
		net->ipv4.rt_secret_timer.expires =
			jiffies + net_random() % ip_rt_secret_interval +
			ip_rt_secret_interval;
		add_timer(&net->ipv4.rt_secret_timer);
	}
	return 0;
}

static __net_exit void rt_secret_timer_exit(struct net *net)
{
	del_timer_sync(&net->ipv4.rt_secret_timer);
}

static __net_initdata struct pernet_operations rt_secret_timer_ops = {
	.init = rt_secret_timer_init,
	.exit = rt_secret_timer_exit,
};


#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);

int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_NET_CLS_ROUTE
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct),
				    __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(num_physpages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;
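
	/*
	 * Sizing example (illustrative): with rt_hash_mask = 0xFFFF,
	 * i.e. 65536 hash buckets, garbage collection is triggered
	 * once the cache holds 65536 routes, and the cache is hard
	 * capped at 16 entries per bucket on average, i.e. 1048576
	 * routes.
	 */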

	devinet_init();
	ip_fib_init();

	/* All timers started at system startup tend
	   to synchronize. Perturb them a bit.
	 */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
			      net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (register_pernet_subsys(&rt_secret_timer_ops))
		printk(KERN_ERR "Unable to setup rt_secret_timer\n");

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif

EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);