]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/ipv4/route.c
[IPV4]: Only panic if inetdev_init fails for loopback
[net-next-2.6.git] / net / ipv4 / route.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9 *
02c30a84 10 * Authors: Ross Biro
1da177e4
LT
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 *
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
e905a9ed 23 * Alan Cox : Super /proc >4K
1da177e4
LT
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
e905a9ed 41 *
1da177e4
LT
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
58 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
60 *
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
65 */
66
1da177e4
LT
67#include <linux/module.h>
68#include <asm/uaccess.h>
69#include <asm/system.h>
70#include <linux/bitops.h>
71#include <linux/types.h>
72#include <linux/kernel.h>
1da177e4 73#include <linux/mm.h>
424c4b70 74#include <linux/bootmem.h>
1da177e4
LT
75#include <linux/string.h>
76#include <linux/socket.h>
77#include <linux/sockios.h>
78#include <linux/errno.h>
79#include <linux/in.h>
80#include <linux/inet.h>
81#include <linux/netdevice.h>
82#include <linux/proc_fs.h>
83#include <linux/init.h>
84#include <linux/skbuff.h>
1da177e4
LT
85#include <linux/inetdevice.h>
86#include <linux/igmp.h>
87#include <linux/pkt_sched.h>
88#include <linux/mroute.h>
89#include <linux/netfilter_ipv4.h>
90#include <linux/random.h>
91#include <linux/jhash.h>
92#include <linux/rcupdate.h>
93#include <linux/times.h>
94#include <net/protocol.h>
95#include <net/ip.h>
96#include <net/route.h>
97#include <net/inetpeer.h>
98#include <net/sock.h>
99#include <net/ip_fib.h>
100#include <net/arp.h>
101#include <net/tcp.h>
102#include <net/icmp.h>
103#include <net/xfrm.h>
104#include <net/ip_mp_alg.h>
8d71740c 105#include <net/netevent.h>
63f3444f 106#include <net/rtnetlink.h>
1da177e4
LT
107#ifdef CONFIG_SYSCTL
108#include <linux/sysctl.h>
109#endif
110
/*
 * Extract the TOS bits used for routing, together with the RTO_ONLINK flag,
 * from the flow key pointed to by @oldflp.  The argument is parenthesized so
 * that any pointer-valued expression (e.g. "p + 1" or "&fl") may be passed.
 */
#define RT_FL_TOS(oldflp) \
	((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113
114#define IP_MAX_MTU 0xFFF0
115
116#define RT_GC_TIMEOUT (300*HZ)
117
118static int ip_rt_min_delay = 2 * HZ;
119static int ip_rt_max_delay = 10 * HZ;
120static int ip_rt_max_size;
121static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
122static int ip_rt_gc_interval = 60 * HZ;
123static int ip_rt_gc_min_interval = HZ / 2;
124static int ip_rt_redirect_number = 9;
125static int ip_rt_redirect_load = HZ / 50;
126static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
127static int ip_rt_error_cost = HZ;
128static int ip_rt_error_burst = 5 * HZ;
129static int ip_rt_gc_elasticity = 8;
130static int ip_rt_mtu_expires = 10 * 60 * HZ;
131static int ip_rt_min_pmtu = 512 + 20 + 20;
132static int ip_rt_min_advmss = 256;
133static int ip_rt_secret_interval = 10 * 60 * HZ;
134static unsigned long rt_deadline;
135
136#define RTprint(a...) printk(KERN_DEBUG a)
137
138static struct timer_list rt_flush_timer;
139static struct timer_list rt_periodic_timer;
140static struct timer_list rt_secret_timer;
141
142/*
143 * Interface to generic destination cache.
144 */
145
146static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
147static void ipv4_dst_destroy(struct dst_entry *dst);
148static void ipv4_dst_ifdown(struct dst_entry *dst,
149 struct net_device *dev, int how);
150static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
151static void ipv4_link_failure(struct sk_buff *skb);
152static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
153static int rt_garbage_collect(void);
154
155
156static struct dst_ops ipv4_dst_ops = {
157 .family = AF_INET,
158 .protocol = __constant_htons(ETH_P_IP),
159 .gc = rt_garbage_collect,
160 .check = ipv4_dst_check,
161 .destroy = ipv4_dst_destroy,
162 .ifdown = ipv4_dst_ifdown,
163 .negative_advice = ipv4_negative_advice,
164 .link_failure = ipv4_link_failure,
165 .update_pmtu = ip_rt_update_pmtu,
166 .entry_size = sizeof(struct rtable),
167};
168
169#define ECN_OR_COST(class) TC_PRIO_##class
170
171__u8 ip_tos2prio[16] = {
172 TC_PRIO_BESTEFFORT,
173 ECN_OR_COST(FILLER),
174 TC_PRIO_BESTEFFORT,
175 ECN_OR_COST(BESTEFFORT),
176 TC_PRIO_BULK,
177 ECN_OR_COST(BULK),
178 TC_PRIO_BULK,
179 ECN_OR_COST(BULK),
180 TC_PRIO_INTERACTIVE,
181 ECN_OR_COST(INTERACTIVE),
182 TC_PRIO_INTERACTIVE,
183 ECN_OR_COST(INTERACTIVE),
184 TC_PRIO_INTERACTIVE_BULK,
185 ECN_OR_COST(INTERACTIVE_BULK),
186 TC_PRIO_INTERACTIVE_BULK,
187 ECN_OR_COST(INTERACTIVE_BULK)
188};
189
190
191/*
192 * Route cache.
193 */
194
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries;
 *    they do so with atomic increments and with the
 *    lock held.
 */
204
/* One slot of the route cache hash table: head of a singly linked chain. */
struct rt_hash_bucket {
	struct rtable	*chain;
};
8a25d5de
IM
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;

/*
 * Address of the spinlock covering hash slot @slot.  The whole expansion is
 * parenthesized so the macro can be used inside larger expressions.
 */
# define rt_hash_lock_addr(slot) \
	(&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/*
 * Allocate and initialise the lock table; panics if allocation fails.
 * Wrapped in do { } while (0) so it behaves as a single statement, e.g. as
 * the body of an unbraced if/else.
 */
# define rt_hash_lock_init()	do { \
		int i; \
		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
			spin_lock_init(&rt_hash_locks[i]); \
	} while (0)
#else
/* No lock table in this configuration: the address is NULL, init is a no-op. */
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()	do { } while (0)
#endif
1da177e4
LT
244
245static struct rt_hash_bucket *rt_hash_table;
246static unsigned rt_hash_mask;
247static int rt_hash_log;
248static unsigned int rt_hash_rnd;
249
2f970d83 250static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
dbd2915c 251#define RT_CACHE_STAT_INC(field) \
bfe5d834 252 (__raw_get_cpu_var(rt_cache_stat).field++)
1da177e4
LT
253
254static int rt_intern_hash(unsigned hash, struct rtable *rth,
255 struct rtable **res);
256
cef2685e 257static unsigned int rt_hash_code(u32 daddr, u32 saddr)
1da177e4 258{
cef2685e 259 return (jhash_2words(daddr, saddr, rt_hash_rnd)
1da177e4
LT
260 & rt_hash_mask);
261}
262
8c7bc840
AV
/*
 * Hash slot for a (daddr, saddr, idx) key: the addresses are taken as
 * __be32 and @idx, shifted left by 5, is XORed into the source address.
 */
#define rt_hash(daddr, saddr, idx)				\
	rt_hash_code((__force u32)(__be32)(daddr),		\
		     (__force u32)(__be32)(saddr) ^ ((idx) << 5))
266
1da177e4
LT
267#ifdef CONFIG_PROC_FS
/* Per-open iterator state for the route cache seq_file. */
struct rt_cache_iter_state {
	int	bucket;		/* hash slot currently being walked */
};
271
272static struct rtable *rt_cache_get_first(struct seq_file *seq)
273{
274 struct rtable *r = NULL;
275 struct rt_cache_iter_state *st = seq->private;
276
277 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
278 rcu_read_lock_bh();
279 r = rt_hash_table[st->bucket].chain;
280 if (r)
281 break;
282 rcu_read_unlock_bh();
283 }
284 return r;
285}
286
287static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
288{
289 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
290
093c2ca4 291 r = r->u.dst.rt_next;
1da177e4
LT
292 while (!r) {
293 rcu_read_unlock_bh();
294 if (--st->bucket < 0)
295 break;
296 rcu_read_lock_bh();
297 r = rt_hash_table[st->bucket].chain;
298 }
299 return r;
300}
301
302static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
303{
304 struct rtable *r = rt_cache_get_first(seq);
305
306 if (r)
307 while (pos && (r = rt_cache_get_next(seq, r)))
308 --pos;
309 return pos ? NULL : r;
310}
311
312static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
313{
314 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
315}
316
317static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
318{
319 struct rtable *r = NULL;
320
321 if (v == SEQ_START_TOKEN)
322 r = rt_cache_get_first(seq);
323 else
324 r = rt_cache_get_next(seq, v);
325 ++*pos;
326 return r;
327}
328
329static void rt_cache_seq_stop(struct seq_file *seq, void *v)
330{
331 if (v && v != SEQ_START_TOKEN)
332 rcu_read_unlock_bh();
333}
334
335static int rt_cache_seq_show(struct seq_file *seq, void *v)
336{
337 if (v == SEQ_START_TOKEN)
338 seq_printf(seq, "%-127s\n",
339 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
340 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
341 "HHUptod\tSpecDst");
342 else {
343 struct rtable *r = v;
344 char temp[256];
345
346 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
347 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
348 r->u.dst.dev ? r->u.dst.dev->name : "*",
349 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
350 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
351 r->u.dst.__use, 0, (unsigned long)r->rt_src,
352 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
353 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
354 dst_metric(&r->u.dst, RTAX_WINDOW),
355 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
356 dst_metric(&r->u.dst, RTAX_RTTVAR)),
357 r->fl.fl4_tos,
358 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
359 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
360 dev_queue_xmit) : 0,
361 r->rt_spec_dst);
362 seq_printf(seq, "%-127s\n", temp);
e905a9ed
YH
363 }
364 return 0;
1da177e4
LT
365}
366
f690808e 367static const struct seq_operations rt_cache_seq_ops = {
1da177e4
LT
368 .start = rt_cache_seq_start,
369 .next = rt_cache_seq_next,
370 .stop = rt_cache_seq_stop,
371 .show = rt_cache_seq_show,
372};
373
374static int rt_cache_seq_open(struct inode *inode, struct file *file)
375{
376 struct seq_file *seq;
377 int rc = -ENOMEM;
378 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
379
380 if (!s)
381 goto out;
382 rc = seq_open(file, &rt_cache_seq_ops);
383 if (rc)
384 goto out_kfree;
385 seq = file->private_data;
386 seq->private = s;
387 memset(s, 0, sizeof(*s));
388out:
389 return rc;
390out_kfree:
391 kfree(s);
392 goto out;
393}
394
9a32144e 395static const struct file_operations rt_cache_seq_fops = {
1da177e4
LT
396 .owner = THIS_MODULE,
397 .open = rt_cache_seq_open,
398 .read = seq_read,
399 .llseek = seq_lseek,
400 .release = seq_release_private,
401};
402
403
404static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
405{
406 int cpu;
407
408 if (*pos == 0)
409 return SEQ_START_TOKEN;
410
411 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
412 if (!cpu_possible(cpu))
413 continue;
414 *pos = cpu+1;
2f970d83 415 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
416 }
417 return NULL;
418}
419
420static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
421{
422 int cpu;
423
424 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
425 if (!cpu_possible(cpu))
426 continue;
427 *pos = cpu+1;
2f970d83 428 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
429 }
430 return NULL;
e905a9ed 431
1da177e4
LT
432}
433
/* seq_file ->stop for the per-CPU statistics: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
438
439static int rt_cpu_seq_show(struct seq_file *seq, void *v)
440{
441 struct rt_cache_stat *st = v;
442
443 if (v == SEQ_START_TOKEN) {
5bec0039 444 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
1da177e4
LT
445 return 0;
446 }
e905a9ed 447
1da177e4
LT
448 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
449 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
450 atomic_read(&ipv4_dst_ops.entries),
451 st->in_hit,
452 st->in_slow_tot,
453 st->in_slow_mc,
454 st->in_no_route,
455 st->in_brd,
456 st->in_martian_dst,
457 st->in_martian_src,
458
459 st->out_hit,
460 st->out_slow_tot,
e905a9ed 461 st->out_slow_mc,
1da177e4
LT
462
463 st->gc_total,
464 st->gc_ignored,
465 st->gc_goal_miss,
466 st->gc_dst_overflow,
467 st->in_hlist_search,
468 st->out_hlist_search
469 );
470 return 0;
471}
472
f690808e 473static const struct seq_operations rt_cpu_seq_ops = {
1da177e4
LT
474 .start = rt_cpu_seq_start,
475 .next = rt_cpu_seq_next,
476 .stop = rt_cpu_seq_stop,
477 .show = rt_cpu_seq_show,
478};
479
480
481static int rt_cpu_seq_open(struct inode *inode, struct file *file)
482{
483 return seq_open(file, &rt_cpu_seq_ops);
484}
485
9a32144e 486static const struct file_operations rt_cpu_seq_fops = {
1da177e4
LT
487 .owner = THIS_MODULE,
488 .open = rt_cpu_seq_open,
489 .read = seq_read,
490 .llseek = seq_lseek,
491 .release = seq_release,
492};
493
494#endif /* CONFIG_PROC_FS */
e905a9ed 495
1da177e4
LT
496static __inline__ void rt_free(struct rtable *rt)
497{
498 multipath_remove(rt);
499 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
500}
501
502static __inline__ void rt_drop(struct rtable *rt)
503{
504 multipath_remove(rt);
505 ip_rt_put(rt);
506 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
507}
508
509static __inline__ int rt_fast_clean(struct rtable *rth)
510{
511 /* Kill broadcast/multicast entries very aggresively, if they
512 collide in hash table with more useful entries */
513 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
093c2ca4 514 rth->fl.iif && rth->u.dst.rt_next;
1da177e4
LT
515}
516
517static __inline__ int rt_valuable(struct rtable *rth)
518{
519 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
520 rth->u.dst.expires;
521}
522
523static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
524{
525 unsigned long age;
526 int ret = 0;
527
528 if (atomic_read(&rth->u.dst.__refcnt))
529 goto out;
530
531 ret = 1;
532 if (rth->u.dst.expires &&
533 time_after_eq(jiffies, rth->u.dst.expires))
534 goto out;
535
536 age = jiffies - rth->u.dst.lastuse;
537 ret = 0;
538 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
539 (age <= tmo2 && rt_valuable(rth)))
540 goto out;
541 ret = 1;
542out: return ret;
543}
544
545/* Bits of score are:
546 * 31: very valuable
547 * 30: not quite useless
548 * 29..0: usage counter
549 */
550static inline u32 rt_score(struct rtable *rt)
551{
552 u32 score = jiffies - rt->u.dst.lastuse;
553
554 score = ~score & ~(3<<30);
555
556 if (rt_valuable(rt))
557 score |= (1<<31);
558
559 if (!rt->fl.iif ||
560 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
561 score |= (1<<30);
562
563 return score;
564}
565
566static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
567{
714e85be
AV
568 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
569 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
47dcf0cb 570 (fl1->mark ^ fl2->mark) |
8238b218
DM
571 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
572 *(u16 *)&fl2->nl_u.ip4_u.tos) |
573 (fl1->oif ^ fl2->oif) |
574 (fl1->iif ^ fl2->iif)) == 0;
1da177e4
LT
575}
576
577#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
578static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
579 struct rtable *expentry,
580 int *removed_count)
581{
582 int passedexpired = 0;
583 struct rtable **nextstep = NULL;
584 struct rtable **rthp = chain_head;
585 struct rtable *rth;
586
587 if (removed_count)
588 *removed_count = 0;
589
590 while ((rth = *rthp) != NULL) {
591 if (rth == expentry)
592 passedexpired = 1;
593
594 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
595 compare_keys(&(*rthp)->fl, &expentry->fl)) {
596 if (*rthp == expentry) {
093c2ca4 597 *rthp = rth->u.dst.rt_next;
1da177e4
LT
598 continue;
599 } else {
093c2ca4 600 *rthp = rth->u.dst.rt_next;
1da177e4
LT
601 rt_free(rth);
602 if (removed_count)
603 ++(*removed_count);
604 }
605 } else {
606 if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
607 passedexpired && !nextstep)
093c2ca4 608 nextstep = &rth->u.dst.rt_next;
1da177e4 609
093c2ca4 610 rthp = &rth->u.dst.rt_next;
1da177e4
LT
611 }
612 }
613
614 rt_free(expentry);
615 if (removed_count)
616 ++(*removed_count);
617
618 return nextstep;
619}
620#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
621
622
623/* This runs via a timer and thus is always in BH context. */
624static void rt_check_expire(unsigned long dummy)
625{
bb1d23b0
ED
626 static unsigned int rover;
627 unsigned int i = rover, goal;
1da177e4
LT
628 struct rtable *rth, **rthp;
629 unsigned long now = jiffies;
bb1d23b0
ED
630 u64 mult;
631
632 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
633 if (ip_rt_gc_timeout > 1)
634 do_div(mult, ip_rt_gc_timeout);
635 goal = (unsigned int)mult;
636 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
637 for (; goal > 0; goal--) {
1da177e4
LT
638 unsigned long tmo = ip_rt_gc_timeout;
639
640 i = (i + 1) & rt_hash_mask;
641 rthp = &rt_hash_table[i].chain;
642
bb1d23b0
ED
643 if (*rthp == 0)
644 continue;
22c047cc 645 spin_lock(rt_hash_lock_addr(i));
1da177e4
LT
646 while ((rth = *rthp) != NULL) {
647 if (rth->u.dst.expires) {
648 /* Entry is expired even if it is in use */
649 if (time_before_eq(now, rth->u.dst.expires)) {
650 tmo >>= 1;
093c2ca4 651 rthp = &rth->u.dst.rt_next;
1da177e4
LT
652 continue;
653 }
654 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
655 tmo >>= 1;
093c2ca4 656 rthp = &rth->u.dst.rt_next;
1da177e4
LT
657 continue;
658 }
659
660 /* Cleanup aged off entries. */
661#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
662 /* remove all related balanced entries if necessary */
663 if (rth->u.dst.flags & DST_BALANCED) {
664 rthp = rt_remove_balanced_route(
665 &rt_hash_table[i].chain,
666 rth, NULL);
667 if (!rthp)
668 break;
669 } else {
093c2ca4 670 *rthp = rth->u.dst.rt_next;
1da177e4
LT
671 rt_free(rth);
672 }
673#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
093c2ca4 674 *rthp = rth->u.dst.rt_next;
e905a9ed 675 rt_free(rth);
1da177e4
LT
676#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
677 }
22c047cc 678 spin_unlock(rt_hash_lock_addr(i));
1da177e4
LT
679
680 /* Fallback loop breaker. */
681 if (time_after(jiffies, now))
682 break;
683 }
684 rover = i;
bb1d23b0 685 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
1da177e4
LT
686}
687
688/* This can run from both BH and non-BH contexts, the latter
689 * in the case of a forced flush event.
690 */
691static void rt_run_flush(unsigned long dummy)
692{
693 int i;
694 struct rtable *rth, *next;
695
696 rt_deadline = 0;
697
698 get_random_bytes(&rt_hash_rnd, 4);
699
700 for (i = rt_hash_mask; i >= 0; i--) {
22c047cc 701 spin_lock_bh(rt_hash_lock_addr(i));
1da177e4
LT
702 rth = rt_hash_table[i].chain;
703 if (rth)
704 rt_hash_table[i].chain = NULL;
22c047cc 705 spin_unlock_bh(rt_hash_lock_addr(i));
1da177e4
LT
706
707 for (; rth; rth = next) {
093c2ca4 708 next = rth->u.dst.rt_next;
1da177e4
LT
709 rt_free(rth);
710 }
711 }
712}
713
714static DEFINE_SPINLOCK(rt_flush_lock);
715
716void rt_cache_flush(int delay)
717{
718 unsigned long now = jiffies;
719 int user_mode = !in_softirq();
720
721 if (delay < 0)
722 delay = ip_rt_min_delay;
723
724 /* flush existing multipath state*/
725 multipath_flush();
726
727 spin_lock_bh(&rt_flush_lock);
728
729 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
730 long tmo = (long)(rt_deadline - now);
731
732 /* If flush timer is already running
733 and flush request is not immediate (delay > 0):
734
735 if deadline is not achieved, prolongate timer to "delay",
736 otherwise fire it at deadline time.
737 */
738
739 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
740 tmo = 0;
e905a9ed 741
1da177e4
LT
742 if (delay > tmo)
743 delay = tmo;
744 }
745
746 if (delay <= 0) {
747 spin_unlock_bh(&rt_flush_lock);
748 rt_run_flush(0);
749 return;
750 }
751
752 if (rt_deadline == 0)
753 rt_deadline = now + ip_rt_max_delay;
754
755 mod_timer(&rt_flush_timer, now+delay);
756 spin_unlock_bh(&rt_flush_lock);
757}
758
759static void rt_secret_rebuild(unsigned long dummy)
760{
761 unsigned long now = jiffies;
762
763 rt_cache_flush(0);
764 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
765}
766
/*
   Short description of GC goals.

   We want to build an algorithm that keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle, expire is large enough to keep enough warm entries,
   and when load increases, it is reduced to limit the cache size.
 */
779
780static int rt_garbage_collect(void)
781{
782 static unsigned long expire = RT_GC_TIMEOUT;
783 static unsigned long last_gc;
784 static int rover;
785 static int equilibrium;
786 struct rtable *rth, **rthp;
787 unsigned long now = jiffies;
788 int goal;
789
790 /*
791 * Garbage collection is pretty expensive,
792 * do not make it too frequently.
793 */
794
795 RT_CACHE_STAT_INC(gc_total);
796
797 if (now - last_gc < ip_rt_gc_min_interval &&
798 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
799 RT_CACHE_STAT_INC(gc_ignored);
800 goto out;
801 }
802
803 /* Calculate number of entries, which we want to expire now. */
804 goal = atomic_read(&ipv4_dst_ops.entries) -
805 (ip_rt_gc_elasticity << rt_hash_log);
806 if (goal <= 0) {
807 if (equilibrium < ipv4_dst_ops.gc_thresh)
808 equilibrium = ipv4_dst_ops.gc_thresh;
809 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
810 if (goal > 0) {
811 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
812 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
813 }
814 } else {
815 /* We are in dangerous area. Try to reduce cache really
816 * aggressively.
817 */
818 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
819 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
820 }
821
822 if (now - last_gc >= ip_rt_gc_min_interval)
823 last_gc = now;
824
825 if (goal <= 0) {
826 equilibrium += goal;
827 goto work_done;
828 }
829
830 do {
831 int i, k;
832
833 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
834 unsigned long tmo = expire;
835
836 k = (k + 1) & rt_hash_mask;
837 rthp = &rt_hash_table[k].chain;
22c047cc 838 spin_lock_bh(rt_hash_lock_addr(k));
1da177e4
LT
839 while ((rth = *rthp) != NULL) {
840 if (!rt_may_expire(rth, tmo, expire)) {
841 tmo >>= 1;
093c2ca4 842 rthp = &rth->u.dst.rt_next;
1da177e4
LT
843 continue;
844 }
845#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
846 /* remove all related balanced entries
847 * if necessary
848 */
849 if (rth->u.dst.flags & DST_BALANCED) {
850 int r;
851
852 rthp = rt_remove_balanced_route(
85259878 853 &rt_hash_table[k].chain,
1da177e4
LT
854 rth,
855 &r);
856 goal -= r;
857 if (!rthp)
858 break;
859 } else {
093c2ca4 860 *rthp = rth->u.dst.rt_next;
1da177e4
LT
861 rt_free(rth);
862 goal--;
863 }
864#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
093c2ca4 865 *rthp = rth->u.dst.rt_next;
1da177e4
LT
866 rt_free(rth);
867 goal--;
868#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
869 }
22c047cc 870 spin_unlock_bh(rt_hash_lock_addr(k));
1da177e4
LT
871 if (goal <= 0)
872 break;
873 }
874 rover = k;
875
876 if (goal <= 0)
877 goto work_done;
878
879 /* Goal is not achieved. We stop process if:
880
881 - if expire reduced to zero. Otherwise, expire is halfed.
882 - if table is not full.
883 - if we are called from interrupt.
884 - jiffies check is just fallback/debug loop breaker.
885 We will not spin here for long time in any case.
886 */
887
888 RT_CACHE_STAT_INC(gc_goal_miss);
889
890 if (expire == 0)
891 break;
892
893 expire >>= 1;
894#if RT_CACHE_DEBUG >= 2
895 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
896 atomic_read(&ipv4_dst_ops.entries), goal, i);
897#endif
898
899 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
900 goto out;
901 } while (!in_softirq() && time_before_eq(jiffies, now));
902
903 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
904 goto out;
905 if (net_ratelimit())
906 printk(KERN_WARNING "dst cache overflow\n");
907 RT_CACHE_STAT_INC(gc_dst_overflow);
908 return 1;
909
910work_done:
911 expire += ip_rt_gc_min_interval;
912 if (expire > ip_rt_gc_timeout ||
913 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
914 expire = ip_rt_gc_timeout;
915#if RT_CACHE_DEBUG >= 2
916 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
917 atomic_read(&ipv4_dst_ops.entries), goal, rover);
918#endif
919out: return 0;
920}
921
922static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
923{
924 struct rtable *rth, **rthp;
925 unsigned long now;
926 struct rtable *cand, **candp;
927 u32 min_score;
928 int chain_length;
929 int attempts = !in_softirq();
930
931restart:
932 chain_length = 0;
933 min_score = ~(u32)0;
934 cand = NULL;
935 candp = NULL;
936 now = jiffies;
937
938 rthp = &rt_hash_table[hash].chain;
939
22c047cc 940 spin_lock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
941 while ((rth = *rthp) != NULL) {
942#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
943 if (!(rth->u.dst.flags & DST_BALANCED) &&
944 compare_keys(&rth->fl, &rt->fl)) {
945#else
946 if (compare_keys(&rth->fl, &rt->fl)) {
947#endif
948 /* Put it first */
093c2ca4 949 *rthp = rth->u.dst.rt_next;
1da177e4
LT
950 /*
951 * Since lookup is lockfree, the deletion
952 * must be visible to another weakly ordered CPU before
953 * the insertion at the start of the hash chain.
954 */
093c2ca4 955 rcu_assign_pointer(rth->u.dst.rt_next,
1da177e4
LT
956 rt_hash_table[hash].chain);
957 /*
958 * Since lookup is lockfree, the update writes
959 * must be ordered for consistency on SMP.
960 */
961 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
962
963 rth->u.dst.__use++;
964 dst_hold(&rth->u.dst);
965 rth->u.dst.lastuse = now;
22c047cc 966 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
967
968 rt_drop(rt);
969 *rp = rth;
970 return 0;
971 }
972
973 if (!atomic_read(&rth->u.dst.__refcnt)) {
974 u32 score = rt_score(rth);
975
976 if (score <= min_score) {
977 cand = rth;
978 candp = rthp;
979 min_score = score;
980 }
981 }
982
983 chain_length++;
984
093c2ca4 985 rthp = &rth->u.dst.rt_next;
1da177e4
LT
986 }
987
988 if (cand) {
989 /* ip_rt_gc_elasticity used to be average length of chain
990 * length, when exceeded gc becomes really aggressive.
991 *
992 * The second limit is less certain. At the moment it allows
993 * only 2 entries per bucket. We will see.
994 */
995 if (chain_length > ip_rt_gc_elasticity) {
093c2ca4 996 *candp = cand->u.dst.rt_next;
1da177e4
LT
997 rt_free(cand);
998 }
999 }
1000
1001 /* Try to bind route to arp only if it is output
1002 route or unicast forwarding path.
1003 */
1004 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1005 int err = arp_bind_neighbour(&rt->u.dst);
1006 if (err) {
22c047cc 1007 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1008
1009 if (err != -ENOBUFS) {
1010 rt_drop(rt);
1011 return err;
1012 }
1013
1014 /* Neighbour tables are full and nothing
1015 can be released. Try to shrink route cache,
1016 it is most likely it holds some neighbour records.
1017 */
1018 if (attempts-- > 0) {
1019 int saved_elasticity = ip_rt_gc_elasticity;
1020 int saved_int = ip_rt_gc_min_interval;
1021 ip_rt_gc_elasticity = 1;
1022 ip_rt_gc_min_interval = 0;
1023 rt_garbage_collect();
1024 ip_rt_gc_min_interval = saved_int;
1025 ip_rt_gc_elasticity = saved_elasticity;
1026 goto restart;
1027 }
1028
1029 if (net_ratelimit())
1030 printk(KERN_WARNING "Neighbour table overflow.\n");
1031 rt_drop(rt);
1032 return -ENOBUFS;
1033 }
1034 }
1035
093c2ca4 1036 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1da177e4 1037#if RT_CACHE_DEBUG >= 2
093c2ca4 1038 if (rt->u.dst.rt_next) {
1da177e4
LT
1039 struct rtable *trt;
1040 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1041 NIPQUAD(rt->rt_dst));
093c2ca4 1042 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1da177e4
LT
1043 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1044 printk("\n");
1045 }
1046#endif
1047 rt_hash_table[hash].chain = rt;
22c047cc 1048 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1049 *rp = rt;
1050 return 0;
1051}
1052
1053void rt_bind_peer(struct rtable *rt, int create)
1054{
1055 static DEFINE_SPINLOCK(rt_peer_lock);
1056 struct inet_peer *peer;
1057
1058 peer = inet_getpeer(rt->rt_dst, create);
1059
1060 spin_lock_bh(&rt_peer_lock);
1061 if (rt->peer == NULL) {
1062 rt->peer = peer;
1063 peer = NULL;
1064 }
1065 spin_unlock_bh(&rt_peer_lock);
1066 if (peer)
1067 inet_putpeer(peer);
1068}
1069
1070/*
1071 * Peer allocation may fail only in serious out-of-memory conditions. However
1072 * we still can generate some output.
1073 * Random ID selection looks a bit dangerous because we have no chances to
1074 * select ID being unique in a reasonable period of time.
1075 * But broken packet identifier may be better than no packet at all.
1076 */
1077static void ip_select_fb_ident(struct iphdr *iph)
1078{
1079 static DEFINE_SPINLOCK(ip_fb_id_lock);
1080 static u32 ip_fallback_id;
1081 u32 salt;
1082
1083 spin_lock_bh(&ip_fb_id_lock);
e448515c 1084 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1da177e4
LT
1085 iph->id = htons(salt & 0xFFFF);
1086 ip_fallback_id = salt;
1087 spin_unlock_bh(&ip_fb_id_lock);
1088}
1089
1090void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1091{
1092 struct rtable *rt = (struct rtable *) dst;
1093
1094 if (rt) {
1095 if (rt->peer == NULL)
1096 rt_bind_peer(rt, 1);
1097
1098 /* If peer is attached to destination, it is never detached,
1099 so that we need not to grab a lock to dereference it.
1100 */
1101 if (rt->peer) {
1102 iph->id = htons(inet_getid(rt->peer, more));
1103 return;
1104 }
1105 } else
e905a9ed 1106 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
9c2b3328 1107 __builtin_return_address(0));
1da177e4
LT
1108
1109 ip_select_fb_ident(iph);
1110}
1111
1112static void rt_del(unsigned hash, struct rtable *rt)
1113{
1114 struct rtable **rthp;
1115
22c047cc 1116 spin_lock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1117 ip_rt_put(rt);
1118 for (rthp = &rt_hash_table[hash].chain; *rthp;
093c2ca4 1119 rthp = &(*rthp)->u.dst.rt_next)
1da177e4 1120 if (*rthp == rt) {
093c2ca4 1121 *rthp = rt->u.dst.rt_next;
1da177e4
LT
1122 rt_free(rt);
1123 break;
1124 }
22c047cc 1125 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1126}
1127
f7655229
AV
/*
 * Handle a redirect that advises new_gw in place of old_gw for the path
 * saddr -> daddr, seen on dev.
 *
 * After new_gw has been validated, the route cache is probed with the
 * source keys { saddr, 0 } and the interface keys { dev->ifindex, 0 }.  Each
 * matching entry that currently goes through old_gw on dev is replaced by a
 * copy that points at new_gw and is flagged RTCF_REDIRECTED.  A redirect that
 * fails validation is ignored; with CONFIG_IP_ROUTE_VERBOSE it is also logged
 * when martian logging is enabled on the device.
 */
1128void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1129 __be32 saddr, struct net_device *dev)
1da177e4
LT
1130{
1131 int i, k;
1132 struct in_device *in_dev = in_dev_get(dev);
1133 struct rtable *rth, **rthp;
f7655229 1134 __be32 skeys[2] = { saddr, 0 };
1da177e4 1135 int ikeys[2] = { dev->ifindex, 0 };
8d71740c 1136 struct netevent_redirect netevent;
1da177e4 1137
1da177e4
LT
1138 if (!in_dev)
1139 return;
1140
/* Refuse no-op redirects, devices that do not accept redirects, and
 * gateways that are multicast, bad-class or zero-net addresses. */
1141 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1142 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1143 goto reject_redirect;
1144
/* Extra validation of new_gw; which checks apply depends on whether the
 * device is marked as shared media. */
1145 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1146 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1147 goto reject_redirect;
1148 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1149 goto reject_redirect;
1150 } else {
1151 if (inet_addr_type(new_gw) != RTN_UNICAST)
1152 goto reject_redirect;
1153 }
1154
/* Try every combination of source key (i) and interface key (k). */
1155 for (i = 0; i < 2; i++) {
1156 for (k = 0; k < 2; k++) {
8c7bc840 1157 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1da177e4
LT
1158
1159 rthp=&rt_hash_table[hash].chain;
1160
1161 rcu_read_lock();
1162 while ((rth = rcu_dereference(*rthp)) != NULL) {
1163 struct rtable *rt;
1164
/* Flow key does not match: move on to the next chain entry. */
1165 if (rth->fl.fl4_dst != daddr ||
1166 rth->fl.fl4_src != skeys[i] ||
1da177e4
LT
1167 rth->fl.oif != ikeys[k] ||
1168 rth->fl.iif != 0) {
093c2ca4 1169 rthp = &rth->u.dst.rt_next;
1da177e4
LT
1170 continue;
1171 }
1172
/* Key matches but the entry is not the route being redirected (other
 * gateway/device, or an error route): stop scanning this chain. */
1173 if (rth->rt_dst != daddr ||
1174 rth->rt_src != saddr ||
1175 rth->u.dst.error ||
1176 rth->rt_gateway != old_gw ||
1177 rth->u.dst.dev != dev)
1178 break;
1179
/* Take a reference on the old entry before leaving the RCU section. */
1180 dst_hold(&rth->u.dst);
1181 rcu_read_unlock();
1182
1183 rt = dst_alloc(&ipv4_dst_ops);
1184 if (rt == NULL) {
1185 ip_rt_put(rth);
1186 in_dev_put(in_dev);
1187 return;
1188 }
1189
1190 /* Copy all the information. */
1191 *rt = *rth;
/* The structure copy duplicated pointers without taking references and
 * copied per-entry state; reinitialise that state and take the references
 * the new entry needs. */
e905a9ed 1192 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1da177e4
LT
1193 rt->u.dst.__use = 1;
1194 atomic_set(&rt->u.dst.__refcnt, 1);
1195 rt->u.dst.child = NULL;
1196 if (rt->u.dst.dev)
1197 dev_hold(rt->u.dst.dev);
1198 if (rt->idev)
1199 in_dev_hold(rt->idev);
1200 rt->u.dst.obsolete = 0;
1201 rt->u.dst.lastuse = jiffies;
1202 rt->u.dst.path = &rt->u.dst;
1203 rt->u.dst.neighbour = NULL;
1204 rt->u.dst.hh = NULL;
1205 rt->u.dst.xfrm = NULL;
1206
1207 rt->rt_flags |= RTCF_REDIRECTED;
1208
1209 /* Gateway is different ... */
1210 rt->rt_gateway = new_gw;
1211
1212 /* Redirect received -> path was valid */
1213 dst_confirm(&rth->u.dst);
1214
1215 if (rt->peer)
1216 atomic_inc(&rt->peer->refcnt);
1217
/* Give up on this entry when no neighbour can be bound for the new gateway
 * or the bound neighbour is not in a NUD_VALID state; in the latter case
 * neigh_event_send() is called on it first. */
1218 if (arp_bind_neighbour(&rt->u.dst) ||
1219 !(rt->u.dst.neighbour->nud_state &
1220 NUD_VALID)) {
1221 if (rt->u.dst.neighbour)
1222 neigh_event_send(rt->u.dst.neighbour, NULL);
1223 ip_rt_put(rth);
1224 rt_drop(rt);
1225 goto do_next;
1226 }
e905a9ed 1227
8d71740c
TT
1228 netevent.old = &rth->u.dst;
1229 netevent.new = &rt->u.dst;
e905a9ed
YH
1230 call_netevent_notifiers(NETEVENT_REDIRECT,
1231 &netevent);
1da177e4
LT
1232
/* Swap the entries: rt_del() drops the reference taken above and unlinks
 * the old route, then the new one is inserted into the same chain. */
1233 rt_del(hash, rth);
1234 if (!rt_intern_hash(hash, rt, &rt))
1235 ip_rt_put(rt);
1236 goto do_next;
1237 }
1238 rcu_read_unlock();
1239 do_next:
1240 ;
1241 }
1242 }
1243 in_dev_put(in_dev);
1244 return;
1245
1246reject_redirect:
1247#ifdef CONFIG_IP_ROUTE_VERBOSE
1248 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1249 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1250 "%u.%u.%u.%u ignored.\n"
cef2685e 1251 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1da177e4 1252 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
cef2685e 1253 NIPQUAD(saddr), NIPQUAD(daddr));
1da177e4
LT
1254#endif
1255 in_dev_put(in_dev);
1256}
1257
1258static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1259{
1260 struct rtable *rt = (struct rtable*)dst;
1261 struct dst_entry *ret = dst;
1262
1263 if (rt) {
1264 if (dst->obsolete) {
1265 ip_rt_put(rt);
1266 ret = NULL;
1267 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1268 rt->u.dst.expires) {
8c7bc840
AV
1269 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1270 rt->fl.oif);
1da177e4
LT
1271#if RT_CACHE_DEBUG >= 1
1272 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1273 "%u.%u.%u.%u/%02x dropped\n",
1274 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1275#endif
1276 rt_del(hash, rt);
1277 ret = NULL;
1278 }
1279 }
1280 return ret;
1281}
1282
1283/*
1284 * Algorithm:
1285 * 1. The first ip_rt_redirect_number redirects are sent
1286 * with exponential backoff, then we stop sending them at all,
1287 * assuming that the host ignores our redirects.
1288 * 2. If we did not see packets requiring redirects
1289 * during ip_rt_redirect_silence, we assume that the host
1290 * forgot redirected route and start to send redirects again.
1291 *
1292 * This algorithm is much cheaper and more intelligent than dumb load limiting
1293 * in icmp.c.
1294 *
1295 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1296 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1297 */
1298
1299void ip_rt_send_redirect(struct sk_buff *skb)
1300{
1301 struct rtable *rt = (struct rtable*)skb->dst;
1302 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1303
1304 if (!in_dev)
1305 return;
1306
1307 if (!IN_DEV_TX_REDIRECTS(in_dev))
1308 goto out;
1309
1310 /* No redirected packets during ip_rt_redirect_silence;
1311 * reset the algorithm.
1312 */
1313 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1314 rt->u.dst.rate_tokens = 0;
1315
1316 /* Too many ignored redirects; do not send anything
1317 * set u.dst.rate_last to the last seen redirected packet.
1318 */
1319 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1320 rt->u.dst.rate_last = jiffies;
1321 goto out;
1322 }
1323
1324 /* Check for load limit; set rate_last to the latest sent
1325 * redirect.
1326 */
14fb8a76
LY
1327 if (rt->u.dst.rate_tokens == 0 ||
1328 time_after(jiffies,
1da177e4
LT
1329 (rt->u.dst.rate_last +
1330 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1331 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1332 rt->u.dst.rate_last = jiffies;
1333 ++rt->u.dst.rate_tokens;
1334#ifdef CONFIG_IP_ROUTE_VERBOSE
1335 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1336 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1337 net_ratelimit())
1338 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1339 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1340 NIPQUAD(rt->rt_src), rt->rt_iif,
1341 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1342#endif
1343 }
1344out:
e905a9ed 1345 in_dev_put(in_dev);
1da177e4
LT
1346}
1347
1348static int ip_error(struct sk_buff *skb)
1349{
1350 struct rtable *rt = (struct rtable*)skb->dst;
1351 unsigned long now;
1352 int code;
1353
1354 switch (rt->u.dst.error) {
1355 case EINVAL:
1356 default:
1357 goto out;
1358 case EHOSTUNREACH:
1359 code = ICMP_HOST_UNREACH;
1360 break;
1361 case ENETUNREACH:
1362 code = ICMP_NET_UNREACH;
1363 break;
1364 case EACCES:
1365 code = ICMP_PKT_FILTERED;
1366 break;
1367 }
1368
1369 now = jiffies;
1370 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1371 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1372 rt->u.dst.rate_tokens = ip_rt_error_burst;
1373 rt->u.dst.rate_last = now;
1374 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1375 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1376 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1377 }
1378
1379out: kfree_skb(skb);
1380 return 0;
e905a9ed 1381}
1da177e4
LT
1382
/*
 * MTU plateau table, in descending order.
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau strictly below old_mtu, or 68 when old_mtu is
 * not above any plateau in the table.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	for (; plateau != end; plateau++)
		if (*plateau < old_mtu)
			return *plateau;
	return 68;
}
1400
/*
 * Lower the cached path MTU for the flow described by iph.
 *
 * iph:     IP header whose saddr/daddr select the cached routes and whose
 *          tot_len is taken as the old MTU.
 * new_mtu: the MTU being suggested for the path.
 *
 * Cached routes keyed by daddr and by either iph->saddr or a zero source
 * are scanned.  For each match whose MTU metric is not locked, the metric is
 * lowered to the new value (or to a guessed plateau when new_mtu is
 * implausible) and an expiry is set.
 *
 * Returns 0 when PMTU discovery is disabled, otherwise the last MTU estimate
 * accepted for a matching route, or new_mtu if there was none.
 */
1401unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1402{
1403 int i;
1404 unsigned short old_mtu = ntohs(iph->tot_len);
1405 struct rtable *rth;
e448515c
AV
1406 __be32 skeys[2] = { iph->saddr, 0, };
1407 __be32 daddr = iph->daddr;
1da177e4
LT
1408 unsigned short est_mtu = 0;
1409
1410 if (ipv4_config.no_pmtu_disc)
1411 return 0;
1412
/* Pass 0 uses the packet's source address as key, pass 1 uses 0. */
1413 for (i = 0; i < 2; i++) {
8c7bc840 1414 unsigned hash = rt_hash(daddr, skeys[i], 0);
1da177e4
LT
1415
1416 rcu_read_lock();
1417 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
093c2ca4 1418 rth = rcu_dereference(rth->u.dst.rt_next)) {
1da177e4
LT
1419 if (rth->fl.fl4_dst == daddr &&
1420 rth->fl.fl4_src == skeys[i] &&
1421 rth->rt_dst == daddr &&
1422 rth->rt_src == iph->saddr &&
1da177e4
LT
1423 rth->fl.iif == 0 &&
1424 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1425 unsigned short mtu = new_mtu;
1426
/* A suggested MTU below 68 or not below the old packet size cannot be
 * used as is: fall back to the next lower plateau. */
1427 if (new_mtu < 68 || new_mtu >= old_mtu) {
1428
1429 /* BSD 4.2 compatibility hack :-( */
1430 if (mtu == 0 &&
1431 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1432 old_mtu >= 68 + (iph->ihl << 2))
1433 old_mtu -= iph->ihl << 2;
1434
1435 mtu = guess_mtu(old_mtu);
1436 }
/* Only ever lower the cached MTU; an equal value is reported through
 * est_mtu without touching the route. */
1437 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
e905a9ed 1438 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1da177e4
LT
1439 dst_confirm(&rth->u.dst);
/* Clamp to ip_rt_min_pmtu and lock the metric when clamping. */
1440 if (mtu < ip_rt_min_pmtu) {
1441 mtu = ip_rt_min_pmtu;
1442 rth->u.dst.metrics[RTAX_LOCK-1] |=
1443 (1 << RTAX_MTU);
1444 }
1445 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1446 dst_set_expires(&rth->u.dst,
1447 ip_rt_mtu_expires);
1448 }
1449 est_mtu = mtu;
1450 }
1451 }
1452 }
1453 rcu_read_unlock();
1454 }
1455 return est_mtu ? : new_mtu;
1456}
1457
1458static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1459{
1460 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1461 !(dst_metric_locked(dst, RTAX_MTU))) {
1462 if (mtu < ip_rt_min_pmtu) {
1463 mtu = ip_rt_min_pmtu;
1464 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1465 }
1466 dst->metrics[RTAX_MTU-1] = mtu;
1467 dst_set_expires(dst, ip_rt_mtu_expires);
8d71740c 1468 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1da177e4
LT
1469 }
1470}
1471
1472static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1473{
1474 return NULL;
1475}
1476
1477static void ipv4_dst_destroy(struct dst_entry *dst)
1478{
1479 struct rtable *rt = (struct rtable *) dst;
1480 struct inet_peer *peer = rt->peer;
1481 struct in_device *idev = rt->idev;
1482
1483 if (peer) {
1484 rt->peer = NULL;
1485 inet_putpeer(peer);
1486 }
1487
1488 if (idev) {
1489 rt->idev = NULL;
1490 in_dev_put(idev);
1491 }
1492}
1493
1494static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1495 int how)
1496{
1497 struct rtable *rt = (struct rtable *) dst;
1498 struct in_device *idev = rt->idev;
1499 if (dev != &loopback_dev && idev && idev->dev == dev) {
1500 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1501 if (loopback_idev) {
1502 rt->idev = loopback_idev;
1503 in_dev_put(idev);
1504 }
1505 }
1506}
1507
1508static void ipv4_link_failure(struct sk_buff *skb)
1509{
1510 struct rtable *rt;
1511
1512 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1513
1514 rt = (struct rtable *) skb->dst;
1515 if (rt)
1516 dst_set_expires(&rt->u.dst, 0);
1517}
1518
1519static int ip_rt_bug(struct sk_buff *skb)
1520{
1521 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
eddc9ec5 1522 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1da177e4
LT
1523 skb->dev ? skb->dev->name : "?");
1524 kfree_skb(skb);
1525 return 0;
1526}
1527
1528/*
1529 We do not cache source address of outgoing interface,
1530 because it is used only by IP RR, TS and SRR options,
1531 so that it out of fast path.
1532
1533 BTW remember: "addr" is allowed to be not aligned
1534 in IP options!
1535 */
1536
1537void ip_rt_get_source(u8 *addr, struct rtable *rt)
1538{
a61ced5d 1539 __be32 src;
1da177e4
LT
1540 struct fib_result res;
1541
1542 if (rt->fl.iif == 0)
1543 src = rt->rt_src;
1544 else if (fib_lookup(&rt->fl, &res) == 0) {
1545 src = FIB_RES_PREFSRC(res);
1546 fib_res_put(&res);
1547 } else
1548 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1549 RT_SCOPE_UNIVERSE);
1550 memcpy(addr, &src, 4);
1551}
1552
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge tag into the route's tclassid one 16-bit half at a time: a half is
 * taken from tag only if that half of tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	static const u32 halves[2] = { 0xFFFF, 0xFFFF0000 };
	int idx;

	for (idx = 0; idx < 2; idx++) {
		if ((rt->u.dst.tclassid & halves[idx]) == 0)
			rt->u.dst.tclassid |= tag & halves[idx];
	}
}
#endif
1562
1563static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1564{
1565 struct fib_info *fi = res->fi;
1566
1567 if (fi) {
1568 if (FIB_RES_GW(*res) &&
1569 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1570 rt->rt_gateway = FIB_RES_GW(*res);
1571 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1572 sizeof(rt->u.dst.metrics));
1573 if (fi->fib_mtu == 0) {
1574 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1575 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1576 rt->rt_gateway != rt->rt_dst &&
1577 rt->u.dst.dev->mtu > 576)
1578 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1579 }
1580#ifdef CONFIG_NET_CLS_ROUTE
1581 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1582#endif
1583 } else
1584 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1585
1586 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1587 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1588 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1589 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1590 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1591 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1592 ip_rt_min_advmss);
1593 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1594 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1595
1596#ifdef CONFIG_NET_CLS_ROUTE
1597#ifdef CONFIG_IP_MULTIPLE_TABLES
1598 set_class_tag(rt, fib_rules_tclass(res));
1599#endif
1600 set_class_tag(rt, itag);
1601#endif
e905a9ed 1602 rt->rt_type = res->type;
1da177e4
LT
1603}
1604
/*
 * Create and cache an input route for a multicast packet received on dev.
 *
 * our: non-zero makes the route deliver locally (ip_local_deliver and
 *      RTCF_LOCAL).
 *
 * Returns -EINVAL when the device has no in_device or the source address /
 * protocol fails the sanity checks, -ENOBUFS when no route entry can be
 * allocated, otherwise the result of rt_intern_hash(), which stores the
 * route in skb->dst.
 */
9e12bb22 1605static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1606 u8 tos, struct net_device *dev, int our)
1607{
1608 unsigned hash;
1609 struct rtable *rth;
a61ced5d 1610 __be32 spec_dst;
1da177e4
LT
1611 struct in_device *in_dev = in_dev_get(dev);
1612 u32 itag = 0;
1613
1614 /* Primary sanity checks. */
1615
1616 if (in_dev == NULL)
1617 return -EINVAL;
1618
1619 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1620 skb->protocol != htons(ETH_P_IP))
1621 goto e_inval;
1622
/* A zero-net source is accepted only for local multicast groups; any other
 * source has to pass fib_validate_source(). */
1623 if (ZERONET(saddr)) {
1624 if (!LOCAL_MCAST(daddr))
1625 goto e_inval;
1626 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1627 } else if (fib_validate_source(saddr, 0, tos, 0,
1628 dev, &spec_dst, &itag) < 0)
1629 goto e_inval;
1630
1631 rth = dst_alloc(&ipv4_dst_ops);
1632 if (!rth)
1633 goto e_nobufs;
1634
/* The output hook of this route is ip_rt_bug(). */
1635 rth->u.dst.output= ip_rt_bug;
1636
1637 atomic_set(&rth->u.dst.__refcnt, 1);
1638 rth->u.dst.flags= DST_HOST;
1639 if (in_dev->cnf.no_policy)
1640 rth->u.dst.flags |= DST_NOPOLICY;
1641 rth->fl.fl4_dst = daddr;
1642 rth->rt_dst = daddr;
1643 rth->fl.fl4_tos = tos;
47dcf0cb 1644 rth->fl.mark = skb->mark;
1da177e4
LT
1645 rth->fl.fl4_src = saddr;
1646 rth->rt_src = saddr;
1647#ifdef CONFIG_NET_CLS_ROUTE
1648 rth->u.dst.tclassid = itag;
1649#endif
1650 rth->rt_iif =
1651 rth->fl.iif = dev->ifindex;
/* The route's device is the loopback device; a device reference and an
 * in_device reference are taken for the route. */
1652 rth->u.dst.dev = &loopback_dev;
1653 dev_hold(rth->u.dst.dev);
1654 rth->idev = in_dev_get(rth->u.dst.dev);
1655 rth->fl.oif = 0;
1656 rth->rt_gateway = daddr;
1657 rth->rt_spec_dst= spec_dst;
1658 rth->rt_type = RTN_MULTICAST;
1659 rth->rt_flags = RTCF_MULTICAST;
1660 if (our) {
1661 rth->u.dst.input= ip_local_deliver;
1662 rth->rt_flags |= RTCF_LOCAL;
1663 }
1664
/* With multicast forwarding enabled on the device, non-local groups are
 * handed to ip_mr_input instead. */
1665#ifdef CONFIG_IP_MROUTE
1666 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1667 rth->u.dst.input = ip_mr_input;
1668#endif
1669 RT_CACHE_STAT_INC(in_slow_mc);
1670
1671 in_dev_put(in_dev);
8c7bc840 1672 hash = rt_hash(daddr, saddr, dev->ifindex);
1da177e4
LT
1673 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1674
1675e_nobufs:
1676 in_dev_put(in_dev);
1677 return -ENOBUFS;
1678
1679e_inval:
1680 in_dev_put(in_dev);
1681 return -EINVAL;
1682}
1683
1684
1685static void ip_handle_martian_source(struct net_device *dev,
1686 struct in_device *in_dev,
1687 struct sk_buff *skb,
9e12bb22
AV
1688 __be32 daddr,
1689 __be32 saddr)
1da177e4
LT
1690{
1691 RT_CACHE_STAT_INC(in_martian_src);
1692#ifdef CONFIG_IP_ROUTE_VERBOSE
1693 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1694 /*
1695 * RFC1812 recommendation, if source is martian,
1696 * the only hint is MAC header.
1697 */
1698 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1699 "%u.%u.%u.%u, on dev %s\n",
1700 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
98e399f8 1701 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1da177e4 1702 int i;
98e399f8 1703 const unsigned char *p = skb_mac_header(skb);
1da177e4
LT
1704 printk(KERN_WARNING "ll header: ");
1705 for (i = 0; i < dev->hard_header_len; i++, p++) {
1706 printk("%02x", *p);
1707 if (i < (dev->hard_header_len - 1))
1708 printk(":");
1709 }
1710 printk("\n");
1711 }
1712 }
1713#endif
1714}
1715
e905a9ed
YH
1716static inline int __mkroute_input(struct sk_buff *skb,
1717 struct fib_result* res,
1718 struct in_device *in_dev,
9e12bb22 1719 __be32 daddr, __be32 saddr, u32 tos,
e905a9ed 1720 struct rtable **result)
1da177e4
LT
1721{
1722
1723 struct rtable *rth;
1724 int err;
1725 struct in_device *out_dev;
1726 unsigned flags = 0;
d9c9df8c
AV
1727 __be32 spec_dst;
1728 u32 itag;
1da177e4
LT
1729
1730 /* get a working reference to the output device */
1731 out_dev = in_dev_get(FIB_RES_DEV(*res));
1732 if (out_dev == NULL) {
1733 if (net_ratelimit())
1734 printk(KERN_CRIT "Bug in ip_route_input" \
1735 "_slow(). Please, report\n");
1736 return -EINVAL;
1737 }
1738
1739
e905a9ed 1740 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1da177e4
LT
1741 in_dev->dev, &spec_dst, &itag);
1742 if (err < 0) {
e905a9ed 1743 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1da177e4 1744 saddr);
e905a9ed 1745
1da177e4
LT
1746 err = -EINVAL;
1747 goto cleanup;
1748 }
1749
1750 if (err)
1751 flags |= RTCF_DIRECTSRC;
1752
1753 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1754 (IN_DEV_SHARED_MEDIA(out_dev) ||
1755 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1756 flags |= RTCF_DOREDIRECT;
1757
1758 if (skb->protocol != htons(ETH_P_IP)) {
1759 /* Not IP (i.e. ARP). Do not create route, if it is
1760 * invalid for proxy arp. DNAT routes are always valid.
1761 */
1762 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1763 err = -EINVAL;
1764 goto cleanup;
1765 }
1766 }
1767
1768
1769 rth = dst_alloc(&ipv4_dst_ops);
1770 if (!rth) {
1771 err = -ENOBUFS;
1772 goto cleanup;
1773 }
1774
ce723d8e 1775 atomic_set(&rth->u.dst.__refcnt, 1);
1da177e4
LT
1776 rth->u.dst.flags= DST_HOST;
1777#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1778 if (res->fi->fib_nhs > 1)
1779 rth->u.dst.flags |= DST_BALANCED;
1780#endif
1781 if (in_dev->cnf.no_policy)
1782 rth->u.dst.flags |= DST_NOPOLICY;
1b6651f1 1783 if (out_dev->cnf.no_xfrm)
1da177e4
LT
1784 rth->u.dst.flags |= DST_NOXFRM;
1785 rth->fl.fl4_dst = daddr;
1786 rth->rt_dst = daddr;
1787 rth->fl.fl4_tos = tos;
47dcf0cb 1788 rth->fl.mark = skb->mark;
1da177e4
LT
1789 rth->fl.fl4_src = saddr;
1790 rth->rt_src = saddr;
1791 rth->rt_gateway = daddr;
1792 rth->rt_iif =
1793 rth->fl.iif = in_dev->dev->ifindex;
1794 rth->u.dst.dev = (out_dev)->dev;
1795 dev_hold(rth->u.dst.dev);
1796 rth->idev = in_dev_get(rth->u.dst.dev);
1797 rth->fl.oif = 0;
1798 rth->rt_spec_dst= spec_dst;
1799
1800 rth->u.dst.input = ip_forward;
1801 rth->u.dst.output = ip_output;
1802
1803 rt_set_nexthop(rth, res, itag);
1804
1805 rth->rt_flags = flags;
1806
1807 *result = rth;
1808 err = 0;
1809 cleanup:
1810 /* release the working reference to the output device */
1811 in_dev_put(out_dev);
1812 return err;
e905a9ed 1813}
1da177e4 1814
e905a9ed
YH
1815static inline int ip_mkroute_input_def(struct sk_buff *skb,
1816 struct fib_result* res,
1da177e4
LT
1817 const struct flowi *fl,
1818 struct in_device *in_dev,
9e12bb22 1819 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1820{
7abaa27c 1821 struct rtable* rth = NULL;
1da177e4
LT
1822 int err;
1823 unsigned hash;
1824
1825#ifdef CONFIG_IP_ROUTE_MULTIPATH
1826 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1827 fib_select_multipath(fl, res);
1828#endif
1829
1830 /* create a routing cache entry */
1831 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1832 if (err)
1833 return err;
1da177e4
LT
1834
1835 /* put it into the cache */
8c7bc840 1836 hash = rt_hash(daddr, saddr, fl->iif);
e905a9ed 1837 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1da177e4
LT
1838}
1839
e905a9ed
YH
/*
 * Create the cache entry (or entries) for a forwarded input packet.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED this simply calls
 * ip_mkroute_input_def().  With it, a FIB result that has two or more next
 * hops gets one cache entry per next hop; skb->dst is set to the entry
 * returned by the last rt_intern_hash() call.
 *
 * Returns 0 on success or the first error from __mkroute_input() /
 * rt_intern_hash().
 */
1840static inline int ip_mkroute_input(struct sk_buff *skb,
1841 struct fib_result* res,
1da177e4
LT
1842 const struct flowi *fl,
1843 struct in_device *in_dev,
9e12bb22 1844 __be32 daddr, __be32 saddr, u32 tos)
1da177e4
LT
1845{
1846#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
ce723d8e
JA
1847 struct rtable* rth = NULL, *rtres;
1848 unsigned char hop, hopcount;
1da177e4
LT
1849 int err = -EINVAL;
1850 unsigned int hash;
1851
/* A result without fib_info counts as a single hop. */
1852 if (res->fi)
1853 hopcount = res->fi->fib_nhs;
1854 else
1855 hopcount = 1;
1856
1da177e4
LT
1857 /* distinguish between multipath and singlepath */
1858 if (hopcount < 2)
1859 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1860 saddr, tos);
e905a9ed 1861
1da177e4
LT
1862 /* add all alternatives to the routing cache */
1863 for (hop = 0; hop < hopcount; hop++) {
1864 res->nh_sel = hop;
1865
/* rtres is first written by rt_intern_hash() below, so it is only
 * released from the second iteration on. */
ce723d8e
JA
1866 /* put reference to previous result */
1867 if (hop)
1868 ip_rt_put(rtres);
1869
1da177e4
LT
1870 /* create a routing cache entry */
1871 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1872 &rth);
1873 if (err)
1874 return err;
1875
1876 /* put it into the cache */
8c7bc840 1877 hash = rt_hash(daddr, saddr, fl->iif);
ce723d8e 1878 err = rt_intern_hash(hash, rth, &rtres);
1da177e4
LT
1879 if (err)
1880 return err;
1881
1882 /* forward hop information to multipath impl. */
1883 multipath_set_nhinfo(rth,
1884 FIB_RES_NETWORK(*res),
1885 FIB_RES_NETMASK(*res),
1886 res->prefixlen,
1887 &FIB_RES_NH(*res));
1da177e4 1888 }
/* The reference obtained for the last hop is kept and handed to the skb. */
ce723d8e 1889 skb->dst = &rtres->u.dst;
1da177e4
LT
1890 return err;
1891#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1892 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1893#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1894}
1895
1896
1897/*
1898 * NOTE. We drop all the packets that have local source
1899 * addresses, because every properly looped back packet
1900 * must have correct destination already attached by output routine.
1901 *
1902 * Such approach solves two big problems:
1903 * 1. Not simplex devices are handled properly.
1904 * 2. IP spoofing attempts are filtered with 100% of guarantee.
1905 */
1906
/*
 * Slow path of input routing, used when the route cache has no entry.
 *
 * The packet's addresses are classified (martian source/destination,
 * limited broadcast), the FIB is consulted, and then one of these happens:
 *   - broadcast and local destinations get a cache entry that delivers
 *     locally (labels brd_input / local_input);
 *   - a failed lookup on a forwarding device gets a cache entry whose input
 *     handler is ip_error (label no_route);
 *   - unicast destinations on a forwarding device go to ip_mkroute_input();
 *   - everything else returns an error without creating a cache entry.
 *
 * Returns 0 or the result of rt_intern_hash()/ip_mkroute_input() on the
 * caching paths, otherwise -EINVAL, -EHOSTUNREACH or -ENOBUFS.
 */
9e12bb22 1907static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1908 u8 tos, struct net_device *dev)
1909{
1910 struct fib_result res;
1911 struct in_device *in_dev = in_dev_get(dev);
1912 struct flowi fl = { .nl_u = { .ip4_u =
1913 { .daddr = daddr,
1914 .saddr = saddr,
1915 .tos = tos,
1916 .scope = RT_SCOPE_UNIVERSE,
1da177e4 1917 } },
47dcf0cb 1918 .mark = skb->mark,
1da177e4
LT
1919 .iif = dev->ifindex };
1920 unsigned flags = 0;
1921 u32 itag = 0;
1922 struct rtable * rth;
1923 unsigned hash;
9e12bb22 1924 __be32 spec_dst;
1da177e4
LT
1925 int err = -EINVAL;
/* Set once fib_lookup() has succeeded, so that 'done' knows to release res. */
1926 int free_res = 0;
1927
1928 /* IP on this device is disabled. */
1929
1930 if (!in_dev)
1931 goto out;
1932
1933 /* Check for the most weird martians, which can be not detected
1934 by fib_lookup.
1935 */
1936
1937 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1938 goto martian_source;
1939
/* Limited broadcast, or both addresses zero: treat as broadcast input. */
e448515c 1940 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1da177e4
LT
1941 goto brd_input;
1942
1943 /* Accept zero addresses only to limited broadcast;
1944 * I even do not know to fix it or not. Waiting for complains :-)
1945 */
1946 if (ZERONET(saddr))
1947 goto martian_source;
1948
1949 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1950 goto martian_destination;
1951
1952 /*
1953 * Now we are ready to route packet.
1954 */
1955 if ((err = fib_lookup(&fl, &res)) != 0) {
1956 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 1957 goto e_hostunreach;
1da177e4
LT
1958 goto no_route;
1959 }
1960 free_res = 1;
1961
1962 RT_CACHE_STAT_INC(in_slow_tot);
1963
1964 if (res.type == RTN_BROADCAST)
1965 goto brd_input;
1966
/* Local destination: the source is validated with the loopback ifindex as
 * the output interface; spec_dst becomes the destination address itself. */
1967 if (res.type == RTN_LOCAL) {
1968 int result;
1969 result = fib_validate_source(saddr, daddr, tos,
1970 loopback_dev.ifindex,
1971 dev, &spec_dst, &itag);
1972 if (result < 0)
1973 goto martian_source;
1974 if (result)
1975 flags |= RTCF_DIRECTSRC;
1976 spec_dst = daddr;
1977 goto local_input;
1978 }
1979
/* From here on the packet would have to be forwarded. */
1980 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 1981 goto e_hostunreach;
1da177e4
LT
1982 if (res.type != RTN_UNICAST)
1983 goto martian_destination;
1984
1985 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1986 if (err == -ENOBUFS)
1987 goto e_nobufs;
1988 if (err == -EINVAL)
1989 goto e_inval;
e905a9ed 1990
/* Common exit: release the in_device and, if held, the FIB result. */
1da177e4
LT
1991done:
1992 in_dev_put(in_dev);
1993 if (free_res)
1994 fib_res_put(&res);
1995out: return err;
1996
1997brd_input:
1998 if (skb->protocol != htons(ETH_P_IP))
1999 goto e_inval;
2000
2001 if (ZERONET(saddr))
2002 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2003 else {
2004 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2005 &itag);
2006 if (err < 0)
2007 goto martian_source;
2008 if (err)
2009 flags |= RTCF_DIRECTSRC;
2010 }
2011 flags |= RTCF_BROADCAST;
2012 res.type = RTN_BROADCAST;
2013 RT_CACHE_STAT_INC(in_brd);
2014
/* Build a cache entry bound to the loopback device.  Reached for local and
 * broadcast destinations and, via no_route, for unreachable ones. */
2015local_input:
2016 rth = dst_alloc(&ipv4_dst_ops);
2017 if (!rth)
2018 goto e_nobufs;
2019
2020 rth->u.dst.output= ip_rt_bug;
2021
2022 atomic_set(&rth->u.dst.__refcnt, 1);
2023 rth->u.dst.flags= DST_HOST;
2024 if (in_dev->cnf.no_policy)
2025 rth->u.dst.flags |= DST_NOPOLICY;
2026 rth->fl.fl4_dst = daddr;
2027 rth->rt_dst = daddr;
2028 rth->fl.fl4_tos = tos;
47dcf0cb 2029 rth->fl.mark = skb->mark;
1da177e4
LT
2030 rth->fl.fl4_src = saddr;
2031 rth->rt_src = saddr;
2032#ifdef CONFIG_NET_CLS_ROUTE
2033 rth->u.dst.tclassid = itag;
2034#endif
2035 rth->rt_iif =
2036 rth->fl.iif = dev->ifindex;
2037 rth->u.dst.dev = &loopback_dev;
2038 dev_hold(rth->u.dst.dev);
2039 rth->idev = in_dev_get(rth->u.dst.dev);
2040 rth->rt_gateway = daddr;
2041 rth->rt_spec_dst= spec_dst;
2042 rth->u.dst.input= ip_local_deliver;
2043 rth->rt_flags = flags|RTCF_LOCAL;
/* Unreachable destination: the entry reports the (negated) lookup error
 * through ip_error instead of delivering locally. */
2044 if (res.type == RTN_UNREACHABLE) {
2045 rth->u.dst.input= ip_error;
2046 rth->u.dst.error= -err;
2047 rth->rt_flags &= ~RTCF_LOCAL;
2048 }
2049 rth->rt_type = res.type;
8c7bc840 2050 hash = rt_hash(daddr, saddr, fl.iif);
1da177e4
LT
2051 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2052 goto done;
2053
/* fib_lookup() failed on a forwarding device; err still holds its result. */
2054no_route:
2055 RT_CACHE_STAT_INC(in_no_route);
2056 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2057 res.type = RTN_UNREACHABLE;
2058 goto local_input;
2059
2060 /*
2061 * Do not cache martian addresses: they should be logged (RFC1812)
2062 */
2063martian_destination:
2064 RT_CACHE_STAT_INC(in_martian_dst);
2065#ifdef CONFIG_IP_ROUTE_VERBOSE
2066 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2067 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2068 "%u.%u.%u.%u, dev %s\n",
2069 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2070#endif
2c2910a4
DE
2071
/* martian_destination falls through to here. */
2072e_hostunreach:
e905a9ed
YH
2073 err = -EHOSTUNREACH;
2074 goto done;
2c2910a4 2075
1da177e4
LT
2076e_inval:
2077 err = -EINVAL;
2078 goto done;
2079
2080e_nobufs:
2081 err = -ENOBUFS;
2082 goto done;
2083
2084martian_source:
2085 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2086 goto e_inval;
2087}
2088
9e12bb22 2089int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
2090 u8 tos, struct net_device *dev)
2091{
2092 struct rtable * rth;
2093 unsigned hash;
2094 int iif = dev->ifindex;
2095
2096 tos &= IPTOS_RT_MASK;
8c7bc840 2097 hash = rt_hash(daddr, saddr, iif);
1da177e4
LT
2098
2099 rcu_read_lock();
2100 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
093c2ca4 2101 rth = rcu_dereference(rth->u.dst.rt_next)) {
1da177e4
LT
2102 if (rth->fl.fl4_dst == daddr &&
2103 rth->fl.fl4_src == saddr &&
2104 rth->fl.iif == iif &&
2105 rth->fl.oif == 0 &&
47dcf0cb 2106 rth->fl.mark == skb->mark &&
1da177e4
LT
2107 rth->fl.fl4_tos == tos) {
2108 rth->u.dst.lastuse = jiffies;
2109 dst_hold(&rth->u.dst);
2110 rth->u.dst.__use++;
2111 RT_CACHE_STAT_INC(in_hit);
2112 rcu_read_unlock();
2113 skb->dst = (struct dst_entry*)rth;
2114 return 0;
2115 }
2116 RT_CACHE_STAT_INC(in_hlist_search);
2117 }
2118 rcu_read_unlock();
2119
2120 /* Multicast recognition logic is moved from route cache to here.
2121 The problem was that too many Ethernet cards have broken/missing
2122 hardware multicast filters :-( As result the host on multicasting
2123 network acquires a lot of useless route cache entries, sort of
2124 SDR messages from all the world. Now we try to get rid of them.
2125 Really, provided software IP multicast filter is organized
2126 reasonably (at least, hashed), it does not result in a slowdown
2127 comparing with route cache reject entries.
2128 Note, that multicast routers are not affected, because
2129 route cache entry is created eventually.
2130 */
2131 if (MULTICAST(daddr)) {
2132 struct in_device *in_dev;
2133
2134 rcu_read_lock();
e5ed6399 2135 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
1da177e4 2136 int our = ip_check_mc(in_dev, daddr, saddr,
eddc9ec5 2137 ip_hdr(skb)->protocol);
1da177e4
LT
2138 if (our
2139#ifdef CONFIG_IP_MROUTE
2140 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2141#endif
2142 ) {
2143 rcu_read_unlock();
2144 return ip_route_input_mc(skb, daddr, saddr,
2145 tos, dev, our);
2146 }
2147 }
2148 rcu_read_unlock();
2149 return -EINVAL;
2150 }
2151 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2152}
2153
2154static inline int __mkroute_output(struct rtable **result,
e905a9ed 2155 struct fib_result* res,
1da177e4 2156 const struct flowi *fl,
e905a9ed
YH
2157 const struct flowi *oldflp,
2158 struct net_device *dev_out,
2159 unsigned flags)
1da177e4
LT
2160{
2161 struct rtable *rth;
2162 struct in_device *in_dev;
2163 u32 tos = RT_FL_TOS(oldflp);
2164 int err = 0;
2165
2166 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2167 return -EINVAL;
2168
e448515c 2169 if (fl->fl4_dst == htonl(0xFFFFFFFF))
1da177e4
LT
2170 res->type = RTN_BROADCAST;
2171 else if (MULTICAST(fl->fl4_dst))
2172 res->type = RTN_MULTICAST;
2173 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2174 return -EINVAL;
2175
2176 if (dev_out->flags & IFF_LOOPBACK)
2177 flags |= RTCF_LOCAL;
2178
2179 /* get work reference to inet device */
2180 in_dev = in_dev_get(dev_out);
2181 if (!in_dev)
2182 return -EINVAL;
2183
2184 if (res->type == RTN_BROADCAST) {
2185 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2186 if (res->fi) {
2187 fib_info_put(res->fi);
2188 res->fi = NULL;
2189 }
2190 } else if (res->type == RTN_MULTICAST) {
2191 flags |= RTCF_MULTICAST|RTCF_LOCAL;
e905a9ed 2192 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
1da177e4
LT
2193 oldflp->proto))
2194 flags &= ~RTCF_LOCAL;
2195 /* If multicast route do not exist use
2196 default one, but do not gateway in this case.
2197 Yes, it is hack.
2198 */
2199 if (res->fi && res->prefixlen < 4) {
2200 fib_info_put(res->fi);
2201 res->fi = NULL;
2202 }
2203 }
2204
2205
2206 rth = dst_alloc(&ipv4_dst_ops);
2207 if (!rth) {
2208 err = -ENOBUFS;
2209 goto cleanup;
e905a9ed 2210 }
1da177e4 2211
ce723d8e 2212 atomic_set(&rth->u.dst.__refcnt, 1);
1da177e4
LT
2213 rth->u.dst.flags= DST_HOST;
2214#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2215 if (res->fi) {
2216 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2217 if (res->fi->fib_nhs > 1)
2218 rth->u.dst.flags |= DST_BALANCED;
2219 }
2220#endif
2221 if (in_dev->cnf.no_xfrm)
2222 rth->u.dst.flags |= DST_NOXFRM;
2223 if (in_dev->cnf.no_policy)
2224 rth->u.dst.flags |= DST_NOPOLICY;
2225
2226 rth->fl.fl4_dst = oldflp->fl4_dst;
2227 rth->fl.fl4_tos = tos;
2228 rth->fl.fl4_src = oldflp->fl4_src;
2229 rth->fl.oif = oldflp->oif;
47dcf0cb 2230 rth->fl.mark = oldflp->mark;
1da177e4
LT
2231 rth->rt_dst = fl->fl4_dst;
2232 rth->rt_src = fl->fl4_src;
2233 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
e905a9ed 2234 /* get references to the devices that are to be hold by the routing
1da177e4
LT
2235 cache entry */
2236 rth->u.dst.dev = dev_out;
2237 dev_hold(dev_out);
2238 rth->idev = in_dev_get(dev_out);
2239 rth->rt_gateway = fl->fl4_dst;
2240 rth->rt_spec_dst= fl->fl4_src;
2241
2242 rth->u.dst.output=ip_output;
2243
2244 RT_CACHE_STAT_INC(out_slow_tot);
2245
2246 if (flags & RTCF_LOCAL) {
2247 rth->u.dst.input = ip_local_deliver;
2248 rth->rt_spec_dst = fl->fl4_dst;
2249 }
2250 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2251 rth->rt_spec_dst = fl->fl4_src;
e905a9ed 2252 if (flags & RTCF_LOCAL &&
1da177e4
LT
2253 !(dev_out->flags & IFF_LOOPBACK)) {
2254 rth->u.dst.output = ip_mc_output;
2255 RT_CACHE_STAT_INC(out_slow_mc);
2256 }
2257#ifdef CONFIG_IP_MROUTE
2258 if (res->type == RTN_MULTICAST) {
2259 if (IN_DEV_MFORWARD(in_dev) &&
2260 !LOCAL_MCAST(oldflp->fl4_dst)) {
2261 rth->u.dst.input = ip_mr_input;
2262 rth->u.dst.output = ip_mc_output;
2263 }
2264 }
2265#endif
2266 }
2267
2268 rt_set_nexthop(rth, res, 0);
2269
2270 rth->rt_flags = flags;
2271
2272 *result = rth;
2273 cleanup:
2274 /* release work reference to inet device */
2275 in_dev_put(in_dev);
2276
2277 return err;
2278}
2279
2280static inline int ip_mkroute_output_def(struct rtable **rp,
2281 struct fib_result* res,
2282 const struct flowi *fl,
2283 const struct flowi *oldflp,
2284 struct net_device *dev_out,
2285 unsigned flags)
2286{
7abaa27c 2287 struct rtable *rth = NULL;
1da177e4
LT
2288 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2289 unsigned hash;
2290 if (err == 0) {
8c7bc840 2291 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
1da177e4
LT
2292 err = rt_intern_hash(hash, rth, rp);
2293 }
e905a9ed 2294
1da177e4
LT
2295 return err;
2296}
2297
/*
 * Create the output route cache entry (or entries) for a resolved flow.
 *
 * With CONFIG_IP_ROUTE_MULTIPATH_CACHED and a FIB result that has more than
 * one next hop, one cache entry is created and hashed per next hop, and the
 * hop information is forwarded to the multipath implementation.  In every
 * other case the work is delegated to ip_mkroute_output_def().
 *
 * @rp:      receives the cache entry produced by rt_intern_hash()
 * @res:     FIB result; res->nh_sel is rewritten while iterating next hops
 * @fl:      resolved flow
 * @oldflp:  flow as originally requested (cache key)
 * @dev_out: output device (used on the single path branch only)
 * @flags:   initial RTCF_* flags
 *
 * Returns 0 on success or the first error met.
 */
static inline int ip_mkroute_output(struct rtable** rp,
				    struct fib_result* res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			/* put reference to previous result */
			if (hop)
				ip_rt_put(*rp);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);

			if (err != 0)
				goto cleanup;

			hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
				       oldflp->oif);
			err = rt_intern_hash(hash, rth, rp);

			/*
			 * Do not hand rth to the multipath code when it could
			 * not be interned.
			 * NOTE(review): rt_intern_hash() is defined outside
			 * this function; it presumably drops the entry on
			 * failure, in which case touching rth here would be a
			 * use after free - confirm against its error paths.
			 */
			if (err != 0)
				goto cleanup;

			/* forward hop information to multipath impl. */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
2359
2360/*
2361 * Major route resolver routine.
2362 */
2363
2364static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2365{
2366 u32 tos = RT_FL_TOS(oldflp);
2367 struct flowi fl = { .nl_u = { .ip4_u =
2368 { .daddr = oldflp->fl4_dst,
2369 .saddr = oldflp->fl4_src,
2370 .tos = tos & IPTOS_RT_MASK,
2371 .scope = ((tos & RTO_ONLINK) ?
2372 RT_SCOPE_LINK :
2373 RT_SCOPE_UNIVERSE),
1da177e4 2374 } },
47dcf0cb 2375 .mark = oldflp->mark,
1da177e4
LT
2376 .iif = loopback_dev.ifindex,
2377 .oif = oldflp->oif };
2378 struct fib_result res;
2379 unsigned flags = 0;
2380 struct net_device *dev_out = NULL;
2381 int free_res = 0;
2382 int err;
2383
2384
2385 res.fi = NULL;
2386#ifdef CONFIG_IP_MULTIPLE_TABLES
2387 res.r = NULL;
2388#endif
2389
2390 if (oldflp->fl4_src) {
2391 err = -EINVAL;
2392 if (MULTICAST(oldflp->fl4_src) ||
2393 BADCLASS(oldflp->fl4_src) ||
2394 ZERONET(oldflp->fl4_src))
2395 goto out;
2396
2397 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2398 dev_out = ip_dev_find(oldflp->fl4_src);
f6c5d736 2399 if (dev_out == NULL)
1da177e4
LT
2400 goto out;
2401
2402 /* I removed check for oif == dev_out->oif here.
2403 It was wrong for two reasons:
2404 1. ip_dev_find(saddr) can return wrong iface, if saddr is
2405 assigned to multiple interfaces.
2406 2. Moreover, we are allowed to send packets with saddr
2407 of another iface. --ANK
2408 */
2409
f6c5d736 2410 if (oldflp->oif == 0
e448515c 2411 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
1da177e4
LT
2412 /* Special hack: user can direct multicasts
2413 and limited broadcast via necessary interface
2414 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2415 This hack is not just for fun, it allows
2416 vic,vat and friends to work.
2417 They bind socket to loopback, set ttl to zero
2418 and expect that it will work.
2419 From the viewpoint of routing cache they are broken,
2420 because we are not allowed to build multicast path
2421 with loopback source addr (look, routing cache
2422 cannot know, that ttl is zero, so that packet
2423 will not leave this host and route is valid).
2424 Luckily, this hack is good workaround.
2425 */
2426
2427 fl.oif = dev_out->ifindex;
2428 goto make_route;
2429 }
2430 if (dev_out)
2431 dev_put(dev_out);
2432 dev_out = NULL;
2433 }
2434
2435
2436 if (oldflp->oif) {
2437 dev_out = dev_get_by_index(oldflp->oif);
2438 err = -ENODEV;
2439 if (dev_out == NULL)
2440 goto out;
e5ed6399
HX
2441
2442 /* RACE: Check return value of inet_select_addr instead. */
2443 if (__in_dev_get_rtnl(dev_out) == NULL) {
1da177e4
LT
2444 dev_put(dev_out);
2445 goto out; /* Wrong error code */
2446 }
2447
e448515c 2448 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
1da177e4
LT
2449 if (!fl.fl4_src)
2450 fl.fl4_src = inet_select_addr(dev_out, 0,
2451 RT_SCOPE_LINK);
2452 goto make_route;
2453 }
2454 if (!fl.fl4_src) {
2455 if (MULTICAST(oldflp->fl4_dst))
2456 fl.fl4_src = inet_select_addr(dev_out, 0,
2457 fl.fl4_scope);
2458 else if (!oldflp->fl4_dst)
2459 fl.fl4_src = inet_select_addr(dev_out, 0,
2460 RT_SCOPE_HOST);
2461 }
2462 }
2463
2464 if (!fl.fl4_dst) {
2465 fl.fl4_dst = fl.fl4_src;
2466 if (!fl.fl4_dst)
2467 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2468 if (dev_out)
2469 dev_put(dev_out);
2470 dev_out = &loopback_dev;
2471 dev_hold(dev_out);
2472 fl.oif = loopback_dev.ifindex;
2473 res.type = RTN_LOCAL;
2474 flags |= RTCF_LOCAL;
2475 goto make_route;
2476 }
2477
2478 if (fib_lookup(&fl, &res)) {
2479 res.fi = NULL;
2480 if (oldflp->oif) {
2481 /* Apparently, routing tables are wrong. Assume,
2482 that the destination is on link.
2483
2484 WHY? DW.
2485 Because we are allowed to send to iface
2486 even if it has NO routes and NO assigned
2487 addresses. When oif is specified, routing
2488 tables are looked up with only one purpose:
2489 to catch if destination is gatewayed, rather than
2490 direct. Moreover, if MSG_DONTROUTE is set,
2491 we send packet, ignoring both routing tables
2492 and ifaddr state. --ANK
2493
2494
2495 We could make it even if oif is unknown,
2496 likely IPv6, but we do not.
2497 */
2498
2499 if (fl.fl4_src == 0)
2500 fl.fl4_src = inet_select_addr(dev_out, 0,
2501 RT_SCOPE_LINK);
2502 res.type = RTN_UNICAST;
2503 goto make_route;
2504 }
2505 if (dev_out)
2506 dev_put(dev_out);
2507 err = -ENETUNREACH;
2508 goto out;
2509 }
2510 free_res = 1;
2511
2512 if (res.type == RTN_LOCAL) {
2513 if (!fl.fl4_src)
2514 fl.fl4_src = fl.fl4_dst;
2515 if (dev_out)
2516 dev_put(dev_out);
2517 dev_out = &loopback_dev;
2518 dev_hold(dev_out);
2519 fl.oif = dev_out->ifindex;
2520 if (res.fi)
2521 fib_info_put(res.fi);
2522 res.fi = NULL;
2523 flags |= RTCF_LOCAL;
2524 goto make_route;
2525 }
2526
2527#ifdef CONFIG_IP_ROUTE_MULTIPATH
2528 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2529 fib_select_multipath(&fl, &res);
2530 else
2531#endif
2532 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2533 fib_select_default(&fl, &res);
2534
2535 if (!fl.fl4_src)
2536 fl.fl4_src = FIB_RES_PREFSRC(res);
2537
2538 if (dev_out)
2539 dev_put(dev_out);
2540 dev_out = FIB_RES_DEV(res);
2541 dev_hold(dev_out);
2542 fl.oif = dev_out->ifindex;
2543
2544
2545make_route:
2546 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2547
2548
2549 if (free_res)
2550 fib_res_put(&res);
2551 if (dev_out)
2552 dev_put(dev_out);
2553out: return err;
2554}
2555
2556int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2557{
2558 unsigned hash;
2559 struct rtable *rth;
2560
8c7bc840 2561 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
1da177e4
LT
2562
2563 rcu_read_lock_bh();
2564 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
093c2ca4 2565 rth = rcu_dereference(rth->u.dst.rt_next)) {
1da177e4
LT
2566 if (rth->fl.fl4_dst == flp->fl4_dst &&
2567 rth->fl.fl4_src == flp->fl4_src &&
2568 rth->fl.iif == 0 &&
2569 rth->fl.oif == flp->oif &&
47dcf0cb 2570 rth->fl.mark == flp->mark &&
1da177e4
LT
2571 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2572 (IPTOS_RT_MASK | RTO_ONLINK))) {
2573
2574 /* check for multipath routes and choose one if
2575 * necessary
2576 */
2577 if (multipath_select_route(flp, rth, rp)) {
2578 dst_hold(&(*rp)->u.dst);
2579 RT_CACHE_STAT_INC(out_hit);
2580 rcu_read_unlock_bh();
2581 return 0;
2582 }
2583
2584 rth->u.dst.lastuse = jiffies;
2585 dst_hold(&rth->u.dst);
2586 rth->u.dst.__use++;
2587 RT_CACHE_STAT_INC(out_hit);
2588 rcu_read_unlock_bh();
2589 *rp = rth;
2590 return 0;
2591 }
2592 RT_CACHE_STAT_INC(out_hlist_search);
2593 }
2594 rcu_read_unlock_bh();
2595
2596 return ip_route_output_slow(rp, flp);
2597}
2598
d8c97a94
ACM
2599EXPORT_SYMBOL_GPL(__ip_route_output_key);
2600
14e50e57
DM
2601static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2602{
2603}
2604
2605static struct dst_ops ipv4_dst_blackhole_ops = {
2606 .family = AF_INET,
2607 .protocol = __constant_htons(ETH_P_IP),
2608 .destroy = ipv4_dst_destroy,
2609 .check = ipv4_dst_check,
2610 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2611 .entry_size = sizeof(struct rtable),
2612};
2613
2614
/*
 * Input/output handler of blackhole routes: the packet is freed and
 * success (0) is reported.
 */
static int ipv4_blackhole_output(struct sk_buff *skb)
{
	kfree_skb(skb);

	return 0;
}
2620
2621static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2622{
2623 struct rtable *ort = *rp;
2624 struct rtable *rt = (struct rtable *)
2625 dst_alloc(&ipv4_dst_blackhole_ops);
2626
2627 if (rt) {
2628 struct dst_entry *new = &rt->u.dst;
2629
2630 atomic_set(&new->__refcnt, 1);
2631 new->__use = 1;
2632 new->input = ipv4_blackhole_output;
2633 new->output = ipv4_blackhole_output;
2634 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2635
2636 new->dev = ort->u.dst.dev;
2637 if (new->dev)
2638 dev_hold(new->dev);
2639
2640 rt->fl = ort->fl;
2641
2642 rt->idev = ort->idev;
2643 if (rt->idev)
2644 in_dev_hold(rt->idev);
2645 rt->rt_flags = ort->rt_flags;
2646 rt->rt_type = ort->rt_type;
2647 rt->rt_dst = ort->rt_dst;
2648 rt->rt_src = ort->rt_src;
2649 rt->rt_iif = ort->rt_iif;
2650 rt->rt_gateway = ort->rt_gateway;
2651 rt->rt_spec_dst = ort->rt_spec_dst;
2652 rt->peer = ort->peer;
2653 if (rt->peer)
2654 atomic_inc(&rt->peer->refcnt);
2655
2656 dst_free(new);
2657 }
2658
2659 dst_release(&(*rp)->u.dst);
2660 *rp = rt;
2661 return (rt ? 0 : -ENOMEM);
2662}
2663
1da177e4
LT
2664int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2665{
2666 int err;
2667
2668 if ((err = __ip_route_output_key(rp, flp)) != 0)
2669 return err;
2670
2671 if (flp->proto) {
2672 if (!flp->fl4_src)
2673 flp->fl4_src = (*rp)->rt_src;
2674 if (!flp->fl4_dst)
2675 flp->fl4_dst = (*rp)->rt_dst;
14e50e57
DM
2676 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2677 if (err == -EREMOTE)
2678 err = ipv4_dst_blackhole(rp, flp, sk);
2679
2680 return err;
1da177e4
LT
2681 }
2682
2683 return 0;
2684}
2685
d8c97a94
ACM
2686EXPORT_SYMBOL_GPL(ip_route_output_flow);
2687
1da177e4
LT
2688int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2689{
2690 return ip_route_output_flow(rp, flp, NULL, 0);
2691}
2692
2693static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
b6544c0b 2694 int nowait, unsigned int flags)
1da177e4
LT
2695{
2696 struct rtable *rt = (struct rtable*)skb->dst;
2697 struct rtmsg *r;
be403ea1 2698 struct nlmsghdr *nlh;
e3703b3d
TG
2699 long expires;
2700 u32 id = 0, ts = 0, tsage = 0, error;
be403ea1
TG
2701
2702 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2703 if (nlh == NULL)
26932566 2704 return -EMSGSIZE;
be403ea1
TG
2705
2706 r = nlmsg_data(nlh);
1da177e4
LT
2707 r->rtm_family = AF_INET;
2708 r->rtm_dst_len = 32;
2709 r->rtm_src_len = 0;
2710 r->rtm_tos = rt->fl.fl4_tos;
2711 r->rtm_table = RT_TABLE_MAIN;
be403ea1 2712 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
1da177e4
LT
2713 r->rtm_type = rt->rt_type;
2714 r->rtm_scope = RT_SCOPE_UNIVERSE;
2715 r->rtm_protocol = RTPROT_UNSPEC;
2716 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2717 if (rt->rt_flags & RTCF_NOTIFY)
2718 r->rtm_flags |= RTM_F_NOTIFY;
be403ea1 2719
17fb2c64 2720 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
be403ea1 2721
1da177e4
LT
2722 if (rt->fl.fl4_src) {
2723 r->rtm_src_len = 32;
17fb2c64 2724 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
1da177e4
LT
2725 }
2726 if (rt->u.dst.dev)
be403ea1 2727 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
1da177e4
LT
2728#ifdef CONFIG_NET_CLS_ROUTE
2729 if (rt->u.dst.tclassid)
be403ea1 2730 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
1da177e4
LT
2731#endif
2732#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
be403ea1
TG
2733 if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
2734 NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
1da177e4
LT
2735#endif
2736 if (rt->fl.iif)
17fb2c64 2737 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
1da177e4 2738 else if (rt->rt_src != rt->fl.fl4_src)
17fb2c64 2739 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
be403ea1 2740
1da177e4 2741 if (rt->rt_dst != rt->rt_gateway)
17fb2c64 2742 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
be403ea1 2743
1da177e4 2744 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
be403ea1
TG
2745 goto nla_put_failure;
2746
e3703b3d
TG
2747 error = rt->u.dst.error;
2748 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
1da177e4 2749 if (rt->peer) {
e3703b3d 2750 id = rt->peer->ip_id_count;
1da177e4 2751 if (rt->peer->tcp_ts_stamp) {
e3703b3d 2752 ts = rt->peer->tcp_ts;
9d729f72 2753 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
1da177e4
LT
2754 }
2755 }
be403ea1 2756
1da177e4
LT
2757 if (rt->fl.iif) {
2758#ifdef CONFIG_IP_MROUTE
e448515c 2759 __be32 dst = rt->rt_dst;
1da177e4
LT
2760
2761 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2762 ipv4_devconf.mc_forwarding) {
2763 int err = ipmr_get_route(skb, r, nowait);
2764 if (err <= 0) {
2765 if (!nowait) {
2766 if (err == 0)
2767 return 0;
be403ea1 2768 goto nla_put_failure;
1da177e4
LT
2769 } else {
2770 if (err == -EMSGSIZE)
be403ea1 2771 goto nla_put_failure;
e3703b3d 2772 error = err;
1da177e4
LT
2773 }
2774 }
2775 } else
2776#endif
be403ea1 2777 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
1da177e4
LT
2778 }
2779
e3703b3d
TG
2780 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2781 expires, error) < 0)
2782 goto nla_put_failure;
be403ea1
TG
2783
2784 return nlmsg_end(skb, nlh);
1da177e4 2785
be403ea1 2786nla_put_failure:
26932566
PM
2787 nlmsg_cancel(skb, nlh);
2788 return -EMSGSIZE;
1da177e4
LT
2789}
2790
63f3444f 2791static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1da177e4 2792{
d889ce3b
TG
2793 struct rtmsg *rtm;
2794 struct nlattr *tb[RTA_MAX+1];
1da177e4 2795 struct rtable *rt = NULL;
9e12bb22
AV
2796 __be32 dst = 0;
2797 __be32 src = 0;
2798 u32 iif;
d889ce3b 2799 int err;
1da177e4
LT
2800 struct sk_buff *skb;
2801
d889ce3b
TG
2802 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2803 if (err < 0)
2804 goto errout;
2805
2806 rtm = nlmsg_data(nlh);
2807
1da177e4 2808 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
d889ce3b
TG
2809 if (skb == NULL) {
2810 err = -ENOBUFS;
2811 goto errout;
2812 }
1da177e4
LT
2813
2814 /* Reserve room for dummy headers, this skb can pass
2815 through good chunk of routing engine.
2816 */
459a98ed 2817 skb_reset_mac_header(skb);
c1d2bbe1 2818 skb_reset_network_header(skb);
d2c962b8
SH
2819
2820 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
eddc9ec5 2821 ip_hdr(skb)->protocol = IPPROTO_ICMP;
1da177e4
LT
2822 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2823
17fb2c64
AV
2824 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2825 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
d889ce3b 2826 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
1da177e4
LT
2827
2828 if (iif) {
d889ce3b
TG
2829 struct net_device *dev;
2830
2831 dev = __dev_get_by_index(iif);
2832 if (dev == NULL) {
2833 err = -ENODEV;
2834 goto errout_free;
2835 }
2836
1da177e4
LT
2837 skb->protocol = htons(ETH_P_IP);
2838 skb->dev = dev;
2839 local_bh_disable();
2840 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2841 local_bh_enable();
d889ce3b
TG
2842
2843 rt = (struct rtable*) skb->dst;
2844 if (err == 0 && rt->u.dst.error)
1da177e4
LT
2845 err = -rt->u.dst.error;
2846 } else {
d889ce3b
TG
2847 struct flowi fl = {
2848 .nl_u = {
2849 .ip4_u = {
2850 .daddr = dst,
2851 .saddr = src,
2852 .tos = rtm->rtm_tos,
2853 },
2854 },
2855 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2856 };
1da177e4
LT
2857 err = ip_route_output_key(&rt, &fl);
2858 }
d889ce3b 2859
1da177e4 2860 if (err)
d889ce3b 2861 goto errout_free;
1da177e4
LT
2862
2863 skb->dst = &rt->u.dst;
2864 if (rtm->rtm_flags & RTM_F_NOTIFY)
2865 rt->rt_flags |= RTCF_NOTIFY;
2866
1da177e4 2867 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
b6544c0b 2868 RTM_NEWROUTE, 0, 0);
d889ce3b
TG
2869 if (err <= 0)
2870 goto errout_free;
1da177e4 2871
2942e900 2872 err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
d889ce3b 2873errout:
2942e900 2874 return err;
1da177e4 2875
d889ce3b 2876errout_free:
1da177e4 2877 kfree_skb(skb);
d889ce3b 2878 goto errout;
1da177e4
LT
2879}
2880
2881int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2882{
2883 struct rtable *rt;
2884 int h, s_h;
2885 int idx, s_idx;
2886
2887 s_h = cb->args[0];
2888 s_idx = idx = cb->args[1];
2889 for (h = 0; h <= rt_hash_mask; h++) {
2890 if (h < s_h) continue;
2891 if (h > s_h)
2892 s_idx = 0;
2893 rcu_read_lock_bh();
2894 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
093c2ca4 2895 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
1da177e4
LT
2896 if (idx < s_idx)
2897 continue;
2898 skb->dst = dst_clone(&rt->u.dst);
2899 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
e905a9ed 2900 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
b6544c0b 2901 1, NLM_F_MULTI) <= 0) {
1da177e4
LT
2902 dst_release(xchg(&skb->dst, NULL));
2903 rcu_read_unlock_bh();
2904 goto done;
2905 }
2906 dst_release(xchg(&skb->dst, NULL));
2907 }
2908 rcu_read_unlock_bh();
2909 }
2910
2911done:
2912 cb->args[0] = h;
2913 cb->args[1] = idx;
2914 return skb->len;
2915}
2916
/*
 * Multicast event notification: flushes the route cache with a delay
 * argument of 0.  @in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	const int delay = 0;

	rt_cache_flush(delay);
}
2921
2922#ifdef CONFIG_SYSCTL
2923static int flush_delay;
2924
2925static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2926 struct file *filp, void __user *buffer,
2927 size_t *lenp, loff_t *ppos)
2928{
2929 if (write) {
2930 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2931 rt_cache_flush(flush_delay);
2932 return 0;
e905a9ed 2933 }
1da177e4
LT
2934
2935 return -EINVAL;
2936}
2937
2938static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2939 int __user *name,
2940 int nlen,
2941 void __user *oldval,
2942 size_t __user *oldlenp,
2943 void __user *newval,
1f29bcd7 2944 size_t newlen)
1da177e4
LT
2945{
2946 int delay;
2947 if (newlen != sizeof(int))
2948 return -EINVAL;
2949 if (get_user(delay, (int __user *)newval))
e905a9ed
YH
2950 return -EFAULT;
2951 rt_cache_flush(delay);
1da177e4
LT
2952 return 0;
2953}
2954
2955ctl_table ipv4_route_table[] = {
e905a9ed 2956 {
1da177e4
LT
2957 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2958 .procname = "flush",
2959 .data = &flush_delay,
2960 .maxlen = sizeof(int),
7e3e0360 2961 .mode = 0200,
1da177e4
LT
2962 .proc_handler = &ipv4_sysctl_rtcache_flush,
2963 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2964 },
2965 {
2966 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2967 .procname = "min_delay",
2968 .data = &ip_rt_min_delay,
2969 .maxlen = sizeof(int),
2970 .mode = 0644,
2971 .proc_handler = &proc_dointvec_jiffies,
2972 .strategy = &sysctl_jiffies,
2973 },
2974 {
2975 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2976 .procname = "max_delay",
2977 .data = &ip_rt_max_delay,
2978 .maxlen = sizeof(int),
2979 .mode = 0644,
2980 .proc_handler = &proc_dointvec_jiffies,
2981 .strategy = &sysctl_jiffies,
2982 },
2983 {
2984 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2985 .procname = "gc_thresh",
2986 .data = &ipv4_dst_ops.gc_thresh,
2987 .maxlen = sizeof(int),
2988 .mode = 0644,
2989 .proc_handler = &proc_dointvec,
2990 },
2991 {
2992 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2993 .procname = "max_size",
2994 .data = &ip_rt_max_size,
2995 .maxlen = sizeof(int),
2996 .mode = 0644,
2997 .proc_handler = &proc_dointvec,
2998 },
2999 {
3000 /* Deprecated. Use gc_min_interval_ms */
e905a9ed 3001
1da177e4
LT
3002 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3003 .procname = "gc_min_interval",
3004 .data = &ip_rt_gc_min_interval,
3005 .maxlen = sizeof(int),
3006 .mode = 0644,
3007 .proc_handler = &proc_dointvec_jiffies,
3008 .strategy = &sysctl_jiffies,
3009 },
3010 {
3011 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3012 .procname = "gc_min_interval_ms",
3013 .data = &ip_rt_gc_min_interval,
3014 .maxlen = sizeof(int),
3015 .mode = 0644,
3016 .proc_handler = &proc_dointvec_ms_jiffies,
3017 .strategy = &sysctl_ms_jiffies,
3018 },
3019 {
3020 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
3021 .procname = "gc_timeout",
3022 .data = &ip_rt_gc_timeout,
3023 .maxlen = sizeof(int),
3024 .mode = 0644,
3025 .proc_handler = &proc_dointvec_jiffies,
3026 .strategy = &sysctl_jiffies,
3027 },
3028 {
3029 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
3030 .procname = "gc_interval",
3031 .data = &ip_rt_gc_interval,
3032 .maxlen = sizeof(int),
3033 .mode = 0644,
3034 .proc_handler = &proc_dointvec_jiffies,
3035 .strategy = &sysctl_jiffies,
3036 },
3037 {
3038 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
3039 .procname = "redirect_load",
3040 .data = &ip_rt_redirect_load,
3041 .maxlen = sizeof(int),
3042 .mode = 0644,
3043 .proc_handler = &proc_dointvec,
3044 },
3045 {
3046 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
3047 .procname = "redirect_number",
3048 .data = &ip_rt_redirect_number,
3049 .maxlen = sizeof(int),
3050 .mode = 0644,
3051 .proc_handler = &proc_dointvec,
3052 },
3053 {
3054 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
3055 .procname = "redirect_silence",
3056 .data = &ip_rt_redirect_silence,
3057 .maxlen = sizeof(int),
3058 .mode = 0644,
3059 .proc_handler = &proc_dointvec,
3060 },
3061 {
3062 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
3063 .procname = "error_cost",
3064 .data = &ip_rt_error_cost,
3065 .maxlen = sizeof(int),
3066 .mode = 0644,
3067 .proc_handler = &proc_dointvec,
3068 },
3069 {
3070 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
3071 .procname = "error_burst",
3072 .data = &ip_rt_error_burst,
3073 .maxlen = sizeof(int),
3074 .mode = 0644,
3075 .proc_handler = &proc_dointvec,
3076 },
3077 {
3078 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3079 .procname = "gc_elasticity",
3080 .data = &ip_rt_gc_elasticity,
3081 .maxlen = sizeof(int),
3082 .mode = 0644,
3083 .proc_handler = &proc_dointvec,
3084 },
3085 {
3086 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3087 .procname = "mtu_expires",
3088 .data = &ip_rt_mtu_expires,
3089 .maxlen = sizeof(int),
3090 .mode = 0644,
3091 .proc_handler = &proc_dointvec_jiffies,
3092 .strategy = &sysctl_jiffies,
3093 },
3094 {
3095 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3096 .procname = "min_pmtu",
3097 .data = &ip_rt_min_pmtu,
3098 .maxlen = sizeof(int),
3099 .mode = 0644,
3100 .proc_handler = &proc_dointvec,
3101 },
3102 {
3103 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3104 .procname = "min_adv_mss",
3105 .data = &ip_rt_min_advmss,
3106 .maxlen = sizeof(int),
3107 .mode = 0644,
3108 .proc_handler = &proc_dointvec,
3109 },
3110 {
3111 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3112 .procname = "secret_interval",
3113 .data = &ip_rt_secret_interval,
3114 .maxlen = sizeof(int),
3115 .mode = 0644,
3116 .proc_handler = &proc_dointvec_jiffies,
3117 .strategy = &sysctl_jiffies,
3118 },
3119 { .ctl_name = 0 }
3120};
3121#endif
3122
3123#ifdef CONFIG_NET_CLS_ROUTE
3124struct ip_rt_acct *ip_rt_acct;
3125
3126/* This code sucks. But you should have seen it before! --RR */
3127
3128/* IP route accounting ptr for this logical cpu number. */
/* Argument is parenthesized so that expressions such as "a + b" index correctly. */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3130
3131#ifdef CONFIG_PROC_FS
3132static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3133 int length, int *eof, void *data)
3134{
3135 unsigned int i;
3136
3137 if ((offset & 3) || (length & 3))
3138 return -EIO;
3139
3140 if (offset >= sizeof(struct ip_rt_acct) * 256) {
3141 *eof = 1;
3142 return 0;
3143 }
3144
3145 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3146 length = sizeof(struct ip_rt_acct) * 256 - offset;
3147 *eof = 1;
3148 }
3149
3150 offset /= sizeof(u32);
3151
3152 if (length > 0) {
3153 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3154 u32 *dst = (u32 *) buffer;
3155
3156 /* Copy first cpu. */
3157 *start = buffer;
3158 memcpy(dst, src, length);
3159
3160 /* Add the other cpus in, one int at a time */
6f912042 3161 for_each_possible_cpu(i) {
1da177e4
LT
3162 unsigned int j;
3163
3164 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3165
3166 for (j = 0; j < length/4; j++)
3167 dst[j] += src[j];
3168 }
3169 }
3170 return length;
3171}
3172#endif /* CONFIG_PROC_FS */
3173#endif /* CONFIG_NET_CLS_ROUTE */
3174
3175static __initdata unsigned long rhash_entries;
3176static int __init set_rhash_entries(char *str)
3177{
3178 if (!str)
3179 return 0;
3180 rhash_entries = simple_strtoul(str, &str, 0);
3181 return 1;
3182}
3183__setup("rhash_entries=", set_rhash_entries);
3184
3185int __init ip_rt_init(void)
3186{
424c4b70 3187 int rc = 0;
1da177e4
LT
3188
3189 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3190 (jiffies ^ (jiffies >> 7)));
3191
3192#ifdef CONFIG_NET_CLS_ROUTE
424c4b70
ED
3193 {
3194 int order;
1da177e4
LT
3195 for (order = 0;
3196 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3197 /* NOTHING */;
3198 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3199 if (!ip_rt_acct)
3200 panic("IP: failed to allocate ip_rt_acct\n");
3201 memset(ip_rt_acct, 0, PAGE_SIZE << order);
424c4b70 3202 }
1da177e4
LT
3203#endif
3204
e5d679f3
AD
3205 ipv4_dst_ops.kmem_cachep =
3206 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3207 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1da177e4 3208
14e50e57
DM
3209 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3210
424c4b70
ED
3211 rt_hash_table = (struct rt_hash_bucket *)
3212 alloc_large_system_hash("IP route cache",
3213 sizeof(struct rt_hash_bucket),
3214 rhash_entries,
3215 (num_physpages >= 128 * 1024) ?
18955cfc 3216 15 : 17,
8d1502de 3217 0,
424c4b70
ED
3218 &rt_hash_log,
3219 &rt_hash_mask,
3220 0);
22c047cc
ED
3221 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3222 rt_hash_lock_init();
1da177e4
LT
3223
3224 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3225 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3226
1da177e4
LT
3227 devinet_init();
3228 ip_fib_init();
3229
3230 init_timer(&rt_flush_timer);
3231 rt_flush_timer.function = rt_run_flush;
3232 init_timer(&rt_periodic_timer);
3233 rt_periodic_timer.function = rt_check_expire;
3234 init_timer(&rt_secret_timer);
3235 rt_secret_timer.function = rt_secret_rebuild;
3236
3237 /* All the timers, started at system startup tend
3238 to synchronize. Perturb it a bit.
3239 */
3240 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3241 ip_rt_gc_interval;
3242 add_timer(&rt_periodic_timer);
3243
3244 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3245 ip_rt_secret_interval;
3246 add_timer(&rt_secret_timer);
3247
3248#ifdef CONFIG_PROC_FS
3249 {
3250 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3251 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
e905a9ed
YH
3252 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3253 proc_net_stat))) {
1da177e4
LT
3254 return -ENOMEM;
3255 }
3256 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3257 }
3258#ifdef CONFIG_NET_CLS_ROUTE
3259 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3260#endif
3261#endif
3262#ifdef CONFIG_XFRM
3263 xfrm_init();
3264 xfrm4_init();
3265#endif
63f3444f
TG
3266 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3267
1da177e4
LT
3268 return rc;
3269}
3270
3271EXPORT_SYMBOL(__ip_select_ident);
3272EXPORT_SYMBOL(ip_route_input);
3273EXPORT_SYMBOL(ip_route_output_key);