]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/ipv4/route.c
[NET_SCHED] sch_prio: class statistics printing enabled
[net-next-2.6.git] / net / ipv4 / route.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9 *
02c30a84 10 * Authors: Ross Biro
1da177e4
LT
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 *
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
41 *
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
58 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
60 *
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
65 */
66
1da177e4
LT
67#include <linux/module.h>
68#include <asm/uaccess.h>
69#include <asm/system.h>
70#include <linux/bitops.h>
71#include <linux/types.h>
72#include <linux/kernel.h>
73#include <linux/sched.h>
74#include <linux/mm.h>
424c4b70 75#include <linux/bootmem.h>
1da177e4
LT
76#include <linux/string.h>
77#include <linux/socket.h>
78#include <linux/sockios.h>
79#include <linux/errno.h>
80#include <linux/in.h>
81#include <linux/inet.h>
82#include <linux/netdevice.h>
83#include <linux/proc_fs.h>
84#include <linux/init.h>
85#include <linux/skbuff.h>
86#include <linux/rtnetlink.h>
87#include <linux/inetdevice.h>
88#include <linux/igmp.h>
89#include <linux/pkt_sched.h>
90#include <linux/mroute.h>
91#include <linux/netfilter_ipv4.h>
92#include <linux/random.h>
93#include <linux/jhash.h>
94#include <linux/rcupdate.h>
95#include <linux/times.h>
96#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
106#include <net/ip_mp_alg.h>
8d71740c 107#include <net/netevent.h>
1da177e4
LT
108#ifdef CONFIG_SYSCTL
109#include <linux/sysctl.h>
110#endif
111
/*
 * Extract the bits of a flow's fl4_tos that are selected by
 * (IPTOS_RT_MASK | RTO_ONLINK), as a u32.
 *
 * The argument is parenthesized so that any pointer-valued expression
 * (not just a plain identifier) can be passed safely.
 */
#define RT_FL_TOS(oldflp) \
	((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114
115#define IP_MAX_MTU 0xFFF0
116
117#define RT_GC_TIMEOUT (300*HZ)
118
119static int ip_rt_min_delay = 2 * HZ;
120static int ip_rt_max_delay = 10 * HZ;
121static int ip_rt_max_size;
122static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
123static int ip_rt_gc_interval = 60 * HZ;
124static int ip_rt_gc_min_interval = HZ / 2;
125static int ip_rt_redirect_number = 9;
126static int ip_rt_redirect_load = HZ / 50;
127static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
128static int ip_rt_error_cost = HZ;
129static int ip_rt_error_burst = 5 * HZ;
130static int ip_rt_gc_elasticity = 8;
131static int ip_rt_mtu_expires = 10 * 60 * HZ;
132static int ip_rt_min_pmtu = 512 + 20 + 20;
133static int ip_rt_min_advmss = 256;
134static int ip_rt_secret_interval = 10 * 60 * HZ;
135static unsigned long rt_deadline;
136
137#define RTprint(a...) printk(KERN_DEBUG a)
138
139static struct timer_list rt_flush_timer;
140static struct timer_list rt_periodic_timer;
141static struct timer_list rt_secret_timer;
142
143/*
144 * Interface to generic destination cache.
145 */
146
147static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
148static void ipv4_dst_destroy(struct dst_entry *dst);
149static void ipv4_dst_ifdown(struct dst_entry *dst,
150 struct net_device *dev, int how);
151static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
152static void ipv4_link_failure(struct sk_buff *skb);
153static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
154static int rt_garbage_collect(void);
155
156
157static struct dst_ops ipv4_dst_ops = {
158 .family = AF_INET,
159 .protocol = __constant_htons(ETH_P_IP),
160 .gc = rt_garbage_collect,
161 .check = ipv4_dst_check,
162 .destroy = ipv4_dst_destroy,
163 .ifdown = ipv4_dst_ifdown,
164 .negative_advice = ipv4_negative_advice,
165 .link_failure = ipv4_link_failure,
166 .update_pmtu = ip_rt_update_pmtu,
167 .entry_size = sizeof(struct rtable),
168};
169
170#define ECN_OR_COST(class) TC_PRIO_##class
171
172__u8 ip_tos2prio[16] = {
173 TC_PRIO_BESTEFFORT,
174 ECN_OR_COST(FILLER),
175 TC_PRIO_BESTEFFORT,
176 ECN_OR_COST(BESTEFFORT),
177 TC_PRIO_BULK,
178 ECN_OR_COST(BULK),
179 TC_PRIO_BULK,
180 ECN_OR_COST(BULK),
181 TC_PRIO_INTERACTIVE,
182 ECN_OR_COST(INTERACTIVE),
183 TC_PRIO_INTERACTIVE,
184 ECN_OR_COST(INTERACTIVE),
185 TC_PRIO_INTERACTIVE_BULK,
186 ECN_OR_COST(INTERACTIVE_BULK),
187 TC_PRIO_INTERACTIVE_BULK,
188 ECN_OR_COST(INTERACTIVE_BULK)
189};
190
191
192/*
193 * Route cache.
194 */
195
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
205
/* One bucket of the route cache hash table: head of a chain of rtables. */
struct rt_hash_bucket {
	struct rtable	*chain;
};
8a25d5de
IM
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table
 * of spinlocks.  The size of this table is a power of two and depends on
 * the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;

/*
 * Map a hash slot to the spinlock that guards it.  Several slots share
 * one lock; the mask works because RT_HASH_LOCK_SZ is a power of two.
 * The expansion is fully parenthesized so it is safe in any expression.
 */
# define rt_hash_lock_addr(slot) \
	(&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/*
 * Allocate and initialise the lock table, panicking if the allocation
 * fails.  Wrapped in do { } while (0) so the macro behaves as a single
 * statement (e.g. inside an unbraced if/else).
 */
# define rt_hash_lock_init()	do { \
		int i; \
		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, \
					GFP_KERNEL); \
		if (!rt_hash_locks) \
			panic("IP: failed to allocate rt_hash_locks\n"); \
		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
			spin_lock_init(&rt_hash_locks[i]); \
	} while (0)
#else
/* No lock table in this configuration: the helpers collapse to nothing. */
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif
1da177e4
LT
245
246static struct rt_hash_bucket *rt_hash_table;
247static unsigned rt_hash_mask;
248static int rt_hash_log;
249static unsigned int rt_hash_rnd;
250
2f970d83 251static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
dbd2915c 252#define RT_CACHE_STAT_INC(field) \
bfe5d834 253 (__raw_get_cpu_var(rt_cache_stat).field++)
1da177e4
LT
254
255static int rt_intern_hash(unsigned hash, struct rtable *rth,
256 struct rtable **res);
257
cef2685e 258static unsigned int rt_hash_code(u32 daddr, u32 saddr)
1da177e4 259{
cef2685e 260 return (jhash_2words(daddr, saddr, rt_hash_rnd)
1da177e4
LT
261 & rt_hash_mask);
262}
263
8c7bc840
AV
/*
 * Bucket index for a (daddr, saddr, idx) triple.  Both addresses are
 * force-cast from __be32 to u32, and idx (shifted left by 5) is XORed
 * into the source address before the pair goes through rt_hash_code().
 */
#define rt_hash(daddr, saddr, idx) \
	rt_hash_code((__force u32)(__be32)(daddr), \
		     (__force u32)(__be32)(saddr) ^ ((idx) << 5))
267
1da177e4
LT
268#ifdef CONFIG_PROC_FS
/* Per-open iterator state for the route cache seq_file: current bucket. */
struct rt_cache_iter_state {
	int	bucket;
};
272
273static struct rtable *rt_cache_get_first(struct seq_file *seq)
274{
275 struct rtable *r = NULL;
276 struct rt_cache_iter_state *st = seq->private;
277
278 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
279 rcu_read_lock_bh();
280 r = rt_hash_table[st->bucket].chain;
281 if (r)
282 break;
283 rcu_read_unlock_bh();
284 }
285 return r;
286}
287
288static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
289{
290 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
291
292 r = r->u.rt_next;
293 while (!r) {
294 rcu_read_unlock_bh();
295 if (--st->bucket < 0)
296 break;
297 rcu_read_lock_bh();
298 r = rt_hash_table[st->bucket].chain;
299 }
300 return r;
301}
302
303static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
304{
305 struct rtable *r = rt_cache_get_first(seq);
306
307 if (r)
308 while (pos && (r = rt_cache_get_next(seq, r)))
309 --pos;
310 return pos ? NULL : r;
311}
312
313static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
314{
315 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
316}
317
318static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
319{
320 struct rtable *r = NULL;
321
322 if (v == SEQ_START_TOKEN)
323 r = rt_cache_get_first(seq);
324 else
325 r = rt_cache_get_next(seq, v);
326 ++*pos;
327 return r;
328}
329
330static void rt_cache_seq_stop(struct seq_file *seq, void *v)
331{
332 if (v && v != SEQ_START_TOKEN)
333 rcu_read_unlock_bh();
334}
335
336static int rt_cache_seq_show(struct seq_file *seq, void *v)
337{
338 if (v == SEQ_START_TOKEN)
339 seq_printf(seq, "%-127s\n",
340 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
341 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
342 "HHUptod\tSpecDst");
343 else {
344 struct rtable *r = v;
345 char temp[256];
346
347 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
348 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
349 r->u.dst.dev ? r->u.dst.dev->name : "*",
350 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
351 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
352 r->u.dst.__use, 0, (unsigned long)r->rt_src,
353 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
354 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
355 dst_metric(&r->u.dst, RTAX_WINDOW),
356 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
357 dst_metric(&r->u.dst, RTAX_RTTVAR)),
358 r->fl.fl4_tos,
359 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
360 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
361 dev_queue_xmit) : 0,
362 r->rt_spec_dst);
363 seq_printf(seq, "%-127s\n", temp);
364 }
365 return 0;
366}
367
368static struct seq_operations rt_cache_seq_ops = {
369 .start = rt_cache_seq_start,
370 .next = rt_cache_seq_next,
371 .stop = rt_cache_seq_stop,
372 .show = rt_cache_seq_show,
373};
374
375static int rt_cache_seq_open(struct inode *inode, struct file *file)
376{
377 struct seq_file *seq;
378 int rc = -ENOMEM;
379 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
380
381 if (!s)
382 goto out;
383 rc = seq_open(file, &rt_cache_seq_ops);
384 if (rc)
385 goto out_kfree;
386 seq = file->private_data;
387 seq->private = s;
388 memset(s, 0, sizeof(*s));
389out:
390 return rc;
391out_kfree:
392 kfree(s);
393 goto out;
394}
395
396static struct file_operations rt_cache_seq_fops = {
397 .owner = THIS_MODULE,
398 .open = rt_cache_seq_open,
399 .read = seq_read,
400 .llseek = seq_lseek,
401 .release = seq_release_private,
402};
403
404
405static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
406{
407 int cpu;
408
409 if (*pos == 0)
410 return SEQ_START_TOKEN;
411
412 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
413 if (!cpu_possible(cpu))
414 continue;
415 *pos = cpu+1;
2f970d83 416 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
417 }
418 return NULL;
419}
420
421static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
422{
423 int cpu;
424
425 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
426 if (!cpu_possible(cpu))
427 continue;
428 *pos = cpu+1;
2f970d83 429 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
430 }
431 return NULL;
432
433}
434
/* seq_file ->stop for the per-CPU statistics: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
439
440static int rt_cpu_seq_show(struct seq_file *seq, void *v)
441{
442 struct rt_cache_stat *st = v;
443
444 if (v == SEQ_START_TOKEN) {
5bec0039 445 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
1da177e4
LT
446 return 0;
447 }
448
449 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
450 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
451 atomic_read(&ipv4_dst_ops.entries),
452 st->in_hit,
453 st->in_slow_tot,
454 st->in_slow_mc,
455 st->in_no_route,
456 st->in_brd,
457 st->in_martian_dst,
458 st->in_martian_src,
459
460 st->out_hit,
461 st->out_slow_tot,
462 st->out_slow_mc,
463
464 st->gc_total,
465 st->gc_ignored,
466 st->gc_goal_miss,
467 st->gc_dst_overflow,
468 st->in_hlist_search,
469 st->out_hlist_search
470 );
471 return 0;
472}
473
474static struct seq_operations rt_cpu_seq_ops = {
475 .start = rt_cpu_seq_start,
476 .next = rt_cpu_seq_next,
477 .stop = rt_cpu_seq_stop,
478 .show = rt_cpu_seq_show,
479};
480
481
482static int rt_cpu_seq_open(struct inode *inode, struct file *file)
483{
484 return seq_open(file, &rt_cpu_seq_ops);
485}
486
487static struct file_operations rt_cpu_seq_fops = {
488 .owner = THIS_MODULE,
489 .open = rt_cpu_seq_open,
490 .read = seq_read,
491 .llseek = seq_lseek,
492 .release = seq_release,
493};
494
495#endif /* CONFIG_PROC_FS */
496
497static __inline__ void rt_free(struct rtable *rt)
498{
499 multipath_remove(rt);
500 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
501}
502
503static __inline__ void rt_drop(struct rtable *rt)
504{
505 multipath_remove(rt);
506 ip_rt_put(rt);
507 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
508}
509
510static __inline__ int rt_fast_clean(struct rtable *rth)
511{
512 /* Kill broadcast/multicast entries very aggresively, if they
513 collide in hash table with more useful entries */
514 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
515 rth->fl.iif && rth->u.rt_next;
516}
517
518static __inline__ int rt_valuable(struct rtable *rth)
519{
520 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
521 rth->u.dst.expires;
522}
523
524static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
525{
526 unsigned long age;
527 int ret = 0;
528
529 if (atomic_read(&rth->u.dst.__refcnt))
530 goto out;
531
532 ret = 1;
533 if (rth->u.dst.expires &&
534 time_after_eq(jiffies, rth->u.dst.expires))
535 goto out;
536
537 age = jiffies - rth->u.dst.lastuse;
538 ret = 0;
539 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
540 (age <= tmo2 && rt_valuable(rth)))
541 goto out;
542 ret = 1;
543out: return ret;
544}
545
546/* Bits of score are:
547 * 31: very valuable
548 * 30: not quite useless
549 * 29..0: usage counter
550 */
551static inline u32 rt_score(struct rtable *rt)
552{
553 u32 score = jiffies - rt->u.dst.lastuse;
554
555 score = ~score & ~(3<<30);
556
557 if (rt_valuable(rt))
558 score |= (1<<31);
559
560 if (!rt->fl.iif ||
561 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
562 score |= (1<<30);
563
564 return score;
565}
566
567static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
568{
714e85be
AV
569 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
570 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
47dcf0cb 571 (fl1->mark ^ fl2->mark) |
8238b218
DM
572 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
573 *(u16 *)&fl2->nl_u.ip4_u.tos) |
574 (fl1->oif ^ fl2->oif) |
575 (fl1->iif ^ fl2->iif)) == 0;
1da177e4
LT
576}
577
578#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
579static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
580 struct rtable *expentry,
581 int *removed_count)
582{
583 int passedexpired = 0;
584 struct rtable **nextstep = NULL;
585 struct rtable **rthp = chain_head;
586 struct rtable *rth;
587
588 if (removed_count)
589 *removed_count = 0;
590
591 while ((rth = *rthp) != NULL) {
592 if (rth == expentry)
593 passedexpired = 1;
594
595 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
596 compare_keys(&(*rthp)->fl, &expentry->fl)) {
597 if (*rthp == expentry) {
598 *rthp = rth->u.rt_next;
599 continue;
600 } else {
601 *rthp = rth->u.rt_next;
602 rt_free(rth);
603 if (removed_count)
604 ++(*removed_count);
605 }
606 } else {
607 if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
608 passedexpired && !nextstep)
609 nextstep = &rth->u.rt_next;
610
611 rthp = &rth->u.rt_next;
612 }
613 }
614
615 rt_free(expentry);
616 if (removed_count)
617 ++(*removed_count);
618
619 return nextstep;
620}
621#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
622
623
624/* This runs via a timer and thus is always in BH context. */
625static void rt_check_expire(unsigned long dummy)
626{
bb1d23b0
ED
627 static unsigned int rover;
628 unsigned int i = rover, goal;
1da177e4
LT
629 struct rtable *rth, **rthp;
630 unsigned long now = jiffies;
bb1d23b0
ED
631 u64 mult;
632
633 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
634 if (ip_rt_gc_timeout > 1)
635 do_div(mult, ip_rt_gc_timeout);
636 goal = (unsigned int)mult;
637 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
638 for (; goal > 0; goal--) {
1da177e4
LT
639 unsigned long tmo = ip_rt_gc_timeout;
640
641 i = (i + 1) & rt_hash_mask;
642 rthp = &rt_hash_table[i].chain;
643
bb1d23b0
ED
644 if (*rthp == 0)
645 continue;
22c047cc 646 spin_lock(rt_hash_lock_addr(i));
1da177e4
LT
647 while ((rth = *rthp) != NULL) {
648 if (rth->u.dst.expires) {
649 /* Entry is expired even if it is in use */
650 if (time_before_eq(now, rth->u.dst.expires)) {
651 tmo >>= 1;
652 rthp = &rth->u.rt_next;
653 continue;
654 }
655 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
656 tmo >>= 1;
657 rthp = &rth->u.rt_next;
658 continue;
659 }
660
661 /* Cleanup aged off entries. */
662#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
663 /* remove all related balanced entries if necessary */
664 if (rth->u.dst.flags & DST_BALANCED) {
665 rthp = rt_remove_balanced_route(
666 &rt_hash_table[i].chain,
667 rth, NULL);
668 if (!rthp)
669 break;
670 } else {
671 *rthp = rth->u.rt_next;
672 rt_free(rth);
673 }
674#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
675 *rthp = rth->u.rt_next;
676 rt_free(rth);
677#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
678 }
22c047cc 679 spin_unlock(rt_hash_lock_addr(i));
1da177e4
LT
680
681 /* Fallback loop breaker. */
682 if (time_after(jiffies, now))
683 break;
684 }
685 rover = i;
bb1d23b0 686 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
1da177e4
LT
687}
688
689/* This can run from both BH and non-BH contexts, the latter
690 * in the case of a forced flush event.
691 */
692static void rt_run_flush(unsigned long dummy)
693{
694 int i;
695 struct rtable *rth, *next;
696
697 rt_deadline = 0;
698
699 get_random_bytes(&rt_hash_rnd, 4);
700
701 for (i = rt_hash_mask; i >= 0; i--) {
22c047cc 702 spin_lock_bh(rt_hash_lock_addr(i));
1da177e4
LT
703 rth = rt_hash_table[i].chain;
704 if (rth)
705 rt_hash_table[i].chain = NULL;
22c047cc 706 spin_unlock_bh(rt_hash_lock_addr(i));
1da177e4
LT
707
708 for (; rth; rth = next) {
709 next = rth->u.rt_next;
710 rt_free(rth);
711 }
712 }
713}
714
715static DEFINE_SPINLOCK(rt_flush_lock);
716
717void rt_cache_flush(int delay)
718{
719 unsigned long now = jiffies;
720 int user_mode = !in_softirq();
721
722 if (delay < 0)
723 delay = ip_rt_min_delay;
724
725 /* flush existing multipath state*/
726 multipath_flush();
727
728 spin_lock_bh(&rt_flush_lock);
729
730 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
731 long tmo = (long)(rt_deadline - now);
732
733 /* If flush timer is already running
734 and flush request is not immediate (delay > 0):
735
736 if deadline is not achieved, prolongate timer to "delay",
737 otherwise fire it at deadline time.
738 */
739
740 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
741 tmo = 0;
742
743 if (delay > tmo)
744 delay = tmo;
745 }
746
747 if (delay <= 0) {
748 spin_unlock_bh(&rt_flush_lock);
749 rt_run_flush(0);
750 return;
751 }
752
753 if (rt_deadline == 0)
754 rt_deadline = now + ip_rt_max_delay;
755
756 mod_timer(&rt_flush_timer, now+delay);
757 spin_unlock_bh(&rt_flush_lock);
758}
759
760static void rt_secret_rebuild(unsigned long dummy)
761{
762 unsigned long now = jiffies;
763
764 rt_cache_flush(0);
765 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
766}
767
/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle "expire" is large enough to keep enough warm entries,
   and when load increases it is reduced to limit the cache size.
 */
780
781static int rt_garbage_collect(void)
782{
783 static unsigned long expire = RT_GC_TIMEOUT;
784 static unsigned long last_gc;
785 static int rover;
786 static int equilibrium;
787 struct rtable *rth, **rthp;
788 unsigned long now = jiffies;
789 int goal;
790
791 /*
792 * Garbage collection is pretty expensive,
793 * do not make it too frequently.
794 */
795
796 RT_CACHE_STAT_INC(gc_total);
797
798 if (now - last_gc < ip_rt_gc_min_interval &&
799 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
800 RT_CACHE_STAT_INC(gc_ignored);
801 goto out;
802 }
803
804 /* Calculate number of entries, which we want to expire now. */
805 goal = atomic_read(&ipv4_dst_ops.entries) -
806 (ip_rt_gc_elasticity << rt_hash_log);
807 if (goal <= 0) {
808 if (equilibrium < ipv4_dst_ops.gc_thresh)
809 equilibrium = ipv4_dst_ops.gc_thresh;
810 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
811 if (goal > 0) {
812 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
813 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
814 }
815 } else {
816 /* We are in dangerous area. Try to reduce cache really
817 * aggressively.
818 */
819 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
820 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
821 }
822
823 if (now - last_gc >= ip_rt_gc_min_interval)
824 last_gc = now;
825
826 if (goal <= 0) {
827 equilibrium += goal;
828 goto work_done;
829 }
830
831 do {
832 int i, k;
833
834 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
835 unsigned long tmo = expire;
836
837 k = (k + 1) & rt_hash_mask;
838 rthp = &rt_hash_table[k].chain;
22c047cc 839 spin_lock_bh(rt_hash_lock_addr(k));
1da177e4
LT
840 while ((rth = *rthp) != NULL) {
841 if (!rt_may_expire(rth, tmo, expire)) {
842 tmo >>= 1;
843 rthp = &rth->u.rt_next;
844 continue;
845 }
846#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
847 /* remove all related balanced entries
848 * if necessary
849 */
850 if (rth->u.dst.flags & DST_BALANCED) {
851 int r;
852
853 rthp = rt_remove_balanced_route(
85259878 854 &rt_hash_table[k].chain,
1da177e4
LT
855 rth,
856 &r);
857 goal -= r;
858 if (!rthp)
859 break;
860 } else {
861 *rthp = rth->u.rt_next;
862 rt_free(rth);
863 goal--;
864 }
865#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
866 *rthp = rth->u.rt_next;
867 rt_free(rth);
868 goal--;
869#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
870 }
22c047cc 871 spin_unlock_bh(rt_hash_lock_addr(k));
1da177e4
LT
872 if (goal <= 0)
873 break;
874 }
875 rover = k;
876
877 if (goal <= 0)
878 goto work_done;
879
880 /* Goal is not achieved. We stop process if:
881
882 - if expire reduced to zero. Otherwise, expire is halfed.
883 - if table is not full.
884 - if we are called from interrupt.
885 - jiffies check is just fallback/debug loop breaker.
886 We will not spin here for long time in any case.
887 */
888
889 RT_CACHE_STAT_INC(gc_goal_miss);
890
891 if (expire == 0)
892 break;
893
894 expire >>= 1;
895#if RT_CACHE_DEBUG >= 2
896 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
897 atomic_read(&ipv4_dst_ops.entries), goal, i);
898#endif
899
900 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
901 goto out;
902 } while (!in_softirq() && time_before_eq(jiffies, now));
903
904 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
905 goto out;
906 if (net_ratelimit())
907 printk(KERN_WARNING "dst cache overflow\n");
908 RT_CACHE_STAT_INC(gc_dst_overflow);
909 return 1;
910
911work_done:
912 expire += ip_rt_gc_min_interval;
913 if (expire > ip_rt_gc_timeout ||
914 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
915 expire = ip_rt_gc_timeout;
916#if RT_CACHE_DEBUG >= 2
917 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
918 atomic_read(&ipv4_dst_ops.entries), goal, rover);
919#endif
920out: return 0;
921}
922
923static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
924{
925 struct rtable *rth, **rthp;
926 unsigned long now;
927 struct rtable *cand, **candp;
928 u32 min_score;
929 int chain_length;
930 int attempts = !in_softirq();
931
932restart:
933 chain_length = 0;
934 min_score = ~(u32)0;
935 cand = NULL;
936 candp = NULL;
937 now = jiffies;
938
939 rthp = &rt_hash_table[hash].chain;
940
22c047cc 941 spin_lock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
942 while ((rth = *rthp) != NULL) {
943#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
944 if (!(rth->u.dst.flags & DST_BALANCED) &&
945 compare_keys(&rth->fl, &rt->fl)) {
946#else
947 if (compare_keys(&rth->fl, &rt->fl)) {
948#endif
949 /* Put it first */
950 *rthp = rth->u.rt_next;
951 /*
952 * Since lookup is lockfree, the deletion
953 * must be visible to another weakly ordered CPU before
954 * the insertion at the start of the hash chain.
955 */
956 rcu_assign_pointer(rth->u.rt_next,
957 rt_hash_table[hash].chain);
958 /*
959 * Since lookup is lockfree, the update writes
960 * must be ordered for consistency on SMP.
961 */
962 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
963
964 rth->u.dst.__use++;
965 dst_hold(&rth->u.dst);
966 rth->u.dst.lastuse = now;
22c047cc 967 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
968
969 rt_drop(rt);
970 *rp = rth;
971 return 0;
972 }
973
974 if (!atomic_read(&rth->u.dst.__refcnt)) {
975 u32 score = rt_score(rth);
976
977 if (score <= min_score) {
978 cand = rth;
979 candp = rthp;
980 min_score = score;
981 }
982 }
983
984 chain_length++;
985
986 rthp = &rth->u.rt_next;
987 }
988
989 if (cand) {
990 /* ip_rt_gc_elasticity used to be average length of chain
991 * length, when exceeded gc becomes really aggressive.
992 *
993 * The second limit is less certain. At the moment it allows
994 * only 2 entries per bucket. We will see.
995 */
996 if (chain_length > ip_rt_gc_elasticity) {
997 *candp = cand->u.rt_next;
998 rt_free(cand);
999 }
1000 }
1001
1002 /* Try to bind route to arp only if it is output
1003 route or unicast forwarding path.
1004 */
1005 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1006 int err = arp_bind_neighbour(&rt->u.dst);
1007 if (err) {
22c047cc 1008 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1009
1010 if (err != -ENOBUFS) {
1011 rt_drop(rt);
1012 return err;
1013 }
1014
1015 /* Neighbour tables are full and nothing
1016 can be released. Try to shrink route cache,
1017 it is most likely it holds some neighbour records.
1018 */
1019 if (attempts-- > 0) {
1020 int saved_elasticity = ip_rt_gc_elasticity;
1021 int saved_int = ip_rt_gc_min_interval;
1022 ip_rt_gc_elasticity = 1;
1023 ip_rt_gc_min_interval = 0;
1024 rt_garbage_collect();
1025 ip_rt_gc_min_interval = saved_int;
1026 ip_rt_gc_elasticity = saved_elasticity;
1027 goto restart;
1028 }
1029
1030 if (net_ratelimit())
1031 printk(KERN_WARNING "Neighbour table overflow.\n");
1032 rt_drop(rt);
1033 return -ENOBUFS;
1034 }
1035 }
1036
1037 rt->u.rt_next = rt_hash_table[hash].chain;
1038#if RT_CACHE_DEBUG >= 2
1039 if (rt->u.rt_next) {
1040 struct rtable *trt;
1041 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1042 NIPQUAD(rt->rt_dst));
1043 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1044 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1045 printk("\n");
1046 }
1047#endif
1048 rt_hash_table[hash].chain = rt;
22c047cc 1049 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1050 *rp = rt;
1051 return 0;
1052}
1053
1054void rt_bind_peer(struct rtable *rt, int create)
1055{
1056 static DEFINE_SPINLOCK(rt_peer_lock);
1057 struct inet_peer *peer;
1058
1059 peer = inet_getpeer(rt->rt_dst, create);
1060
1061 spin_lock_bh(&rt_peer_lock);
1062 if (rt->peer == NULL) {
1063 rt->peer = peer;
1064 peer = NULL;
1065 }
1066 spin_unlock_bh(&rt_peer_lock);
1067 if (peer)
1068 inet_putpeer(peer);
1069}
1070
1071/*
1072 * Peer allocation may fail only in serious out-of-memory conditions. However
1073 * we still can generate some output.
1074 * Random ID selection looks a bit dangerous because we have no chances to
1075 * select ID being unique in a reasonable period of time.
1076 * But broken packet identifier may be better than no packet at all.
1077 */
1078static void ip_select_fb_ident(struct iphdr *iph)
1079{
1080 static DEFINE_SPINLOCK(ip_fb_id_lock);
1081 static u32 ip_fallback_id;
1082 u32 salt;
1083
1084 spin_lock_bh(&ip_fb_id_lock);
e448515c 1085 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1da177e4
LT
1086 iph->id = htons(salt & 0xFFFF);
1087 ip_fallback_id = salt;
1088 spin_unlock_bh(&ip_fb_id_lock);
1089}
1090
1091void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1092{
1093 struct rtable *rt = (struct rtable *) dst;
1094
1095 if (rt) {
1096 if (rt->peer == NULL)
1097 rt_bind_peer(rt, 1);
1098
1099 /* If peer is attached to destination, it is never detached,
1100 so that we need not to grab a lock to dereference it.
1101 */
1102 if (rt->peer) {
1103 iph->id = htons(inet_getid(rt->peer, more));
1104 return;
1105 }
1106 } else
9c2b3328
SH
1107 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1108 __builtin_return_address(0));
1da177e4
LT
1109
1110 ip_select_fb_ident(iph);
1111}
1112
1113static void rt_del(unsigned hash, struct rtable *rt)
1114{
1115 struct rtable **rthp;
1116
22c047cc 1117 spin_lock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1118 ip_rt_put(rt);
1119 for (rthp = &rt_hash_table[hash].chain; *rthp;
1120 rthp = &(*rthp)->u.rt_next)
1121 if (*rthp == rt) {
1122 *rthp = rt->u.rt_next;
1123 rt_free(rt);
1124 break;
1125 }
22c047cc 1126 spin_unlock_bh(rt_hash_lock_addr(hash));
1da177e4
LT
1127}
1128
f7655229
AV
1129void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1130 __be32 saddr, struct net_device *dev)
1da177e4
LT
1131{
1132 int i, k;
1133 struct in_device *in_dev = in_dev_get(dev);
1134 struct rtable *rth, **rthp;
f7655229 1135 __be32 skeys[2] = { saddr, 0 };
1da177e4 1136 int ikeys[2] = { dev->ifindex, 0 };
8d71740c 1137 struct netevent_redirect netevent;
1da177e4 1138
1da177e4
LT
1139 if (!in_dev)
1140 return;
1141
1142 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1143 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1144 goto reject_redirect;
1145
1146 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1147 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1148 goto reject_redirect;
1149 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1150 goto reject_redirect;
1151 } else {
1152 if (inet_addr_type(new_gw) != RTN_UNICAST)
1153 goto reject_redirect;
1154 }
1155
1156 for (i = 0; i < 2; i++) {
1157 for (k = 0; k < 2; k++) {
8c7bc840 1158 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1da177e4
LT
1159
1160 rthp=&rt_hash_table[hash].chain;
1161
1162 rcu_read_lock();
1163 while ((rth = rcu_dereference(*rthp)) != NULL) {
1164 struct rtable *rt;
1165
1166 if (rth->fl.fl4_dst != daddr ||
1167 rth->fl.fl4_src != skeys[i] ||
1da177e4
LT
1168 rth->fl.oif != ikeys[k] ||
1169 rth->fl.iif != 0) {
1170 rthp = &rth->u.rt_next;
1171 continue;
1172 }
1173
1174 if (rth->rt_dst != daddr ||
1175 rth->rt_src != saddr ||
1176 rth->u.dst.error ||
1177 rth->rt_gateway != old_gw ||
1178 rth->u.dst.dev != dev)
1179 break;
1180
1181 dst_hold(&rth->u.dst);
1182 rcu_read_unlock();
1183
1184 rt = dst_alloc(&ipv4_dst_ops);
1185 if (rt == NULL) {
1186 ip_rt_put(rth);
1187 in_dev_put(in_dev);
1188 return;
1189 }
1190
1191 /* Copy all the information. */
1192 *rt = *rth;
1193 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1194 rt->u.dst.__use = 1;
1195 atomic_set(&rt->u.dst.__refcnt, 1);
1196 rt->u.dst.child = NULL;
1197 if (rt->u.dst.dev)
1198 dev_hold(rt->u.dst.dev);
1199 if (rt->idev)
1200 in_dev_hold(rt->idev);
1201 rt->u.dst.obsolete = 0;
1202 rt->u.dst.lastuse = jiffies;
1203 rt->u.dst.path = &rt->u.dst;
1204 rt->u.dst.neighbour = NULL;
1205 rt->u.dst.hh = NULL;
1206 rt->u.dst.xfrm = NULL;
1207
1208 rt->rt_flags |= RTCF_REDIRECTED;
1209
1210 /* Gateway is different ... */
1211 rt->rt_gateway = new_gw;
1212
1213 /* Redirect received -> path was valid */
1214 dst_confirm(&rth->u.dst);
1215
1216 if (rt->peer)
1217 atomic_inc(&rt->peer->refcnt);
1218
1219 if (arp_bind_neighbour(&rt->u.dst) ||
1220 !(rt->u.dst.neighbour->nud_state &
1221 NUD_VALID)) {
1222 if (rt->u.dst.neighbour)
1223 neigh_event_send(rt->u.dst.neighbour, NULL);
1224 ip_rt_put(rth);
1225 rt_drop(rt);
1226 goto do_next;
1227 }
8d71740c
TT
1228
1229 netevent.old = &rth->u.dst;
1230 netevent.new = &rt->u.dst;
1231 call_netevent_notifiers(NETEVENT_REDIRECT,
1232 &netevent);
1da177e4
LT
1233
1234 rt_del(hash, rth);
1235 if (!rt_intern_hash(hash, rt, &rt))
1236 ip_rt_put(rt);
1237 goto do_next;
1238 }
1239 rcu_read_unlock();
1240 do_next:
1241 ;
1242 }
1243 }
1244 in_dev_put(in_dev);
1245 return;
1246
1247reject_redirect:
1248#ifdef CONFIG_IP_ROUTE_VERBOSE
1249 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1250 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1251 "%u.%u.%u.%u ignored.\n"
cef2685e 1252 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1da177e4 1253 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
cef2685e 1254 NIPQUAD(saddr), NIPQUAD(daddr));
1da177e4
LT
1255#endif
1256 in_dev_put(in_dev);
1257}
1258
1259static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1260{
1261 struct rtable *rt = (struct rtable*)dst;
1262 struct dst_entry *ret = dst;
1263
1264 if (rt) {
1265 if (dst->obsolete) {
1266 ip_rt_put(rt);
1267 ret = NULL;
1268 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1269 rt->u.dst.expires) {
8c7bc840
AV
1270 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1271 rt->fl.oif);
1da177e4
LT
1272#if RT_CACHE_DEBUG >= 1
1273 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1274 "%u.%u.%u.%u/%02x dropped\n",
1275 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1276#endif
1277 rt_del(hash, rt);
1278 ret = NULL;
1279 }
1280 }
1281 return ret;
1282}
1283
1284/*
1285 * Algorithm:
1286 * 1. The first ip_rt_redirect_number redirects are sent
1287 * with exponential backoff, then we stop sending them at all,
1288 * assuming that the host ignores our redirects.
1289 * 2. If we did not see packets requiring redirects
1290 * during ip_rt_redirect_silence, we assume that the host
1291 * forgot redirected route and start to send redirects again.
1292 *
1293 * This algorithm is much cheaper and more intelligent than dumb load limiting
1294 * in icmp.c.
1295 *
1296 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1297 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1298 */
1299
1300void ip_rt_send_redirect(struct sk_buff *skb)
1301{
1302 struct rtable *rt = (struct rtable*)skb->dst;
1303 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1304
1305 if (!in_dev)
1306 return;
1307
1308 if (!IN_DEV_TX_REDIRECTS(in_dev))
1309 goto out;
1310
1311 /* No redirected packets during ip_rt_redirect_silence;
1312 * reset the algorithm.
1313 */
1314 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1315 rt->u.dst.rate_tokens = 0;
1316
1317 /* Too many ignored redirects; do not send anything
1318 * set u.dst.rate_last to the last seen redirected packet.
1319 */
1320 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1321 rt->u.dst.rate_last = jiffies;
1322 goto out;
1323 }
1324
1325 /* Check for load limit; set rate_last to the latest sent
1326 * redirect.
1327 */
14fb8a76
LY
1328 if (rt->u.dst.rate_tokens == 0 ||
1329 time_after(jiffies,
1da177e4
LT
1330 (rt->u.dst.rate_last +
1331 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1332 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1333 rt->u.dst.rate_last = jiffies;
1334 ++rt->u.dst.rate_tokens;
1335#ifdef CONFIG_IP_ROUTE_VERBOSE
1336 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1337 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1338 net_ratelimit())
1339 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1340 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1341 NIPQUAD(rt->rt_src), rt->rt_iif,
1342 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1343#endif
1344 }
1345out:
1346 in_dev_put(in_dev);
1347}
1348
1349static int ip_error(struct sk_buff *skb)
1350{
1351 struct rtable *rt = (struct rtable*)skb->dst;
1352 unsigned long now;
1353 int code;
1354
1355 switch (rt->u.dst.error) {
1356 case EINVAL:
1357 default:
1358 goto out;
1359 case EHOSTUNREACH:
1360 code = ICMP_HOST_UNREACH;
1361 break;
1362 case ENETUNREACH:
1363 code = ICMP_NET_UNREACH;
1364 break;
1365 case EACCES:
1366 code = ICMP_PKT_FILTERED;
1367 break;
1368 }
1369
1370 now = jiffies;
1371 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1372 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1373 rt->u.dst.rate_tokens = ip_rt_error_burst;
1374 rt->u.dst.rate_last = now;
1375 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1376 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1377 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1378 }
1379
1380out: kfree_skb(skb);
1381 return 0;
1382}
1383
/*
 * MTU plateau table, in descending order.
 *
 * The last two values are not from the RFC (presumably RFC 1191 - confirm)
 * but are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau strictly below @old_mtu, or 68 when
 * @old_mtu is not above the smallest table entry.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end = mtu_plateau +
			sizeof(mtu_plateau) / sizeof(mtu_plateau[0]);

	while (plateau != end) {
		if (old_mtu > *plateau)
			return *plateau;
		plateau++;
	}
	return 68;
}
1401
1402unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1403{
1404 int i;
1405 unsigned short old_mtu = ntohs(iph->tot_len);
1406 struct rtable *rth;
e448515c
AV
1407 __be32 skeys[2] = { iph->saddr, 0, };
1408 __be32 daddr = iph->daddr;
1da177e4
LT
1409 unsigned short est_mtu = 0;
1410
1411 if (ipv4_config.no_pmtu_disc)
1412 return 0;
1413
1414 for (i = 0; i < 2; i++) {
8c7bc840 1415 unsigned hash = rt_hash(daddr, skeys[i], 0);
1da177e4
LT
1416
1417 rcu_read_lock();
1418 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1419 rth = rcu_dereference(rth->u.rt_next)) {
1420 if (rth->fl.fl4_dst == daddr &&
1421 rth->fl.fl4_src == skeys[i] &&
1422 rth->rt_dst == daddr &&
1423 rth->rt_src == iph->saddr &&
1da177e4
LT
1424 rth->fl.iif == 0 &&
1425 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1426 unsigned short mtu = new_mtu;
1427
1428 if (new_mtu < 68 || new_mtu >= old_mtu) {
1429
1430 /* BSD 4.2 compatibility hack :-( */
1431 if (mtu == 0 &&
1432 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1433 old_mtu >= 68 + (iph->ihl << 2))
1434 old_mtu -= iph->ihl << 2;
1435
1436 mtu = guess_mtu(old_mtu);
1437 }
1438 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1439 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1440 dst_confirm(&rth->u.dst);
1441 if (mtu < ip_rt_min_pmtu) {
1442 mtu = ip_rt_min_pmtu;
1443 rth->u.dst.metrics[RTAX_LOCK-1] |=
1444 (1 << RTAX_MTU);
1445 }
1446 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1447 dst_set_expires(&rth->u.dst,
1448 ip_rt_mtu_expires);
1449 }
1450 est_mtu = mtu;
1451 }
1452 }
1453 }
1454 rcu_read_unlock();
1455 }
1456 return est_mtu ? : new_mtu;
1457}
1458
1459static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1460{
1461 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1462 !(dst_metric_locked(dst, RTAX_MTU))) {
1463 if (mtu < ip_rt_min_pmtu) {
1464 mtu = ip_rt_min_pmtu;
1465 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1466 }
1467 dst->metrics[RTAX_MTU-1] = mtu;
1468 dst_set_expires(dst, ip_rt_mtu_expires);
8d71740c 1469 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1da177e4
LT
1470 }
1471}
1472
1473static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1474{
1475 return NULL;
1476}
1477
1478static void ipv4_dst_destroy(struct dst_entry *dst)
1479{
1480 struct rtable *rt = (struct rtable *) dst;
1481 struct inet_peer *peer = rt->peer;
1482 struct in_device *idev = rt->idev;
1483
1484 if (peer) {
1485 rt->peer = NULL;
1486 inet_putpeer(peer);
1487 }
1488
1489 if (idev) {
1490 rt->idev = NULL;
1491 in_dev_put(idev);
1492 }
1493}
1494
1495static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1496 int how)
1497{
1498 struct rtable *rt = (struct rtable *) dst;
1499 struct in_device *idev = rt->idev;
1500 if (dev != &loopback_dev && idev && idev->dev == dev) {
1501 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1502 if (loopback_idev) {
1503 rt->idev = loopback_idev;
1504 in_dev_put(idev);
1505 }
1506 }
1507}
1508
1509static void ipv4_link_failure(struct sk_buff *skb)
1510{
1511 struct rtable *rt;
1512
1513 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1514
1515 rt = (struct rtable *) skb->dst;
1516 if (rt)
1517 dst_set_expires(&rt->u.dst, 0);
1518}
1519
1520static int ip_rt_bug(struct sk_buff *skb)
1521{
1522 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1523 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1524 skb->dev ? skb->dev->name : "?");
1525 kfree_skb(skb);
1526 return 0;
1527}
1528
1529/*
1530 We do not cache source address of outgoing interface,
1531 because it is used only by IP RR, TS and SRR options,
1532 so that it out of fast path.
1533
1534 BTW remember: "addr" is allowed to be not aligned
1535 in IP options!
1536 */
1537
1538void ip_rt_get_source(u8 *addr, struct rtable *rt)
1539{
a61ced5d 1540 __be32 src;
1da177e4
LT
1541 struct fib_result res;
1542
1543 if (rt->fl.iif == 0)
1544 src = rt->rt_src;
1545 else if (fib_lookup(&rt->fl, &res) == 0) {
1546 src = FIB_RES_PREFSRC(res);
1547 fib_res_put(&res);
1548 } else
1549 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1550 RT_SCOPE_UNIVERSE);
1551 memcpy(addr, &src, 4);
1552}
1553
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid, one 16-bit half at a time: a
 * half is taken from @tag only if that half of tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 lo_half = 0xFFFF;
	const u32 hi_half = 0xFFFF0000;

	if ((rt->u.dst.tclassid & lo_half) == 0)
		rt->u.dst.tclassid |= tag & lo_half;
	if ((rt->u.dst.tclassid & hi_half) == 0)
		rt->u.dst.tclassid |= tag & hi_half;
}
#endif
1563
1564static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1565{
1566 struct fib_info *fi = res->fi;
1567
1568 if (fi) {
1569 if (FIB_RES_GW(*res) &&
1570 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1571 rt->rt_gateway = FIB_RES_GW(*res);
1572 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1573 sizeof(rt->u.dst.metrics));
1574 if (fi->fib_mtu == 0) {
1575 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1576 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1577 rt->rt_gateway != rt->rt_dst &&
1578 rt->u.dst.dev->mtu > 576)
1579 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1580 }
1581#ifdef CONFIG_NET_CLS_ROUTE
1582 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1583#endif
1584 } else
1585 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1586
1587 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1588 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1589 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1590 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1591 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1592 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1593 ip_rt_min_advmss);
1594 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1595 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1596
1597#ifdef CONFIG_NET_CLS_ROUTE
1598#ifdef CONFIG_IP_MULTIPLE_TABLES
1599 set_class_tag(rt, fib_rules_tclass(res));
1600#endif
1601 set_class_tag(rt, itag);
1602#endif
1603 rt->rt_type = res->type;
1604}
1605
9e12bb22 1606static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1607 u8 tos, struct net_device *dev, int our)
1608{
1609 unsigned hash;
1610 struct rtable *rth;
a61ced5d 1611 __be32 spec_dst;
1da177e4
LT
1612 struct in_device *in_dev = in_dev_get(dev);
1613 u32 itag = 0;
1614
1615 /* Primary sanity checks. */
1616
1617 if (in_dev == NULL)
1618 return -EINVAL;
1619
1620 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1621 skb->protocol != htons(ETH_P_IP))
1622 goto e_inval;
1623
1624 if (ZERONET(saddr)) {
1625 if (!LOCAL_MCAST(daddr))
1626 goto e_inval;
1627 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1628 } else if (fib_validate_source(saddr, 0, tos, 0,
1629 dev, &spec_dst, &itag) < 0)
1630 goto e_inval;
1631
1632 rth = dst_alloc(&ipv4_dst_ops);
1633 if (!rth)
1634 goto e_nobufs;
1635
1636 rth->u.dst.output= ip_rt_bug;
1637
1638 atomic_set(&rth->u.dst.__refcnt, 1);
1639 rth->u.dst.flags= DST_HOST;
1640 if (in_dev->cnf.no_policy)
1641 rth->u.dst.flags |= DST_NOPOLICY;
1642 rth->fl.fl4_dst = daddr;
1643 rth->rt_dst = daddr;
1644 rth->fl.fl4_tos = tos;
47dcf0cb 1645 rth->fl.mark = skb->mark;
1da177e4
LT
1646 rth->fl.fl4_src = saddr;
1647 rth->rt_src = saddr;
1648#ifdef CONFIG_NET_CLS_ROUTE
1649 rth->u.dst.tclassid = itag;
1650#endif
1651 rth->rt_iif =
1652 rth->fl.iif = dev->ifindex;
1653 rth->u.dst.dev = &loopback_dev;
1654 dev_hold(rth->u.dst.dev);
1655 rth->idev = in_dev_get(rth->u.dst.dev);
1656 rth->fl.oif = 0;
1657 rth->rt_gateway = daddr;
1658 rth->rt_spec_dst= spec_dst;
1659 rth->rt_type = RTN_MULTICAST;
1660 rth->rt_flags = RTCF_MULTICAST;
1661 if (our) {
1662 rth->u.dst.input= ip_local_deliver;
1663 rth->rt_flags |= RTCF_LOCAL;
1664 }
1665
1666#ifdef CONFIG_IP_MROUTE
1667 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1668 rth->u.dst.input = ip_mr_input;
1669#endif
1670 RT_CACHE_STAT_INC(in_slow_mc);
1671
1672 in_dev_put(in_dev);
8c7bc840 1673 hash = rt_hash(daddr, saddr, dev->ifindex);
1da177e4
LT
1674 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1675
1676e_nobufs:
1677 in_dev_put(in_dev);
1678 return -ENOBUFS;
1679
1680e_inval:
1681 in_dev_put(in_dev);
1682 return -EINVAL;
1683}
1684
1685
1686static void ip_handle_martian_source(struct net_device *dev,
1687 struct in_device *in_dev,
1688 struct sk_buff *skb,
9e12bb22
AV
1689 __be32 daddr,
1690 __be32 saddr)
1da177e4
LT
1691{
1692 RT_CACHE_STAT_INC(in_martian_src);
1693#ifdef CONFIG_IP_ROUTE_VERBOSE
1694 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1695 /*
1696 * RFC1812 recommendation, if source is martian,
1697 * the only hint is MAC header.
1698 */
1699 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1700 "%u.%u.%u.%u, on dev %s\n",
1701 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
0b7f22aa 1702 if (dev->hard_header_len && skb->mac.raw) {
1da177e4
LT
1703 int i;
1704 unsigned char *p = skb->mac.raw;
1705 printk(KERN_WARNING "ll header: ");
1706 for (i = 0; i < dev->hard_header_len; i++, p++) {
1707 printk("%02x", *p);
1708 if (i < (dev->hard_header_len - 1))
1709 printk(":");
1710 }
1711 printk("\n");
1712 }
1713 }
1714#endif
1715}
1716
1717static inline int __mkroute_input(struct sk_buff *skb,
1718 struct fib_result* res,
1719 struct in_device *in_dev,
9e12bb22 1720 __be32 daddr, __be32 saddr, u32 tos,
1da177e4
LT
1721 struct rtable **result)
1722{
1723
1724 struct rtable *rth;
1725 int err;
1726 struct in_device *out_dev;
1727 unsigned flags = 0;
d9c9df8c
AV
1728 __be32 spec_dst;
1729 u32 itag;
1da177e4
LT
1730
1731 /* get a working reference to the output device */
1732 out_dev = in_dev_get(FIB_RES_DEV(*res));
1733 if (out_dev == NULL) {
1734 if (net_ratelimit())
1735 printk(KERN_CRIT "Bug in ip_route_input" \
1736 "_slow(). Please, report\n");
1737 return -EINVAL;
1738 }
1739
1740
1741 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1742 in_dev->dev, &spec_dst, &itag);
1743 if (err < 0) {
1744 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1745 saddr);
1746
1747 err = -EINVAL;
1748 goto cleanup;
1749 }
1750
1751 if (err)
1752 flags |= RTCF_DIRECTSRC;
1753
1754 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1755 (IN_DEV_SHARED_MEDIA(out_dev) ||
1756 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1757 flags |= RTCF_DOREDIRECT;
1758
1759 if (skb->protocol != htons(ETH_P_IP)) {
1760 /* Not IP (i.e. ARP). Do not create route, if it is
1761 * invalid for proxy arp. DNAT routes are always valid.
1762 */
1763 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1764 err = -EINVAL;
1765 goto cleanup;
1766 }
1767 }
1768
1769
1770 rth = dst_alloc(&ipv4_dst_ops);
1771 if (!rth) {
1772 err = -ENOBUFS;
1773 goto cleanup;
1774 }
1775
ce723d8e 1776 atomic_set(&rth->u.dst.__refcnt, 1);
1da177e4
LT
1777 rth->u.dst.flags= DST_HOST;
1778#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1779 if (res->fi->fib_nhs > 1)
1780 rth->u.dst.flags |= DST_BALANCED;
1781#endif
1782 if (in_dev->cnf.no_policy)
1783 rth->u.dst.flags |= DST_NOPOLICY;
1b6651f1 1784 if (out_dev->cnf.no_xfrm)
1da177e4
LT
1785 rth->u.dst.flags |= DST_NOXFRM;
1786 rth->fl.fl4_dst = daddr;
1787 rth->rt_dst = daddr;
1788 rth->fl.fl4_tos = tos;
47dcf0cb 1789 rth->fl.mark = skb->mark;
1da177e4
LT
1790 rth->fl.fl4_src = saddr;
1791 rth->rt_src = saddr;
1792 rth->rt_gateway = daddr;
1793 rth->rt_iif =
1794 rth->fl.iif = in_dev->dev->ifindex;
1795 rth->u.dst.dev = (out_dev)->dev;
1796 dev_hold(rth->u.dst.dev);
1797 rth->idev = in_dev_get(rth->u.dst.dev);
1798 rth->fl.oif = 0;
1799 rth->rt_spec_dst= spec_dst;
1800
1801 rth->u.dst.input = ip_forward;
1802 rth->u.dst.output = ip_output;
1803
1804 rt_set_nexthop(rth, res, itag);
1805
1806 rth->rt_flags = flags;
1807
1808 *result = rth;
1809 err = 0;
1810 cleanup:
1811 /* release the working reference to the output device */
1812 in_dev_put(out_dev);
1813 return err;
1814}
1815
1816static inline int ip_mkroute_input_def(struct sk_buff *skb,
1817 struct fib_result* res,
1818 const struct flowi *fl,
1819 struct in_device *in_dev,
9e12bb22 1820 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1821{
7abaa27c 1822 struct rtable* rth = NULL;
1da177e4
LT
1823 int err;
1824 unsigned hash;
1825
1826#ifdef CONFIG_IP_ROUTE_MULTIPATH
1827 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1828 fib_select_multipath(fl, res);
1829#endif
1830
1831 /* create a routing cache entry */
1832 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1833 if (err)
1834 return err;
1da177e4
LT
1835
1836 /* put it into the cache */
8c7bc840 1837 hash = rt_hash(daddr, saddr, fl->iif);
1da177e4
LT
1838 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1839}
1840
1841static inline int ip_mkroute_input(struct sk_buff *skb,
1842 struct fib_result* res,
1843 const struct flowi *fl,
1844 struct in_device *in_dev,
9e12bb22 1845 __be32 daddr, __be32 saddr, u32 tos)
1da177e4
LT
1846{
1847#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
ce723d8e
JA
1848 struct rtable* rth = NULL, *rtres;
1849 unsigned char hop, hopcount;
1da177e4
LT
1850 int err = -EINVAL;
1851 unsigned int hash;
1852
1853 if (res->fi)
1854 hopcount = res->fi->fib_nhs;
1855 else
1856 hopcount = 1;
1857
1da177e4
LT
1858 /* distinguish between multipath and singlepath */
1859 if (hopcount < 2)
1860 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1861 saddr, tos);
1862
1863 /* add all alternatives to the routing cache */
1864 for (hop = 0; hop < hopcount; hop++) {
1865 res->nh_sel = hop;
1866
ce723d8e
JA
1867 /* put reference to previous result */
1868 if (hop)
1869 ip_rt_put(rtres);
1870
1da177e4
LT
1871 /* create a routing cache entry */
1872 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1873 &rth);
1874 if (err)
1875 return err;
1876
1877 /* put it into the cache */
8c7bc840 1878 hash = rt_hash(daddr, saddr, fl->iif);
ce723d8e 1879 err = rt_intern_hash(hash, rth, &rtres);
1da177e4
LT
1880 if (err)
1881 return err;
1882
1883 /* forward hop information to multipath impl. */
1884 multipath_set_nhinfo(rth,
1885 FIB_RES_NETWORK(*res),
1886 FIB_RES_NETMASK(*res),
1887 res->prefixlen,
1888 &FIB_RES_NH(*res));
1da177e4 1889 }
ce723d8e 1890 skb->dst = &rtres->u.dst;
1da177e4
LT
1891 return err;
1892#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1893 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1894#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1895}
1896
1897
1898/*
1899 * NOTE. We drop all the packets that has local source
1900 * addresses, because every properly looped back packet
1901 * must have correct destination already attached by output routine.
1902 *
1903 * Such approach solves two big problems:
1904 * 1. Not simplex devices are handled properly.
1905 * 2. IP spoofing attempts are filtered with 100% of guarantee.
1906 */
1907
9e12bb22 1908static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1909 u8 tos, struct net_device *dev)
1910{
1911 struct fib_result res;
1912 struct in_device *in_dev = in_dev_get(dev);
1913 struct flowi fl = { .nl_u = { .ip4_u =
1914 { .daddr = daddr,
1915 .saddr = saddr,
1916 .tos = tos,
1917 .scope = RT_SCOPE_UNIVERSE,
1da177e4 1918 } },
47dcf0cb 1919 .mark = skb->mark,
1da177e4
LT
1920 .iif = dev->ifindex };
1921 unsigned flags = 0;
1922 u32 itag = 0;
1923 struct rtable * rth;
1924 unsigned hash;
9e12bb22 1925 __be32 spec_dst;
1da177e4
LT
1926 int err = -EINVAL;
1927 int free_res = 0;
1928
1929 /* IP on this device is disabled. */
1930
1931 if (!in_dev)
1932 goto out;
1933
1934 /* Check for the most weird martians, which can be not detected
1935 by fib_lookup.
1936 */
1937
1938 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1939 goto martian_source;
1940
e448515c 1941 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1da177e4
LT
1942 goto brd_input;
1943
1944 /* Accept zero addresses only to limited broadcast;
1945 * I even do not know to fix it or not. Waiting for complains :-)
1946 */
1947 if (ZERONET(saddr))
1948 goto martian_source;
1949
1950 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1951 goto martian_destination;
1952
1953 /*
1954 * Now we are ready to route packet.
1955 */
1956 if ((err = fib_lookup(&fl, &res)) != 0) {
1957 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 1958 goto e_hostunreach;
1da177e4
LT
1959 goto no_route;
1960 }
1961 free_res = 1;
1962
1963 RT_CACHE_STAT_INC(in_slow_tot);
1964
1965 if (res.type == RTN_BROADCAST)
1966 goto brd_input;
1967
1968 if (res.type == RTN_LOCAL) {
1969 int result;
1970 result = fib_validate_source(saddr, daddr, tos,
1971 loopback_dev.ifindex,
1972 dev, &spec_dst, &itag);
1973 if (result < 0)
1974 goto martian_source;
1975 if (result)
1976 flags |= RTCF_DIRECTSRC;
1977 spec_dst = daddr;
1978 goto local_input;
1979 }
1980
1981 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 1982 goto e_hostunreach;
1da177e4
LT
1983 if (res.type != RTN_UNICAST)
1984 goto martian_destination;
1985
1986 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1987 if (err == -ENOBUFS)
1988 goto e_nobufs;
1989 if (err == -EINVAL)
1990 goto e_inval;
1991
1992done:
1993 in_dev_put(in_dev);
1994 if (free_res)
1995 fib_res_put(&res);
1996out: return err;
1997
1998brd_input:
1999 if (skb->protocol != htons(ETH_P_IP))
2000 goto e_inval;
2001
2002 if (ZERONET(saddr))
2003 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2004 else {
2005 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2006 &itag);
2007 if (err < 0)
2008 goto martian_source;
2009 if (err)
2010 flags |= RTCF_DIRECTSRC;
2011 }
2012 flags |= RTCF_BROADCAST;
2013 res.type = RTN_BROADCAST;
2014 RT_CACHE_STAT_INC(in_brd);
2015
2016local_input:
2017 rth = dst_alloc(&ipv4_dst_ops);
2018 if (!rth)
2019 goto e_nobufs;
2020
2021 rth->u.dst.output= ip_rt_bug;
2022
2023 atomic_set(&rth->u.dst.__refcnt, 1);
2024 rth->u.dst.flags= DST_HOST;
2025 if (in_dev->cnf.no_policy)
2026 rth->u.dst.flags |= DST_NOPOLICY;
2027 rth->fl.fl4_dst = daddr;
2028 rth->rt_dst = daddr;
2029 rth->fl.fl4_tos = tos;
47dcf0cb 2030 rth->fl.mark = skb->mark;
1da177e4
LT
2031 rth->fl.fl4_src = saddr;
2032 rth->rt_src = saddr;
2033#ifdef CONFIG_NET_CLS_ROUTE
2034 rth->u.dst.tclassid = itag;
2035#endif
2036 rth->rt_iif =
2037 rth->fl.iif = dev->ifindex;
2038 rth->u.dst.dev = &loopback_dev;
2039 dev_hold(rth->u.dst.dev);
2040 rth->idev = in_dev_get(rth->u.dst.dev);
2041 rth->rt_gateway = daddr;
2042 rth->rt_spec_dst= spec_dst;
2043 rth->u.dst.input= ip_local_deliver;
2044 rth->rt_flags = flags|RTCF_LOCAL;
2045 if (res.type == RTN_UNREACHABLE) {
2046 rth->u.dst.input= ip_error;
2047 rth->u.dst.error= -err;
2048 rth->rt_flags &= ~RTCF_LOCAL;
2049 }
2050 rth->rt_type = res.type;
8c7bc840 2051 hash = rt_hash(daddr, saddr, fl.iif);
1da177e4
LT
2052 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2053 goto done;
2054
2055no_route:
2056 RT_CACHE_STAT_INC(in_no_route);
2057 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2058 res.type = RTN_UNREACHABLE;
2059 goto local_input;
2060
2061 /*
2062 * Do not cache martian addresses: they should be logged (RFC1812)
2063 */
2064martian_destination:
2065 RT_CACHE_STAT_INC(in_martian_dst);
2066#ifdef CONFIG_IP_ROUTE_VERBOSE
2067 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2068 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2069 "%u.%u.%u.%u, dev %s\n",
2070 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2071#endif
2c2910a4
DE
2072
2073e_hostunreach:
2074 err = -EHOSTUNREACH;
2075 goto done;
2076
1da177e4
LT
2077e_inval:
2078 err = -EINVAL;
2079 goto done;
2080
2081e_nobufs:
2082 err = -ENOBUFS;
2083 goto done;
2084
2085martian_source:
2086 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2087 goto e_inval;
2088}
2089
9e12bb22 2090int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
2091 u8 tos, struct net_device *dev)
2092{
2093 struct rtable * rth;
2094 unsigned hash;
2095 int iif = dev->ifindex;
2096
2097 tos &= IPTOS_RT_MASK;
8c7bc840 2098 hash = rt_hash(daddr, saddr, iif);
1da177e4
LT
2099
2100 rcu_read_lock();
2101 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2102 rth = rcu_dereference(rth->u.rt_next)) {
2103 if (rth->fl.fl4_dst == daddr &&
2104 rth->fl.fl4_src == saddr &&
2105 rth->fl.iif == iif &&
2106 rth->fl.oif == 0 &&
47dcf0cb 2107 rth->fl.mark == skb->mark &&
1da177e4
LT
2108 rth->fl.fl4_tos == tos) {
2109 rth->u.dst.lastuse = jiffies;
2110 dst_hold(&rth->u.dst);
2111 rth->u.dst.__use++;
2112 RT_CACHE_STAT_INC(in_hit);
2113 rcu_read_unlock();
2114 skb->dst = (struct dst_entry*)rth;
2115 return 0;
2116 }
2117 RT_CACHE_STAT_INC(in_hlist_search);
2118 }
2119 rcu_read_unlock();
2120
2121 /* Multicast recognition logic is moved from route cache to here.
2122 The problem was that too many Ethernet cards have broken/missing
2123 hardware multicast filters :-( As result the host on multicasting
2124 network acquires a lot of useless route cache entries, sort of
2125 SDR messages from all the world. Now we try to get rid of them.
2126 Really, provided software IP multicast filter is organized
2127 reasonably (at least, hashed), it does not result in a slowdown
2128 comparing with route cache reject entries.
2129 Note, that multicast routers are not affected, because
2130 route cache entry is created eventually.
2131 */
2132 if (MULTICAST(daddr)) {
2133 struct in_device *in_dev;
2134
2135 rcu_read_lock();
e5ed6399 2136 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
1da177e4
LT
2137 int our = ip_check_mc(in_dev, daddr, saddr,
2138 skb->nh.iph->protocol);
2139 if (our
2140#ifdef CONFIG_IP_MROUTE
2141 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2142#endif
2143 ) {
2144 rcu_read_unlock();
2145 return ip_route_input_mc(skb, daddr, saddr,
2146 tos, dev, our);
2147 }
2148 }
2149 rcu_read_unlock();
2150 return -EINVAL;
2151 }
2152 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2153}
2154
2155static inline int __mkroute_output(struct rtable **result,
2156 struct fib_result* res,
2157 const struct flowi *fl,
2158 const struct flowi *oldflp,
2159 struct net_device *dev_out,
2160 unsigned flags)
2161{
2162 struct rtable *rth;
2163 struct in_device *in_dev;
2164 u32 tos = RT_FL_TOS(oldflp);
2165 int err = 0;
2166
2167 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2168 return -EINVAL;
2169
e448515c 2170 if (fl->fl4_dst == htonl(0xFFFFFFFF))
1da177e4
LT
2171 res->type = RTN_BROADCAST;
2172 else if (MULTICAST(fl->fl4_dst))
2173 res->type = RTN_MULTICAST;
2174 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2175 return -EINVAL;
2176
2177 if (dev_out->flags & IFF_LOOPBACK)
2178 flags |= RTCF_LOCAL;
2179
2180 /* get work reference to inet device */
2181 in_dev = in_dev_get(dev_out);
2182 if (!in_dev)
2183 return -EINVAL;
2184
2185 if (res->type == RTN_BROADCAST) {
2186 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2187 if (res->fi) {
2188 fib_info_put(res->fi);
2189 res->fi = NULL;
2190 }
2191 } else if (res->type == RTN_MULTICAST) {
2192 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2193 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2194 oldflp->proto))
2195 flags &= ~RTCF_LOCAL;
2196 /* If multicast route do not exist use
2197 default one, but do not gateway in this case.
2198 Yes, it is hack.
2199 */
2200 if (res->fi && res->prefixlen < 4) {
2201 fib_info_put(res->fi);
2202 res->fi = NULL;
2203 }
2204 }
2205
2206
2207 rth = dst_alloc(&ipv4_dst_ops);
2208 if (!rth) {
2209 err = -ENOBUFS;
2210 goto cleanup;
2211 }
2212
ce723d8e 2213 atomic_set(&rth->u.dst.__refcnt, 1);
1da177e4
LT
2214 rth->u.dst.flags= DST_HOST;
2215#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2216 if (res->fi) {
2217 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2218 if (res->fi->fib_nhs > 1)
2219 rth->u.dst.flags |= DST_BALANCED;
2220 }
2221#endif
2222 if (in_dev->cnf.no_xfrm)
2223 rth->u.dst.flags |= DST_NOXFRM;
2224 if (in_dev->cnf.no_policy)
2225 rth->u.dst.flags |= DST_NOPOLICY;
2226
2227 rth->fl.fl4_dst = oldflp->fl4_dst;
2228 rth->fl.fl4_tos = tos;
2229 rth->fl.fl4_src = oldflp->fl4_src;
2230 rth->fl.oif = oldflp->oif;
47dcf0cb 2231 rth->fl.mark = oldflp->mark;
1da177e4
LT
2232 rth->rt_dst = fl->fl4_dst;
2233 rth->rt_src = fl->fl4_src;
2234 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2235 /* get references to the devices that are to be hold by the routing
2236 cache entry */
2237 rth->u.dst.dev = dev_out;
2238 dev_hold(dev_out);
2239 rth->idev = in_dev_get(dev_out);
2240 rth->rt_gateway = fl->fl4_dst;
2241 rth->rt_spec_dst= fl->fl4_src;
2242
2243 rth->u.dst.output=ip_output;
2244
2245 RT_CACHE_STAT_INC(out_slow_tot);
2246
2247 if (flags & RTCF_LOCAL) {
2248 rth->u.dst.input = ip_local_deliver;
2249 rth->rt_spec_dst = fl->fl4_dst;
2250 }
2251 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2252 rth->rt_spec_dst = fl->fl4_src;
2253 if (flags & RTCF_LOCAL &&
2254 !(dev_out->flags & IFF_LOOPBACK)) {
2255 rth->u.dst.output = ip_mc_output;
2256 RT_CACHE_STAT_INC(out_slow_mc);
2257 }
2258#ifdef CONFIG_IP_MROUTE
2259 if (res->type == RTN_MULTICAST) {
2260 if (IN_DEV_MFORWARD(in_dev) &&
2261 !LOCAL_MCAST(oldflp->fl4_dst)) {
2262 rth->u.dst.input = ip_mr_input;
2263 rth->u.dst.output = ip_mc_output;
2264 }
2265 }
2266#endif
2267 }
2268
2269 rt_set_nexthop(rth, res, 0);
2270
2271 rth->rt_flags = flags;
2272
2273 *result = rth;
2274 cleanup:
2275 /* release work reference to inet device */
2276 in_dev_put(in_dev);
2277
2278 return err;
2279}
2280
2281static inline int ip_mkroute_output_def(struct rtable **rp,
2282 struct fib_result* res,
2283 const struct flowi *fl,
2284 const struct flowi *oldflp,
2285 struct net_device *dev_out,
2286 unsigned flags)
2287{
7abaa27c 2288 struct rtable *rth = NULL;
1da177e4
LT
2289 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2290 unsigned hash;
2291 if (err == 0) {
8c7bc840 2292 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
1da177e4
LT
2293 err = rt_intern_hash(hash, rth, rp);
2294 }
2295
2296 return err;
2297}
2298
/*
 * Create the output route cache entry (or entries) for a resolved flow.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED, or when the fib result does not
 * carry more than one nexthop, this is just ip_mkroute_output_def().
 *
 * Otherwise one cache entry is built per nexthop: res->nh_sel is stepped
 * through every hop, an entry is created for the hop's device, inserted
 * with rt_intern_hash(), and the hop description is forwarded to the
 * multipath implementation.  The reference returned through *rp for the
 * previous hop is dropped before the next one is created, so the caller
 * ends up holding only the entry of the last hop.
 *
 * Returns 0 on success or the first error encountered.
 */
static inline int ip_mkroute_output(struct rtable** rp,
				    struct fib_result* res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			/* put reference to previous result */
			if (hop)
				ip_rt_put(*rp);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);

			if (err != 0)
				goto cleanup;

			hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
				       oldflp->oif);
			err = rt_intern_hash(hash, rth, rp);

			/*
			 * A failed insert means this entry did not make it
			 * into the cache; do not register nexthop info for
			 * it, just unwind and report the error.
			 * NOTE(review): rth's lifetime after a failed
			 * rt_intern_hash() is decided there - confirm.
			 */
			if (err != 0)
				goto cleanup;

			/* forward hop information to multipath impl. */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
2360
2361/*
2362 * Major route resolver routine.
2363 */
2364
2365static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2366{
2367 u32 tos = RT_FL_TOS(oldflp);
2368 struct flowi fl = { .nl_u = { .ip4_u =
2369 { .daddr = oldflp->fl4_dst,
2370 .saddr = oldflp->fl4_src,
2371 .tos = tos & IPTOS_RT_MASK,
2372 .scope = ((tos & RTO_ONLINK) ?
2373 RT_SCOPE_LINK :
2374 RT_SCOPE_UNIVERSE),
1da177e4 2375 } },
47dcf0cb 2376 .mark = oldflp->mark,
1da177e4
LT
2377 .iif = loopback_dev.ifindex,
2378 .oif = oldflp->oif };
2379 struct fib_result res;
2380 unsigned flags = 0;
2381 struct net_device *dev_out = NULL;
2382 int free_res = 0;
2383 int err;
2384
2385
2386 res.fi = NULL;
2387#ifdef CONFIG_IP_MULTIPLE_TABLES
2388 res.r = NULL;
2389#endif
2390
2391 if (oldflp->fl4_src) {
2392 err = -EINVAL;
2393 if (MULTICAST(oldflp->fl4_src) ||
2394 BADCLASS(oldflp->fl4_src) ||
2395 ZERONET(oldflp->fl4_src))
2396 goto out;
2397
2398 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2399 dev_out = ip_dev_find(oldflp->fl4_src);
2400 if (dev_out == NULL)
2401 goto out;
2402
2403 /* I removed check for oif == dev_out->oif here.
2404 It was wrong for two reasons:
2405 1. ip_dev_find(saddr) can return wrong iface, if saddr is
2406 assigned to multiple interfaces.
2407 2. Moreover, we are allowed to send packets with saddr
2408 of another iface. --ANK
2409 */
2410
2411 if (oldflp->oif == 0
e448515c 2412 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
1da177e4
LT
2413 /* Special hack: user can direct multicasts
2414 and limited broadcast via necessary interface
2415 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2416 This hack is not just for fun, it allows
2417 vic,vat and friends to work.
2418 They bind socket to loopback, set ttl to zero
2419 and expect that it will work.
2420 From the viewpoint of routing cache they are broken,
2421 because we are not allowed to build multicast path
2422 with loopback source addr (look, routing cache
2423 cannot know, that ttl is zero, so that packet
2424 will not leave this host and route is valid).
2425 Luckily, this hack is good workaround.
2426 */
2427
2428 fl.oif = dev_out->ifindex;
2429 goto make_route;
2430 }
2431 if (dev_out)
2432 dev_put(dev_out);
2433 dev_out = NULL;
2434 }
2435
2436
2437 if (oldflp->oif) {
2438 dev_out = dev_get_by_index(oldflp->oif);
2439 err = -ENODEV;
2440 if (dev_out == NULL)
2441 goto out;
e5ed6399
HX
2442
2443 /* RACE: Check return value of inet_select_addr instead. */
2444 if (__in_dev_get_rtnl(dev_out) == NULL) {
1da177e4
LT
2445 dev_put(dev_out);
2446 goto out; /* Wrong error code */
2447 }
2448
e448515c 2449 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
1da177e4
LT
2450 if (!fl.fl4_src)
2451 fl.fl4_src = inet_select_addr(dev_out, 0,
2452 RT_SCOPE_LINK);
2453 goto make_route;
2454 }
2455 if (!fl.fl4_src) {
2456 if (MULTICAST(oldflp->fl4_dst))
2457 fl.fl4_src = inet_select_addr(dev_out, 0,
2458 fl.fl4_scope);
2459 else if (!oldflp->fl4_dst)
2460 fl.fl4_src = inet_select_addr(dev_out, 0,
2461 RT_SCOPE_HOST);
2462 }
2463 }
2464
2465 if (!fl.fl4_dst) {
2466 fl.fl4_dst = fl.fl4_src;
2467 if (!fl.fl4_dst)
2468 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2469 if (dev_out)
2470 dev_put(dev_out);
2471 dev_out = &loopback_dev;
2472 dev_hold(dev_out);
2473 fl.oif = loopback_dev.ifindex;
2474 res.type = RTN_LOCAL;
2475 flags |= RTCF_LOCAL;
2476 goto make_route;
2477 }
2478
2479 if (fib_lookup(&fl, &res)) {
2480 res.fi = NULL;
2481 if (oldflp->oif) {
2482 /* Apparently, routing tables are wrong. Assume,
2483 that the destination is on link.
2484
2485 WHY? DW.
2486 Because we are allowed to send to iface
2487 even if it has NO routes and NO assigned
2488 addresses. When oif is specified, routing
2489 tables are looked up with only one purpose:
2490 to catch if destination is gatewayed, rather than
2491 direct. Moreover, if MSG_DONTROUTE is set,
2492 we send packet, ignoring both routing tables
2493 and ifaddr state. --ANK
2494
2495
2496 We could make it even if oif is unknown,
2497 likely IPv6, but we do not.
2498 */
2499
2500 if (fl.fl4_src == 0)
2501 fl.fl4_src = inet_select_addr(dev_out, 0,
2502 RT_SCOPE_LINK);
2503 res.type = RTN_UNICAST;
2504 goto make_route;
2505 }
2506 if (dev_out)
2507 dev_put(dev_out);
2508 err = -ENETUNREACH;
2509 goto out;
2510 }
2511 free_res = 1;
2512
2513 if (res.type == RTN_LOCAL) {
2514 if (!fl.fl4_src)
2515 fl.fl4_src = fl.fl4_dst;
2516 if (dev_out)
2517 dev_put(dev_out);
2518 dev_out = &loopback_dev;
2519 dev_hold(dev_out);
2520 fl.oif = dev_out->ifindex;
2521 if (res.fi)
2522 fib_info_put(res.fi);
2523 res.fi = NULL;
2524 flags |= RTCF_LOCAL;
2525 goto make_route;
2526 }
2527
2528#ifdef CONFIG_IP_ROUTE_MULTIPATH
2529 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2530 fib_select_multipath(&fl, &res);
2531 else
2532#endif
2533 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2534 fib_select_default(&fl, &res);
2535
2536 if (!fl.fl4_src)
2537 fl.fl4_src = FIB_RES_PREFSRC(res);
2538
2539 if (dev_out)
2540 dev_put(dev_out);
2541 dev_out = FIB_RES_DEV(res);
2542 dev_hold(dev_out);
2543 fl.oif = dev_out->ifindex;
2544
2545
2546make_route:
2547 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2548
2549
2550 if (free_res)
2551 fib_res_put(&res);
2552 if (dev_out)
2553 dev_put(dev_out);
2554out: return err;
2555}
2556
2557int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2558{
2559 unsigned hash;
2560 struct rtable *rth;
2561
8c7bc840 2562 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
1da177e4
LT
2563
2564 rcu_read_lock_bh();
2565 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2566 rth = rcu_dereference(rth->u.rt_next)) {
2567 if (rth->fl.fl4_dst == flp->fl4_dst &&
2568 rth->fl.fl4_src == flp->fl4_src &&
2569 rth->fl.iif == 0 &&
2570 rth->fl.oif == flp->oif &&
47dcf0cb 2571 rth->fl.mark == flp->mark &&
1da177e4
LT
2572 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2573 (IPTOS_RT_MASK | RTO_ONLINK))) {
2574
2575 /* check for multipath routes and choose one if
2576 * necessary
2577 */
2578 if (multipath_select_route(flp, rth, rp)) {
2579 dst_hold(&(*rp)->u.dst);
2580 RT_CACHE_STAT_INC(out_hit);
2581 rcu_read_unlock_bh();
2582 return 0;
2583 }
2584
2585 rth->u.dst.lastuse = jiffies;
2586 dst_hold(&rth->u.dst);
2587 rth->u.dst.__use++;
2588 RT_CACHE_STAT_INC(out_hit);
2589 rcu_read_unlock_bh();
2590 *rp = rth;
2591 return 0;
2592 }
2593 RT_CACHE_STAT_INC(out_hlist_search);
2594 }
2595 rcu_read_unlock_bh();
2596
2597 return ip_route_output_slow(rp, flp);
2598}
2599
d8c97a94
ACM
2600EXPORT_SYMBOL_GPL(__ip_route_output_key);
2601
1da177e4
LT
2602int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2603{
2604 int err;
2605
2606 if ((err = __ip_route_output_key(rp, flp)) != 0)
2607 return err;
2608
2609 if (flp->proto) {
2610 if (!flp->fl4_src)
2611 flp->fl4_src = (*rp)->rt_src;
2612 if (!flp->fl4_dst)
2613 flp->fl4_dst = (*rp)->rt_dst;
2614 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2615 }
2616
2617 return 0;
2618}
2619
d8c97a94
ACM
2620EXPORT_SYMBOL_GPL(ip_route_output_flow);
2621
1da177e4
LT
2622int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2623{
2624 return ip_route_output_flow(rp, flp, NULL, 0);
2625}
2626
2627static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
b6544c0b 2628 int nowait, unsigned int flags)
1da177e4
LT
2629{
2630 struct rtable *rt = (struct rtable*)skb->dst;
2631 struct rtmsg *r;
be403ea1 2632 struct nlmsghdr *nlh;
e3703b3d
TG
2633 long expires;
2634 u32 id = 0, ts = 0, tsage = 0, error;
be403ea1
TG
2635
2636 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2637 if (nlh == NULL)
2638 return -ENOBUFS;
2639
2640 r = nlmsg_data(nlh);
1da177e4
LT
2641 r->rtm_family = AF_INET;
2642 r->rtm_dst_len = 32;
2643 r->rtm_src_len = 0;
2644 r->rtm_tos = rt->fl.fl4_tos;
2645 r->rtm_table = RT_TABLE_MAIN;
be403ea1 2646 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
1da177e4
LT
2647 r->rtm_type = rt->rt_type;
2648 r->rtm_scope = RT_SCOPE_UNIVERSE;
2649 r->rtm_protocol = RTPROT_UNSPEC;
2650 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2651 if (rt->rt_flags & RTCF_NOTIFY)
2652 r->rtm_flags |= RTM_F_NOTIFY;
be403ea1 2653
17fb2c64 2654 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
be403ea1 2655
1da177e4
LT
2656 if (rt->fl.fl4_src) {
2657 r->rtm_src_len = 32;
17fb2c64 2658 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
1da177e4
LT
2659 }
2660 if (rt->u.dst.dev)
be403ea1 2661 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
1da177e4
LT
2662#ifdef CONFIG_NET_CLS_ROUTE
2663 if (rt->u.dst.tclassid)
be403ea1 2664 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
1da177e4
LT
2665#endif
2666#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
be403ea1
TG
2667 if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
2668 NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
1da177e4
LT
2669#endif
2670 if (rt->fl.iif)
17fb2c64 2671 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
1da177e4 2672 else if (rt->rt_src != rt->fl.fl4_src)
17fb2c64 2673 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
be403ea1 2674
1da177e4 2675 if (rt->rt_dst != rt->rt_gateway)
17fb2c64 2676 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
be403ea1 2677
1da177e4 2678 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
be403ea1
TG
2679 goto nla_put_failure;
2680
e3703b3d
TG
2681 error = rt->u.dst.error;
2682 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
1da177e4 2683 if (rt->peer) {
e3703b3d 2684 id = rt->peer->ip_id_count;
1da177e4 2685 if (rt->peer->tcp_ts_stamp) {
e3703b3d
TG
2686 ts = rt->peer->tcp_ts;
2687 tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
1da177e4
LT
2688 }
2689 }
be403ea1 2690
1da177e4
LT
2691 if (rt->fl.iif) {
2692#ifdef CONFIG_IP_MROUTE
e448515c 2693 __be32 dst = rt->rt_dst;
1da177e4
LT
2694
2695 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2696 ipv4_devconf.mc_forwarding) {
2697 int err = ipmr_get_route(skb, r, nowait);
2698 if (err <= 0) {
2699 if (!nowait) {
2700 if (err == 0)
2701 return 0;
be403ea1 2702 goto nla_put_failure;
1da177e4
LT
2703 } else {
2704 if (err == -EMSGSIZE)
be403ea1 2705 goto nla_put_failure;
e3703b3d 2706 error = err;
1da177e4
LT
2707 }
2708 }
2709 } else
2710#endif
be403ea1 2711 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
1da177e4
LT
2712 }
2713
e3703b3d
TG
2714 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2715 expires, error) < 0)
2716 goto nla_put_failure;
be403ea1
TG
2717
2718 return nlmsg_end(skb, nlh);
1da177e4 2719
be403ea1
TG
2720nla_put_failure:
2721 return nlmsg_cancel(skb, nlh);
1da177e4
LT
2722}
2723
2724int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2725{
d889ce3b
TG
2726 struct rtmsg *rtm;
2727 struct nlattr *tb[RTA_MAX+1];
1da177e4 2728 struct rtable *rt = NULL;
9e12bb22
AV
2729 __be32 dst = 0;
2730 __be32 src = 0;
2731 u32 iif;
d889ce3b 2732 int err;
1da177e4
LT
2733 struct sk_buff *skb;
2734
d889ce3b
TG
2735 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2736 if (err < 0)
2737 goto errout;
2738
2739 rtm = nlmsg_data(nlh);
2740
1da177e4 2741 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
d889ce3b
TG
2742 if (skb == NULL) {
2743 err = -ENOBUFS;
2744 goto errout;
2745 }
1da177e4
LT
2746
2747 /* Reserve room for dummy headers, this skb can pass
2748 through good chunk of routing engine.
2749 */
d2c962b8
SH
2750 skb->mac.raw = skb->nh.raw = skb->data;
2751
2752 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2753 skb->nh.iph->protocol = IPPROTO_ICMP;
1da177e4
LT
2754 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2755
17fb2c64
AV
2756 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2757 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
d889ce3b 2758 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
1da177e4
LT
2759
2760 if (iif) {
d889ce3b
TG
2761 struct net_device *dev;
2762
2763 dev = __dev_get_by_index(iif);
2764 if (dev == NULL) {
2765 err = -ENODEV;
2766 goto errout_free;
2767 }
2768
1da177e4
LT
2769 skb->protocol = htons(ETH_P_IP);
2770 skb->dev = dev;
2771 local_bh_disable();
2772 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2773 local_bh_enable();
d889ce3b
TG
2774
2775 rt = (struct rtable*) skb->dst;
2776 if (err == 0 && rt->u.dst.error)
1da177e4
LT
2777 err = -rt->u.dst.error;
2778 } else {
d889ce3b
TG
2779 struct flowi fl = {
2780 .nl_u = {
2781 .ip4_u = {
2782 .daddr = dst,
2783 .saddr = src,
2784 .tos = rtm->rtm_tos,
2785 },
2786 },
2787 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2788 };
1da177e4
LT
2789 err = ip_route_output_key(&rt, &fl);
2790 }
d889ce3b 2791
1da177e4 2792 if (err)
d889ce3b 2793 goto errout_free;
1da177e4
LT
2794
2795 skb->dst = &rt->u.dst;
2796 if (rtm->rtm_flags & RTM_F_NOTIFY)
2797 rt->rt_flags |= RTCF_NOTIFY;
2798
1da177e4 2799 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
b6544c0b 2800 RTM_NEWROUTE, 0, 0);
d889ce3b
TG
2801 if (err <= 0)
2802 goto errout_free;
1da177e4 2803
2942e900 2804 err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
d889ce3b 2805errout:
2942e900 2806 return err;
1da177e4 2807
d889ce3b 2808errout_free:
1da177e4 2809 kfree_skb(skb);
d889ce3b 2810 goto errout;
1da177e4
LT
2811}
2812
2813int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2814{
2815 struct rtable *rt;
2816 int h, s_h;
2817 int idx, s_idx;
2818
2819 s_h = cb->args[0];
2820 s_idx = idx = cb->args[1];
2821 for (h = 0; h <= rt_hash_mask; h++) {
2822 if (h < s_h) continue;
2823 if (h > s_h)
2824 s_idx = 0;
2825 rcu_read_lock_bh();
2826 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2827 rt = rcu_dereference(rt->u.rt_next), idx++) {
2828 if (idx < s_idx)
2829 continue;
2830 skb->dst = dst_clone(&rt->u.dst);
2831 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
b6544c0b
JHS
2832 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2833 1, NLM_F_MULTI) <= 0) {
1da177e4
LT
2834 dst_release(xchg(&skb->dst, NULL));
2835 rcu_read_unlock_bh();
2836 goto done;
2837 }
2838 dst_release(xchg(&skb->dst, NULL));
2839 }
2840 rcu_read_unlock_bh();
2841 }
2842
2843done:
2844 cb->args[0] = h;
2845 cb->args[1] = idx;
2846 return skb->len;
2847}
2848
/*
 * Multicast event hook: requests a route cache flush with a delay
 * argument of 0.  The in_dev argument is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	const int delay = 0;

	rt_cache_flush(delay);
}
2853
2854#ifdef CONFIG_SYSCTL
2855static int flush_delay;
2856
2857static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2858 struct file *filp, void __user *buffer,
2859 size_t *lenp, loff_t *ppos)
2860{
2861 if (write) {
2862 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2863 rt_cache_flush(flush_delay);
2864 return 0;
2865 }
2866
2867 return -EINVAL;
2868}
2869
2870static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2871 int __user *name,
2872 int nlen,
2873 void __user *oldval,
2874 size_t __user *oldlenp,
2875 void __user *newval,
1f29bcd7 2876 size_t newlen)
1da177e4
LT
2877{
2878 int delay;
2879 if (newlen != sizeof(int))
2880 return -EINVAL;
2881 if (get_user(delay, (int __user *)newval))
2882 return -EFAULT;
2883 rt_cache_flush(delay);
2884 return 0;
2885}
2886
2887ctl_table ipv4_route_table[] = {
2888 {
2889 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2890 .procname = "flush",
2891 .data = &flush_delay,
2892 .maxlen = sizeof(int),
7e3e0360 2893 .mode = 0200,
1da177e4
LT
2894 .proc_handler = &ipv4_sysctl_rtcache_flush,
2895 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2896 },
2897 {
2898 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2899 .procname = "min_delay",
2900 .data = &ip_rt_min_delay,
2901 .maxlen = sizeof(int),
2902 .mode = 0644,
2903 .proc_handler = &proc_dointvec_jiffies,
2904 .strategy = &sysctl_jiffies,
2905 },
2906 {
2907 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2908 .procname = "max_delay",
2909 .data = &ip_rt_max_delay,
2910 .maxlen = sizeof(int),
2911 .mode = 0644,
2912 .proc_handler = &proc_dointvec_jiffies,
2913 .strategy = &sysctl_jiffies,
2914 },
2915 {
2916 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2917 .procname = "gc_thresh",
2918 .data = &ipv4_dst_ops.gc_thresh,
2919 .maxlen = sizeof(int),
2920 .mode = 0644,
2921 .proc_handler = &proc_dointvec,
2922 },
2923 {
2924 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2925 .procname = "max_size",
2926 .data = &ip_rt_max_size,
2927 .maxlen = sizeof(int),
2928 .mode = 0644,
2929 .proc_handler = &proc_dointvec,
2930 },
2931 {
2932 /* Deprecated. Use gc_min_interval_ms */
2933
2934 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2935 .procname = "gc_min_interval",
2936 .data = &ip_rt_gc_min_interval,
2937 .maxlen = sizeof(int),
2938 .mode = 0644,
2939 .proc_handler = &proc_dointvec_jiffies,
2940 .strategy = &sysctl_jiffies,
2941 },
2942 {
2943 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2944 .procname = "gc_min_interval_ms",
2945 .data = &ip_rt_gc_min_interval,
2946 .maxlen = sizeof(int),
2947 .mode = 0644,
2948 .proc_handler = &proc_dointvec_ms_jiffies,
2949 .strategy = &sysctl_ms_jiffies,
2950 },
2951 {
2952 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2953 .procname = "gc_timeout",
2954 .data = &ip_rt_gc_timeout,
2955 .maxlen = sizeof(int),
2956 .mode = 0644,
2957 .proc_handler = &proc_dointvec_jiffies,
2958 .strategy = &sysctl_jiffies,
2959 },
2960 {
2961 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2962 .procname = "gc_interval",
2963 .data = &ip_rt_gc_interval,
2964 .maxlen = sizeof(int),
2965 .mode = 0644,
2966 .proc_handler = &proc_dointvec_jiffies,
2967 .strategy = &sysctl_jiffies,
2968 },
2969 {
2970 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2971 .procname = "redirect_load",
2972 .data = &ip_rt_redirect_load,
2973 .maxlen = sizeof(int),
2974 .mode = 0644,
2975 .proc_handler = &proc_dointvec,
2976 },
2977 {
2978 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2979 .procname = "redirect_number",
2980 .data = &ip_rt_redirect_number,
2981 .maxlen = sizeof(int),
2982 .mode = 0644,
2983 .proc_handler = &proc_dointvec,
2984 },
2985 {
2986 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2987 .procname = "redirect_silence",
2988 .data = &ip_rt_redirect_silence,
2989 .maxlen = sizeof(int),
2990 .mode = 0644,
2991 .proc_handler = &proc_dointvec,
2992 },
2993 {
2994 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2995 .procname = "error_cost",
2996 .data = &ip_rt_error_cost,
2997 .maxlen = sizeof(int),
2998 .mode = 0644,
2999 .proc_handler = &proc_dointvec,
3000 },
3001 {
3002 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
3003 .procname = "error_burst",
3004 .data = &ip_rt_error_burst,
3005 .maxlen = sizeof(int),
3006 .mode = 0644,
3007 .proc_handler = &proc_dointvec,
3008 },
3009 {
3010 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3011 .procname = "gc_elasticity",
3012 .data = &ip_rt_gc_elasticity,
3013 .maxlen = sizeof(int),
3014 .mode = 0644,
3015 .proc_handler = &proc_dointvec,
3016 },
3017 {
3018 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3019 .procname = "mtu_expires",
3020 .data = &ip_rt_mtu_expires,
3021 .maxlen = sizeof(int),
3022 .mode = 0644,
3023 .proc_handler = &proc_dointvec_jiffies,
3024 .strategy = &sysctl_jiffies,
3025 },
3026 {
3027 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3028 .procname = "min_pmtu",
3029 .data = &ip_rt_min_pmtu,
3030 .maxlen = sizeof(int),
3031 .mode = 0644,
3032 .proc_handler = &proc_dointvec,
3033 },
3034 {
3035 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3036 .procname = "min_adv_mss",
3037 .data = &ip_rt_min_advmss,
3038 .maxlen = sizeof(int),
3039 .mode = 0644,
3040 .proc_handler = &proc_dointvec,
3041 },
3042 {
3043 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3044 .procname = "secret_interval",
3045 .data = &ip_rt_secret_interval,
3046 .maxlen = sizeof(int),
3047 .mode = 0644,
3048 .proc_handler = &proc_dointvec_jiffies,
3049 .strategy = &sysctl_jiffies,
3050 },
3051 { .ctl_name = 0 }
3052};
3053#endif
3054
3055#ifdef CONFIG_NET_CLS_ROUTE
3056struct ip_rt_acct *ip_rt_acct;
3057
3058/* This code sucks. But you should have seen it before! --RR */
3059
/*
 * IP route accounting ptr for this logical cpu number: each cpu owns a
 * run of 256 entries in ip_rt_acct.  The argument is parenthesized so that
 * an expression such as IP_RT_ACCT_CPU(a + b) scales the whole index.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3062
3063#ifdef CONFIG_PROC_FS
3064static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3065 int length, int *eof, void *data)
3066{
3067 unsigned int i;
3068
3069 if ((offset & 3) || (length & 3))
3070 return -EIO;
3071
3072 if (offset >= sizeof(struct ip_rt_acct) * 256) {
3073 *eof = 1;
3074 return 0;
3075 }
3076
3077 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3078 length = sizeof(struct ip_rt_acct) * 256 - offset;
3079 *eof = 1;
3080 }
3081
3082 offset /= sizeof(u32);
3083
3084 if (length > 0) {
3085 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3086 u32 *dst = (u32 *) buffer;
3087
3088 /* Copy first cpu. */
3089 *start = buffer;
3090 memcpy(dst, src, length);
3091
3092 /* Add the other cpus in, one int at a time */
6f912042 3093 for_each_possible_cpu(i) {
1da177e4
LT
3094 unsigned int j;
3095
3096 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3097
3098 for (j = 0; j < length/4; j++)
3099 dst[j] += src[j];
3100 }
3101 }
3102 return length;
3103}
3104#endif /* CONFIG_PROC_FS */
3105#endif /* CONFIG_NET_CLS_ROUTE */
3106
3107static __initdata unsigned long rhash_entries;
3108static int __init set_rhash_entries(char *str)
3109{
3110 if (!str)
3111 return 0;
3112 rhash_entries = simple_strtoul(str, &str, 0);
3113 return 1;
3114}
3115__setup("rhash_entries=", set_rhash_entries);
3116
3117int __init ip_rt_init(void)
3118{
424c4b70 3119 int rc = 0;
1da177e4
LT
3120
3121 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3122 (jiffies ^ (jiffies >> 7)));
3123
3124#ifdef CONFIG_NET_CLS_ROUTE
424c4b70
ED
3125 {
3126 int order;
1da177e4
LT
3127 for (order = 0;
3128 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3129 /* NOTHING */;
3130 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3131 if (!ip_rt_acct)
3132 panic("IP: failed to allocate ip_rt_acct\n");
3133 memset(ip_rt_acct, 0, PAGE_SIZE << order);
424c4b70 3134 }
1da177e4
LT
3135#endif
3136
e5d679f3
AD
3137 ipv4_dst_ops.kmem_cachep =
3138 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3139 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
1da177e4 3140
424c4b70
ED
3141 rt_hash_table = (struct rt_hash_bucket *)
3142 alloc_large_system_hash("IP route cache",
3143 sizeof(struct rt_hash_bucket),
3144 rhash_entries,
3145 (num_physpages >= 128 * 1024) ?
18955cfc 3146 15 : 17,
8d1502de 3147 0,
424c4b70
ED
3148 &rt_hash_log,
3149 &rt_hash_mask,
3150 0);
22c047cc
ED
3151 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3152 rt_hash_lock_init();
1da177e4
LT
3153
3154 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3155 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3156
1da177e4
LT
3157 devinet_init();
3158 ip_fib_init();
3159
3160 init_timer(&rt_flush_timer);
3161 rt_flush_timer.function = rt_run_flush;
3162 init_timer(&rt_periodic_timer);
3163 rt_periodic_timer.function = rt_check_expire;
3164 init_timer(&rt_secret_timer);
3165 rt_secret_timer.function = rt_secret_rebuild;
3166
3167 /* All the timers, started at system startup tend
3168 to synchronize. Perturb it a bit.
3169 */
3170 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3171 ip_rt_gc_interval;
3172 add_timer(&rt_periodic_timer);
3173
3174 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3175 ip_rt_secret_interval;
3176 add_timer(&rt_secret_timer);
3177
3178#ifdef CONFIG_PROC_FS
3179 {
3180 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3181 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3182 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3183 proc_net_stat))) {
1da177e4
LT
3184 return -ENOMEM;
3185 }
3186 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3187 }
3188#ifdef CONFIG_NET_CLS_ROUTE
3189 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3190#endif
3191#endif
3192#ifdef CONFIG_XFRM
3193 xfrm_init();
3194 xfrm4_init();
3195#endif
3196 return rc;
3197}
3198
3199EXPORT_SYMBOL(__ip_select_ident);
3200EXPORT_SYMBOL(ip_route_input);
3201EXPORT_SYMBOL(ip_route_output_key);