net/ipv4/route.c
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9 *
 10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 *
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
41 *
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
 57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
58 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
60 *
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
65 */
66
67#include <linux/module.h>
68#include <asm/uaccess.h>
69#include <asm/system.h>
70#include <linux/bitops.h>
71#include <linux/types.h>
72#include <linux/kernel.h>
73#include <linux/sched.h>
74#include <linux/mm.h>
 75#include <linux/bootmem.h>
76#include <linux/string.h>
77#include <linux/socket.h>
78#include <linux/sockios.h>
79#include <linux/errno.h>
80#include <linux/in.h>
81#include <linux/inet.h>
82#include <linux/netdevice.h>
83#include <linux/proc_fs.h>
84#include <linux/init.h>
85#include <linux/skbuff.h>
86#include <linux/rtnetlink.h>
87#include <linux/inetdevice.h>
88#include <linux/igmp.h>
89#include <linux/pkt_sched.h>
90#include <linux/mroute.h>
91#include <linux/netfilter_ipv4.h>
92#include <linux/random.h>
93#include <linux/jhash.h>
94#include <linux/rcupdate.h>
95#include <linux/times.h>
96#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
106#include <net/ip_mp_alg.h>
107#ifdef CONFIG_SYSCTL
108#include <linux/sysctl.h>
109#endif
110
111#define RT_FL_TOS(oldflp) \
112 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113
114#define IP_MAX_MTU 0xFFF0
115
116#define RT_GC_TIMEOUT (300*HZ)
117
118static int ip_rt_min_delay = 2 * HZ;
119static int ip_rt_max_delay = 10 * HZ;
120static int ip_rt_max_size;
121static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
122static int ip_rt_gc_interval = 60 * HZ;
123static int ip_rt_gc_min_interval = HZ / 2;
124static int ip_rt_redirect_number = 9;
125static int ip_rt_redirect_load = HZ / 50;
126static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
127static int ip_rt_error_cost = HZ;
128static int ip_rt_error_burst = 5 * HZ;
129static int ip_rt_gc_elasticity = 8;
130static int ip_rt_mtu_expires = 10 * 60 * HZ;
131static int ip_rt_min_pmtu = 512 + 20 + 20;
132static int ip_rt_min_advmss = 256;
133static int ip_rt_secret_interval = 10 * 60 * HZ;
134static unsigned long rt_deadline;
135
136#define RTprint(a...) printk(KERN_DEBUG a)
137
138static struct timer_list rt_flush_timer;
139static struct timer_list rt_periodic_timer;
140static struct timer_list rt_secret_timer;
141
142/*
143 * Interface to generic destination cache.
144 */
145
146static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
147static void ipv4_dst_destroy(struct dst_entry *dst);
148static void ipv4_dst_ifdown(struct dst_entry *dst,
149 struct net_device *dev, int how);
150static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
151static void ipv4_link_failure(struct sk_buff *skb);
152static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
153static int rt_garbage_collect(void);
154
155
156static struct dst_ops ipv4_dst_ops = {
157 .family = AF_INET,
158 .protocol = __constant_htons(ETH_P_IP),
159 .gc = rt_garbage_collect,
160 .check = ipv4_dst_check,
161 .destroy = ipv4_dst_destroy,
162 .ifdown = ipv4_dst_ifdown,
163 .negative_advice = ipv4_negative_advice,
164 .link_failure = ipv4_link_failure,
165 .update_pmtu = ip_rt_update_pmtu,
166 .entry_size = sizeof(struct rtable),
167};
168
169#define ECN_OR_COST(class) TC_PRIO_##class
170
171__u8 ip_tos2prio[16] = {
172 TC_PRIO_BESTEFFORT,
173 ECN_OR_COST(FILLER),
174 TC_PRIO_BESTEFFORT,
175 ECN_OR_COST(BESTEFFORT),
176 TC_PRIO_BULK,
177 ECN_OR_COST(BULK),
178 TC_PRIO_BULK,
179 ECN_OR_COST(BULK),
180 TC_PRIO_INTERACTIVE,
181 ECN_OR_COST(INTERACTIVE),
182 TC_PRIO_INTERACTIVE,
183 ECN_OR_COST(INTERACTIVE),
184 TC_PRIO_INTERACTIVE_BULK,
185 ECN_OR_COST(INTERACTIVE_BULK),
186 TC_PRIO_INTERACTIVE_BULK,
187 ECN_OR_COST(INTERACTIVE_BULK)
188};
189
190
191/*
192 * Route cache.
193 */
194
 195/* The locking scheme is rather straightforward:
196 *
197 * 1) Read-Copy Update protects the buckets of the central route hash.
198 * 2) Only writers remove entries, and they hold the lock
199 * as they look at rtable reference counts.
200 * 3) Only readers acquire references to rtable entries,
201 * they do so with atomic increments and with the
202 * lock held.
203 */
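/*
 * Editorial sketch, not part of route.c: a condensed user-space analogue of
 * the three rules above.  Readers walk a bucket chain without taking the
 * bucket lock and only bump a per-entry reference count; the writer unlinks
 * entries while holding the lock.  The toy_* names and the pthread/atomic
 * primitives below are assumptions of this sketch -- the kernel uses RCU,
 * dst_hold() and call_rcu_bh() instead, and the plain pointer loads here
 * merely stand in for rcu_dereference().
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct toy_rt {
	struct toy_rt *next;
	atomic_int refcnt;
	unsigned int key;
};

struct toy_bucket {
	struct toy_rt *chain;
	pthread_mutex_t lock;		/* taken by writers only */
};

static struct toy_rt *toy_lookup(struct toy_bucket *b, unsigned int key)
{
	struct toy_rt *r;

	for (r = b->chain; r; r = r->next)	/* read side: no bucket lock */
		if (r->key == key) {
			atomic_fetch_add(&r->refcnt, 1);
			return r;
		}
	return NULL;
}

static void toy_unlink(struct toy_bucket *b, struct toy_rt *victim)
{
	struct toy_rt **rp;

	pthread_mutex_lock(&b->lock);		/* writers serialize on the lock */
	for (rp = &b->chain; *rp; rp = &(*rp)->next)
		if (*rp == victim) {
			*rp = victim->next;
			break;
		}
	pthread_mutex_unlock(&b->lock);
	/* the real code defers the free via call_rcu_bh() until readers drop out */
}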
204
205struct rt_hash_bucket {
206 struct rtable *chain;
207};
208#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
209/*
210 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
211 * The size of this table is a power of two and depends on the number of CPUS.
212 */
213#if NR_CPUS >= 32
214#define RT_HASH_LOCK_SZ 4096
215#elif NR_CPUS >= 16
216#define RT_HASH_LOCK_SZ 2048
217#elif NR_CPUS >= 8
218#define RT_HASH_LOCK_SZ 1024
219#elif NR_CPUS >= 4
220#define RT_HASH_LOCK_SZ 512
221#else
222#define RT_HASH_LOCK_SZ 256
223#endif
224
225static spinlock_t *rt_hash_locks;
226# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
227# define rt_hash_lock_init() { \
228 int i; \
229 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
230 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
231 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
232 spin_lock_init(&rt_hash_locks[i]); \
233 }
234#else
235# define rt_hash_lock_addr(slot) NULL
236# define rt_hash_lock_init()
237#endif
238
239static struct rt_hash_bucket *rt_hash_table;
240static unsigned rt_hash_mask;
241static int rt_hash_log;
242static unsigned int rt_hash_rnd;
243
 244static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 245#define RT_CACHE_STAT_INC(field) \
 246 (__raw_get_cpu_var(rt_cache_stat).field++)
247
248static int rt_intern_hash(unsigned hash, struct rtable *rth,
249 struct rtable **res);
250
 251static unsigned int rt_hash_code(u32 daddr, u32 saddr)
 252{
 253 return (jhash_2words(daddr, saddr, rt_hash_rnd)
254 & rt_hash_mask);
255}
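/*
 * Editorial sketch, not part of route.c: rt_hash_code() mixes the two
 * addresses with a boot-time random secret (rt_hash_rnd) and masks the
 * result down to the table size, which is always a power of two.  The mixer
 * below is only a stand-in for the kernel's jhash_2words(); it shows the
 * shape of the computation, not the real hash.
 */
static inline unsigned int toy_mix(unsigned int a, unsigned int b,
				   unsigned int seed)
{
	unsigned int h = (seed ^ a) * 0x9e3779b1u;

	return (h ^ b) * 0x85ebca6bu;		/* NOT jhash; illustration only */
}

static inline unsigned int toy_hash_code(unsigned int daddr, unsigned int saddr,
					 unsigned int rnd, unsigned int mask)
{
	return toy_mix(daddr, saddr, rnd) & mask;	/* mask == table_size - 1 */
}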
256
257#ifdef CONFIG_PROC_FS
258struct rt_cache_iter_state {
259 int bucket;
260};
261
262static struct rtable *rt_cache_get_first(struct seq_file *seq)
263{
264 struct rtable *r = NULL;
265 struct rt_cache_iter_state *st = seq->private;
266
267 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
268 rcu_read_lock_bh();
269 r = rt_hash_table[st->bucket].chain;
270 if (r)
271 break;
272 rcu_read_unlock_bh();
273 }
274 return r;
275}
276
277static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
278{
279 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
280
281 r = r->u.rt_next;
282 while (!r) {
283 rcu_read_unlock_bh();
284 if (--st->bucket < 0)
285 break;
286 rcu_read_lock_bh();
287 r = rt_hash_table[st->bucket].chain;
288 }
289 return r;
290}
291
292static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
293{
294 struct rtable *r = rt_cache_get_first(seq);
295
296 if (r)
297 while (pos && (r = rt_cache_get_next(seq, r)))
298 --pos;
299 return pos ? NULL : r;
300}
301
302static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
303{
304 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
305}
306
307static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
308{
309 struct rtable *r = NULL;
310
311 if (v == SEQ_START_TOKEN)
312 r = rt_cache_get_first(seq);
313 else
314 r = rt_cache_get_next(seq, v);
315 ++*pos;
316 return r;
317}
318
319static void rt_cache_seq_stop(struct seq_file *seq, void *v)
320{
321 if (v && v != SEQ_START_TOKEN)
322 rcu_read_unlock_bh();
323}
324
325static int rt_cache_seq_show(struct seq_file *seq, void *v)
326{
327 if (v == SEQ_START_TOKEN)
328 seq_printf(seq, "%-127s\n",
329 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
330 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
331 "HHUptod\tSpecDst");
332 else {
333 struct rtable *r = v;
334 char temp[256];
335
336 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
337 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
338 r->u.dst.dev ? r->u.dst.dev->name : "*",
339 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
340 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
341 r->u.dst.__use, 0, (unsigned long)r->rt_src,
342 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
343 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
344 dst_metric(&r->u.dst, RTAX_WINDOW),
345 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
346 dst_metric(&r->u.dst, RTAX_RTTVAR)),
347 r->fl.fl4_tos,
348 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
349 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
350 dev_queue_xmit) : 0,
351 r->rt_spec_dst);
352 seq_printf(seq, "%-127s\n", temp);
353 }
354 return 0;
355}
356
357static struct seq_operations rt_cache_seq_ops = {
358 .start = rt_cache_seq_start,
359 .next = rt_cache_seq_next,
360 .stop = rt_cache_seq_stop,
361 .show = rt_cache_seq_show,
362};
363
364static int rt_cache_seq_open(struct inode *inode, struct file *file)
365{
366 struct seq_file *seq;
367 int rc = -ENOMEM;
368 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
369
370 if (!s)
371 goto out;
372 rc = seq_open(file, &rt_cache_seq_ops);
373 if (rc)
374 goto out_kfree;
375 seq = file->private_data;
376 seq->private = s;
377 memset(s, 0, sizeof(*s));
378out:
379 return rc;
380out_kfree:
381 kfree(s);
382 goto out;
383}
384
385static struct file_operations rt_cache_seq_fops = {
386 .owner = THIS_MODULE,
387 .open = rt_cache_seq_open,
388 .read = seq_read,
389 .llseek = seq_lseek,
390 .release = seq_release_private,
391};
392
393
394static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
395{
396 int cpu;
397
398 if (*pos == 0)
399 return SEQ_START_TOKEN;
400
401 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
402 if (!cpu_possible(cpu))
403 continue;
404 *pos = cpu+1;
 405 return &per_cpu(rt_cache_stat, cpu);
406 }
407 return NULL;
408}
409
410static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
411{
412 int cpu;
413
414 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
415 if (!cpu_possible(cpu))
416 continue;
417 *pos = cpu+1;
 418 return &per_cpu(rt_cache_stat, cpu);
419 }
420 return NULL;
421
422}
423
424static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
425{
426
427}
428
429static int rt_cpu_seq_show(struct seq_file *seq, void *v)
430{
431 struct rt_cache_stat *st = v;
432
433 if (v == SEQ_START_TOKEN) {
 434 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
435 return 0;
436 }
437
438 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
439 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
440 atomic_read(&ipv4_dst_ops.entries),
441 st->in_hit,
442 st->in_slow_tot,
443 st->in_slow_mc,
444 st->in_no_route,
445 st->in_brd,
446 st->in_martian_dst,
447 st->in_martian_src,
448
449 st->out_hit,
450 st->out_slow_tot,
451 st->out_slow_mc,
452
453 st->gc_total,
454 st->gc_ignored,
455 st->gc_goal_miss,
456 st->gc_dst_overflow,
457 st->in_hlist_search,
458 st->out_hlist_search
459 );
460 return 0;
461}
462
463static struct seq_operations rt_cpu_seq_ops = {
464 .start = rt_cpu_seq_start,
465 .next = rt_cpu_seq_next,
466 .stop = rt_cpu_seq_stop,
467 .show = rt_cpu_seq_show,
468};
469
470
471static int rt_cpu_seq_open(struct inode *inode, struct file *file)
472{
473 return seq_open(file, &rt_cpu_seq_ops);
474}
475
476static struct file_operations rt_cpu_seq_fops = {
477 .owner = THIS_MODULE,
478 .open = rt_cpu_seq_open,
479 .read = seq_read,
480 .llseek = seq_lseek,
481 .release = seq_release,
482};
483
484#endif /* CONFIG_PROC_FS */
485
486static __inline__ void rt_free(struct rtable *rt)
487{
488 multipath_remove(rt);
489 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
490}
491
492static __inline__ void rt_drop(struct rtable *rt)
493{
494 multipath_remove(rt);
495 ip_rt_put(rt);
496 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
497}
498
499static __inline__ int rt_fast_clean(struct rtable *rth)
500{
 501 /* Kill broadcast/multicast entries very aggressively, if they
502 collide in hash table with more useful entries */
503 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
504 rth->fl.iif && rth->u.rt_next;
505}
506
507static __inline__ int rt_valuable(struct rtable *rth)
508{
509 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
510 rth->u.dst.expires;
511}
512
513static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
514{
515 unsigned long age;
516 int ret = 0;
517
518 if (atomic_read(&rth->u.dst.__refcnt))
519 goto out;
520
521 ret = 1;
522 if (rth->u.dst.expires &&
523 time_after_eq(jiffies, rth->u.dst.expires))
524 goto out;
525
526 age = jiffies - rth->u.dst.lastuse;
527 ret = 0;
528 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
529 (age <= tmo2 && rt_valuable(rth)))
530 goto out;
531 ret = 1;
532out: return ret;
533}
534
535/* Bits of score are:
536 * 31: very valuable
537 * 30: not quite useless
538 * 29..0: usage counter
539 */
540static inline u32 rt_score(struct rtable *rt)
541{
542 u32 score = jiffies - rt->u.dst.lastuse;
543
544 score = ~score & ~(3<<30);
545
546 if (rt_valuable(rt))
547 score |= (1<<31);
548
549 if (!rt->fl.iif ||
550 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
551 score |= (1<<30);
552
553 return score;
554}
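/*
 * Editorial sketch, not part of route.c: rt_score() packs "how much we want
 * to keep this entry" into a single u32 so one unsigned compare can pick the
 * eviction candidate in rt_intern_hash() (lowest score loses).  Bit 31 marks
 * very valuable entries, bit 30 output/unicast entries, and the low 30 bits
 * hold the inverted age, so fresher entries score higher.  A worked example
 * with made-up ages (toy_score is this sketch's name, not a kernel helper):
 */
#include <stdint.h>

static uint32_t toy_score(uint32_t age_jiffies, int valuable, int output_route)
{
	uint32_t score = ~age_jiffies & ~(3u << 30);	/* low 30 bits: inverted age */

	if (valuable)
		score |= 1u << 31;
	if (output_route)
		score |= 1u << 30;
	return score;
}

/*
 * toy_score(100, 0, 0) < toy_score(10, 0, 0)   -- the older input route loses
 * toy_score(10, 0, 0)  < toy_score(100, 0, 1)  -- an output route outranks age
 */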
555
556static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
557{
558 return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
559 fl1->oif == fl2->oif &&
560 fl1->iif == fl2->iif;
561}
562
563#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
564static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
565 struct rtable *expentry,
566 int *removed_count)
567{
568 int passedexpired = 0;
569 struct rtable **nextstep = NULL;
570 struct rtable **rthp = chain_head;
571 struct rtable *rth;
572
573 if (removed_count)
574 *removed_count = 0;
575
576 while ((rth = *rthp) != NULL) {
577 if (rth == expentry)
578 passedexpired = 1;
579
580 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
581 compare_keys(&(*rthp)->fl, &expentry->fl)) {
582 if (*rthp == expentry) {
583 *rthp = rth->u.rt_next;
584 continue;
585 } else {
586 *rthp = rth->u.rt_next;
587 rt_free(rth);
588 if (removed_count)
589 ++(*removed_count);
590 }
591 } else {
592 if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
593 passedexpired && !nextstep)
594 nextstep = &rth->u.rt_next;
595
596 rthp = &rth->u.rt_next;
597 }
598 }
599
600 rt_free(expentry);
601 if (removed_count)
602 ++(*removed_count);
603
604 return nextstep;
605}
606#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
607
608
609/* This runs via a timer and thus is always in BH context. */
610static void rt_check_expire(unsigned long dummy)
611{
612 static unsigned int rover;
613 unsigned int i = rover, goal;
614 struct rtable *rth, **rthp;
615 unsigned long now = jiffies;
616 u64 mult;
617
618 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
619 if (ip_rt_gc_timeout > 1)
620 do_div(mult, ip_rt_gc_timeout);
621 goal = (unsigned int)mult;
622 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
623 for (; goal > 0; goal--) {
624 unsigned long tmo = ip_rt_gc_timeout;
625
626 i = (i + 1) & rt_hash_mask;
627 rthp = &rt_hash_table[i].chain;
628
629 if (*rthp == 0)
630 continue;
 631 spin_lock(rt_hash_lock_addr(i));
632 while ((rth = *rthp) != NULL) {
633 if (rth->u.dst.expires) {
634 /* Entry is expired even if it is in use */
635 if (time_before_eq(now, rth->u.dst.expires)) {
636 tmo >>= 1;
637 rthp = &rth->u.rt_next;
638 continue;
639 }
640 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
641 tmo >>= 1;
642 rthp = &rth->u.rt_next;
643 continue;
644 }
645
646 /* Cleanup aged off entries. */
647#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
648 /* remove all related balanced entries if necessary */
649 if (rth->u.dst.flags & DST_BALANCED) {
650 rthp = rt_remove_balanced_route(
651 &rt_hash_table[i].chain,
652 rth, NULL);
653 if (!rthp)
654 break;
655 } else {
656 *rthp = rth->u.rt_next;
657 rt_free(rth);
658 }
659#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
660 *rthp = rth->u.rt_next;
661 rt_free(rth);
662#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
663 }
 664 spin_unlock(rt_hash_lock_addr(i));
665
666 /* Fallback loop breaker. */
667 if (time_after(jiffies, now))
668 break;
669 }
670 rover = i;
 671 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
672}
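/*
 * Editorial sketch, not part of route.c: the "tmo >>= 1" trick above.  tmo
 * starts at ip_rt_gc_timeout for each bucket and is halved every time an
 * entry is kept, so the age threshold applied to the N-th surviving entry in
 * a chain is roughly ip_rt_gc_timeout >> N.  Long chains therefore get pruned
 * far more aggressively than short ones.
 */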
673
674/* This can run from both BH and non-BH contexts, the latter
675 * in the case of a forced flush event.
676 */
677static void rt_run_flush(unsigned long dummy)
678{
679 int i;
680 struct rtable *rth, *next;
681
682 rt_deadline = 0;
683
684 get_random_bytes(&rt_hash_rnd, 4);
685
686 for (i = rt_hash_mask; i >= 0; i--) {
 687 spin_lock_bh(rt_hash_lock_addr(i));
688 rth = rt_hash_table[i].chain;
689 if (rth)
690 rt_hash_table[i].chain = NULL;
 691 spin_unlock_bh(rt_hash_lock_addr(i));
692
693 for (; rth; rth = next) {
694 next = rth->u.rt_next;
695 rt_free(rth);
696 }
697 }
698}
699
700static DEFINE_SPINLOCK(rt_flush_lock);
701
702void rt_cache_flush(int delay)
703{
704 unsigned long now = jiffies;
705 int user_mode = !in_softirq();
706
707 if (delay < 0)
708 delay = ip_rt_min_delay;
709
710 /* flush existing multipath state*/
711 multipath_flush();
712
713 spin_lock_bh(&rt_flush_lock);
714
715 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
716 long tmo = (long)(rt_deadline - now);
717
718 /* If flush timer is already running
719 and flush request is not immediate (delay > 0):
720
 721 if the deadline has not been reached, prolong the timer to "delay",
722 otherwise fire it at deadline time.
723 */
724
725 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
726 tmo = 0;
727
728 if (delay > tmo)
729 delay = tmo;
730 }
731
732 if (delay <= 0) {
733 spin_unlock_bh(&rt_flush_lock);
734 rt_run_flush(0);
735 return;
736 }
737
738 if (rt_deadline == 0)
739 rt_deadline = now + ip_rt_max_delay;
740
741 mod_timer(&rt_flush_timer, now+delay);
742 spin_unlock_bh(&rt_flush_lock);
743}
744
745static void rt_secret_rebuild(unsigned long dummy)
746{
747 unsigned long now = jiffies;
748
749 rt_cache_flush(0);
750 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
751}
752
753/*
754 Short description of GC goals.
755
 756 We want to build an algorithm which will keep the routing cache
 757 at some equilibrium point, where the number of aged-off entries
 758 stays approximately equal to the number of newly generated ones.
 759
 760 The current expiration strength is the variable "expire".
 761 We try to adjust it dynamically, so that when the network
 762 is idle, expire is large enough to keep enough warm entries,
 763 and when load increases it shrinks to limit the cache size.
764 */
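/*
 * Editorial sketch, not part of route.c: the "goal" computed at the top of
 * rt_garbage_collect() below, with made-up numbers.  Assume 8192 hash buckets
 * (rt_hash_log = 13), ip_rt_gc_elasticity = 8 and 70000 cached entries:
 *
 *	goal = entries - (ip_rt_gc_elasticity << rt_hash_log)
 *	     = 70000 - (8 << 13) = 70000 - 65536 = 4464
 *
 * so this pass tries to expire about 4.5k entries.  Were entries below 65536,
 * goal would be negative and the equilibrium branch would run instead.
 */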
765
766static int rt_garbage_collect(void)
767{
768 static unsigned long expire = RT_GC_TIMEOUT;
769 static unsigned long last_gc;
770 static int rover;
771 static int equilibrium;
772 struct rtable *rth, **rthp;
773 unsigned long now = jiffies;
774 int goal;
775
776 /*
777 * Garbage collection is pretty expensive,
778 * do not make it too frequently.
779 */
780
781 RT_CACHE_STAT_INC(gc_total);
782
783 if (now - last_gc < ip_rt_gc_min_interval &&
784 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
785 RT_CACHE_STAT_INC(gc_ignored);
786 goto out;
787 }
788
789 /* Calculate number of entries, which we want to expire now. */
790 goal = atomic_read(&ipv4_dst_ops.entries) -
791 (ip_rt_gc_elasticity << rt_hash_log);
792 if (goal <= 0) {
793 if (equilibrium < ipv4_dst_ops.gc_thresh)
794 equilibrium = ipv4_dst_ops.gc_thresh;
795 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
796 if (goal > 0) {
797 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
798 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
799 }
800 } else {
801 /* We are in dangerous area. Try to reduce cache really
802 * aggressively.
803 */
804 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
805 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
806 }
807
808 if (now - last_gc >= ip_rt_gc_min_interval)
809 last_gc = now;
810
811 if (goal <= 0) {
812 equilibrium += goal;
813 goto work_done;
814 }
815
816 do {
817 int i, k;
818
819 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
820 unsigned long tmo = expire;
821
822 k = (k + 1) & rt_hash_mask;
823 rthp = &rt_hash_table[k].chain;
 824 spin_lock_bh(rt_hash_lock_addr(k));
825 while ((rth = *rthp) != NULL) {
826 if (!rt_may_expire(rth, tmo, expire)) {
827 tmo >>= 1;
828 rthp = &rth->u.rt_next;
829 continue;
830 }
831#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
832 /* remove all related balanced entries
833 * if necessary
834 */
835 if (rth->u.dst.flags & DST_BALANCED) {
836 int r;
837
838 rthp = rt_remove_balanced_route(
 839 &rt_hash_table[k].chain,
840 rth,
841 &r);
842 goal -= r;
843 if (!rthp)
844 break;
845 } else {
846 *rthp = rth->u.rt_next;
847 rt_free(rth);
848 goal--;
849 }
850#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
851 *rthp = rth->u.rt_next;
852 rt_free(rth);
853 goal--;
854#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
855 }
 856 spin_unlock_bh(rt_hash_lock_addr(k));
857 if (goal <= 0)
858 break;
859 }
860 rover = k;
861
862 if (goal <= 0)
863 goto work_done;
864
 865 /* Goal is not achieved. We stop the process if:
 866
 867 - expire has been reduced to zero. Otherwise, expire is halved.
868 - if table is not full.
869 - if we are called from interrupt.
870 - jiffies check is just fallback/debug loop breaker.
871 We will not spin here for long time in any case.
872 */
873
874 RT_CACHE_STAT_INC(gc_goal_miss);
875
876 if (expire == 0)
877 break;
878
879 expire >>= 1;
880#if RT_CACHE_DEBUG >= 2
881 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
882 atomic_read(&ipv4_dst_ops.entries), goal, i);
883#endif
884
885 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
886 goto out;
887 } while (!in_softirq() && time_before_eq(jiffies, now));
888
889 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
890 goto out;
891 if (net_ratelimit())
892 printk(KERN_WARNING "dst cache overflow\n");
893 RT_CACHE_STAT_INC(gc_dst_overflow);
894 return 1;
895
896work_done:
897 expire += ip_rt_gc_min_interval;
898 if (expire > ip_rt_gc_timeout ||
899 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
900 expire = ip_rt_gc_timeout;
901#if RT_CACHE_DEBUG >= 2
902 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
903 atomic_read(&ipv4_dst_ops.entries), goal, rover);
904#endif
905out: return 0;
906}
907
908static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
909{
910 struct rtable *rth, **rthp;
911 unsigned long now;
912 struct rtable *cand, **candp;
913 u32 min_score;
914 int chain_length;
915 int attempts = !in_softirq();
916
917restart:
918 chain_length = 0;
919 min_score = ~(u32)0;
920 cand = NULL;
921 candp = NULL;
922 now = jiffies;
923
924 rthp = &rt_hash_table[hash].chain;
925
 926 spin_lock_bh(rt_hash_lock_addr(hash));
927 while ((rth = *rthp) != NULL) {
928#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
929 if (!(rth->u.dst.flags & DST_BALANCED) &&
930 compare_keys(&rth->fl, &rt->fl)) {
931#else
932 if (compare_keys(&rth->fl, &rt->fl)) {
933#endif
934 /* Put it first */
935 *rthp = rth->u.rt_next;
936 /*
937 * Since lookup is lockfree, the deletion
938 * must be visible to another weakly ordered CPU before
939 * the insertion at the start of the hash chain.
940 */
941 rcu_assign_pointer(rth->u.rt_next,
942 rt_hash_table[hash].chain);
943 /*
944 * Since lookup is lockfree, the update writes
945 * must be ordered for consistency on SMP.
946 */
947 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
948
949 rth->u.dst.__use++;
950 dst_hold(&rth->u.dst);
951 rth->u.dst.lastuse = now;
 952 spin_unlock_bh(rt_hash_lock_addr(hash));
953
954 rt_drop(rt);
955 *rp = rth;
956 return 0;
957 }
958
959 if (!atomic_read(&rth->u.dst.__refcnt)) {
960 u32 score = rt_score(rth);
961
962 if (score <= min_score) {
963 cand = rth;
964 candp = rthp;
965 min_score = score;
966 }
967 }
968
969 chain_length++;
970
971 rthp = &rth->u.rt_next;
972 }
973
974 if (cand) {
 975 /* ip_rt_gc_elasticity used to be the average chain length;
 976 * when it is exceeded, gc becomes really aggressive.
977 *
978 * The second limit is less certain. At the moment it allows
979 * only 2 entries per bucket. We will see.
980 */
981 if (chain_length > ip_rt_gc_elasticity) {
982 *candp = cand->u.rt_next;
983 rt_free(cand);
984 }
985 }
986
987 /* Try to bind route to arp only if it is output
988 route or unicast forwarding path.
989 */
990 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
991 int err = arp_bind_neighbour(&rt->u.dst);
992 if (err) {
 993 spin_unlock_bh(rt_hash_lock_addr(hash));
994
995 if (err != -ENOBUFS) {
996 rt_drop(rt);
997 return err;
998 }
999
1000 /* Neighbour tables are full and nothing
1001 can be released. Try to shrink route cache,
 1002 it most likely holds some neighbour records.
1003 */
1004 if (attempts-- > 0) {
1005 int saved_elasticity = ip_rt_gc_elasticity;
1006 int saved_int = ip_rt_gc_min_interval;
1007 ip_rt_gc_elasticity = 1;
1008 ip_rt_gc_min_interval = 0;
1009 rt_garbage_collect();
1010 ip_rt_gc_min_interval = saved_int;
1011 ip_rt_gc_elasticity = saved_elasticity;
1012 goto restart;
1013 }
1014
1015 if (net_ratelimit())
1016 printk(KERN_WARNING "Neighbour table overflow.\n");
1017 rt_drop(rt);
1018 return -ENOBUFS;
1019 }
1020 }
1021
1022 rt->u.rt_next = rt_hash_table[hash].chain;
1023#if RT_CACHE_DEBUG >= 2
1024 if (rt->u.rt_next) {
1025 struct rtable *trt;
1026 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1027 NIPQUAD(rt->rt_dst));
1028 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1029 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1030 printk("\n");
1031 }
1032#endif
1033 rt_hash_table[hash].chain = rt;
 1034 spin_unlock_bh(rt_hash_lock_addr(hash));
1035 *rp = rt;
1036 return 0;
1037}
1038
1039void rt_bind_peer(struct rtable *rt, int create)
1040{
1041 static DEFINE_SPINLOCK(rt_peer_lock);
1042 struct inet_peer *peer;
1043
1044 peer = inet_getpeer(rt->rt_dst, create);
1045
1046 spin_lock_bh(&rt_peer_lock);
1047 if (rt->peer == NULL) {
1048 rt->peer = peer;
1049 peer = NULL;
1050 }
1051 spin_unlock_bh(&rt_peer_lock);
1052 if (peer)
1053 inet_putpeer(peer);
1054}
1055
1056/*
1057 * Peer allocation may fail only in serious out-of-memory conditions. However
1058 * we still can generate some output.
 1059 * Random ID selection looks a bit dangerous because we have no chance of
 1060 * selecting an ID that is unique within a reasonable period of time.
 1061 * But a broken packet identifier may be better than no packet at all.
1062 */
1063static void ip_select_fb_ident(struct iphdr *iph)
1064{
1065 static DEFINE_SPINLOCK(ip_fb_id_lock);
1066 static u32 ip_fallback_id;
1067 u32 salt;
1068
1069 spin_lock_bh(&ip_fb_id_lock);
1070 salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1071 iph->id = htons(salt & 0xFFFF);
1072 ip_fallback_id = salt;
1073 spin_unlock_bh(&ip_fb_id_lock);
1074}
1075
1076void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1077{
1078 struct rtable *rt = (struct rtable *) dst;
1079
1080 if (rt) {
1081 if (rt->peer == NULL)
1082 rt_bind_peer(rt, 1);
1083
1084 /* If peer is attached to destination, it is never detached,
 1085 so we do not need to grab a lock to dereference it.
1086 */
1087 if (rt->peer) {
1088 iph->id = htons(inet_getid(rt->peer, more));
1089 return;
1090 }
1091 } else
1092 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1093 __builtin_return_address(0));
1094
1095 ip_select_fb_ident(iph);
1096}
1097
1098static void rt_del(unsigned hash, struct rtable *rt)
1099{
1100 struct rtable **rthp;
1101
 1102 spin_lock_bh(rt_hash_lock_addr(hash));
1103 ip_rt_put(rt);
1104 for (rthp = &rt_hash_table[hash].chain; *rthp;
1105 rthp = &(*rthp)->u.rt_next)
1106 if (*rthp == rt) {
1107 *rthp = rt->u.rt_next;
1108 rt_free(rt);
1109 break;
1110 }
 1111 spin_unlock_bh(rt_hash_lock_addr(hash));
1112}
1113
1114void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
 1115 u32 saddr, struct net_device *dev)
1116{
1117 int i, k;
1118 struct in_device *in_dev = in_dev_get(dev);
1119 struct rtable *rth, **rthp;
1120 u32 skeys[2] = { saddr, 0 };
1121 int ikeys[2] = { dev->ifindex, 0 };
1122
1123 if (!in_dev)
1124 return;
1125
1126 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1127 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1128 goto reject_redirect;
1129
1130 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1131 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1132 goto reject_redirect;
1133 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1134 goto reject_redirect;
1135 } else {
1136 if (inet_addr_type(new_gw) != RTN_UNICAST)
1137 goto reject_redirect;
1138 }
1139
1140 for (i = 0; i < 2; i++) {
1141 for (k = 0; k < 2; k++) {
1142 unsigned hash = rt_hash_code(daddr,
 1143 skeys[i] ^ (ikeys[k] << 5));
1144
1145 rthp=&rt_hash_table[hash].chain;
1146
1147 rcu_read_lock();
1148 while ((rth = rcu_dereference(*rthp)) != NULL) {
1149 struct rtable *rt;
1150
1151 if (rth->fl.fl4_dst != daddr ||
1152 rth->fl.fl4_src != skeys[i] ||
1153 rth->fl.oif != ikeys[k] ||
1154 rth->fl.iif != 0) {
1155 rthp = &rth->u.rt_next;
1156 continue;
1157 }
1158
1159 if (rth->rt_dst != daddr ||
1160 rth->rt_src != saddr ||
1161 rth->u.dst.error ||
1162 rth->rt_gateway != old_gw ||
1163 rth->u.dst.dev != dev)
1164 break;
1165
1166 dst_hold(&rth->u.dst);
1167 rcu_read_unlock();
1168
1169 rt = dst_alloc(&ipv4_dst_ops);
1170 if (rt == NULL) {
1171 ip_rt_put(rth);
1172 in_dev_put(in_dev);
1173 return;
1174 }
1175
1176 /* Copy all the information. */
1177 *rt = *rth;
1178 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1179 rt->u.dst.__use = 1;
1180 atomic_set(&rt->u.dst.__refcnt, 1);
1181 rt->u.dst.child = NULL;
1182 if (rt->u.dst.dev)
1183 dev_hold(rt->u.dst.dev);
1184 if (rt->idev)
1185 in_dev_hold(rt->idev);
1186 rt->u.dst.obsolete = 0;
1187 rt->u.dst.lastuse = jiffies;
1188 rt->u.dst.path = &rt->u.dst;
1189 rt->u.dst.neighbour = NULL;
1190 rt->u.dst.hh = NULL;
1191 rt->u.dst.xfrm = NULL;
1192
1193 rt->rt_flags |= RTCF_REDIRECTED;
1194
1195 /* Gateway is different ... */
1196 rt->rt_gateway = new_gw;
1197
1198 /* Redirect received -> path was valid */
1199 dst_confirm(&rth->u.dst);
1200
1201 if (rt->peer)
1202 atomic_inc(&rt->peer->refcnt);
1203
1204 if (arp_bind_neighbour(&rt->u.dst) ||
1205 !(rt->u.dst.neighbour->nud_state &
1206 NUD_VALID)) {
1207 if (rt->u.dst.neighbour)
1208 neigh_event_send(rt->u.dst.neighbour, NULL);
1209 ip_rt_put(rth);
1210 rt_drop(rt);
1211 goto do_next;
1212 }
1213
1214 rt_del(hash, rth);
1215 if (!rt_intern_hash(hash, rt, &rt))
1216 ip_rt_put(rt);
1217 goto do_next;
1218 }
1219 rcu_read_unlock();
1220 do_next:
1221 ;
1222 }
1223 }
1224 in_dev_put(in_dev);
1225 return;
1226
1227reject_redirect:
1228#ifdef CONFIG_IP_ROUTE_VERBOSE
1229 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1230 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1231 "%u.%u.%u.%u ignored.\n"
 1232 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
 1233 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
 1234 NIPQUAD(saddr), NIPQUAD(daddr));
1235#endif
1236 in_dev_put(in_dev);
1237}
1238
1239static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1240{
1241 struct rtable *rt = (struct rtable*)dst;
1242 struct dst_entry *ret = dst;
1243
1244 if (rt) {
1245 if (dst->obsolete) {
1246 ip_rt_put(rt);
1247 ret = NULL;
1248 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1249 rt->u.dst.expires) {
1250 unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1251 rt->fl.fl4_src ^
 1252 (rt->fl.oif << 5));
1253#if RT_CACHE_DEBUG >= 1
1254 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1255 "%u.%u.%u.%u/%02x dropped\n",
1256 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1257#endif
1258 rt_del(hash, rt);
1259 ret = NULL;
1260 }
1261 }
1262 return ret;
1263}
1264
1265/*
1266 * Algorithm:
1267 * 1. The first ip_rt_redirect_number redirects are sent
1268 * with exponential backoff, then we stop sending them at all,
1269 * assuming that the host ignores our redirects.
1270 * 2. If we did not see packets requiring redirects
1271 * during ip_rt_redirect_silence, we assume that the host
 1272 * forgot the redirected route, and start sending redirects again.
1273 *
1274 * This algorithm is much cheaper and more intelligent than dumb load limiting
1275 * in icmp.c.
1276 *
1277 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1278 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1279 */
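/*
 * Editorial sketch, not part of route.c: the backoff gate used by
 * ip_rt_send_redirect() below.  A redirect is sent only when
 *
 *	time_after(jiffies, rate_last + (ip_rt_redirect_load << rate_tokens))
 *
 * holds, so with ip_rt_redirect_load = HZ/50 the gaps grow as HZ/50, 2*HZ/50,
 * 4*HZ/50, ... (doubling each time).  After ip_rt_redirect_number (9)
 * redirects we stop entirely until the host has been quiet for
 * ip_rt_redirect_silence, which resets rate_tokens to zero.
 */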
1280
1281void ip_rt_send_redirect(struct sk_buff *skb)
1282{
1283 struct rtable *rt = (struct rtable*)skb->dst;
1284 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1285
1286 if (!in_dev)
1287 return;
1288
1289 if (!IN_DEV_TX_REDIRECTS(in_dev))
1290 goto out;
1291
1292 /* No redirected packets during ip_rt_redirect_silence;
1293 * reset the algorithm.
1294 */
1295 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1296 rt->u.dst.rate_tokens = 0;
1297
1298 /* Too many ignored redirects; do not send anything
1299 * set u.dst.rate_last to the last seen redirected packet.
1300 */
1301 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1302 rt->u.dst.rate_last = jiffies;
1303 goto out;
1304 }
1305
1306 /* Check for load limit; set rate_last to the latest sent
1307 * redirect.
1308 */
1309 if (time_after(jiffies,
1310 (rt->u.dst.rate_last +
1311 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1312 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1313 rt->u.dst.rate_last = jiffies;
1314 ++rt->u.dst.rate_tokens;
1315#ifdef CONFIG_IP_ROUTE_VERBOSE
1316 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1317 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1318 net_ratelimit())
1319 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1320 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1321 NIPQUAD(rt->rt_src), rt->rt_iif,
1322 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1323#endif
1324 }
1325out:
1326 in_dev_put(in_dev);
1327}
1328
1329static int ip_error(struct sk_buff *skb)
1330{
1331 struct rtable *rt = (struct rtable*)skb->dst;
1332 unsigned long now;
1333 int code;
1334
1335 switch (rt->u.dst.error) {
1336 case EINVAL:
1337 default:
1338 goto out;
1339 case EHOSTUNREACH:
1340 code = ICMP_HOST_UNREACH;
1341 break;
1342 case ENETUNREACH:
1343 code = ICMP_NET_UNREACH;
1344 break;
1345 case EACCES:
1346 code = ICMP_PKT_FILTERED;
1347 break;
1348 }
1349
1350 now = jiffies;
1351 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1352 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1353 rt->u.dst.rate_tokens = ip_rt_error_burst;
1354 rt->u.dst.rate_last = now;
1355 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1356 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1357 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1358 }
1359
1360out: kfree_skb(skb);
1361 return 0;
1362}
1363
1364/*
1365 * The last two values are not from the RFC but
1366 * are needed for AMPRnet AX.25 paths.
1367 */
1368
 1369static const unsigned short mtu_plateau[] =
1370{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1371
1372static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1373{
1374 int i;
1375
1376 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1377 if (old_mtu > mtu_plateau[i])
1378 return mtu_plateau[i];
1379 return 68;
1380}
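/*
 * Editorial sketch, not part of route.c: how guess_mtu() walks the plateau
 * table when a Fragmentation Needed message carries no usable MTU.  It
 * returns the first plateau strictly below old_mtu: 1500 -> 1492,
 * 1492 -> 576, 296 -> 216, and anything at or below 128 falls back to the
 * minimum of 68.
 */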
1381
1382unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1383{
1384 int i;
1385 unsigned short old_mtu = ntohs(iph->tot_len);
1386 struct rtable *rth;
1387 u32 skeys[2] = { iph->saddr, 0, };
1388 u32 daddr = iph->daddr;
1389 unsigned short est_mtu = 0;
1390
1391 if (ipv4_config.no_pmtu_disc)
1392 return 0;
1393
1394 for (i = 0; i < 2; i++) {
 1395 unsigned hash = rt_hash_code(daddr, skeys[i]);
1396
1397 rcu_read_lock();
1398 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1399 rth = rcu_dereference(rth->u.rt_next)) {
1400 if (rth->fl.fl4_dst == daddr &&
1401 rth->fl.fl4_src == skeys[i] &&
1402 rth->rt_dst == daddr &&
1403 rth->rt_src == iph->saddr &&
1404 rth->fl.iif == 0 &&
1405 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1406 unsigned short mtu = new_mtu;
1407
1408 if (new_mtu < 68 || new_mtu >= old_mtu) {
1409
1410 /* BSD 4.2 compatibility hack :-( */
1411 if (mtu == 0 &&
1412 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1413 old_mtu >= 68 + (iph->ihl << 2))
1414 old_mtu -= iph->ihl << 2;
1415
1416 mtu = guess_mtu(old_mtu);
1417 }
1418 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1419 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1420 dst_confirm(&rth->u.dst);
1421 if (mtu < ip_rt_min_pmtu) {
1422 mtu = ip_rt_min_pmtu;
1423 rth->u.dst.metrics[RTAX_LOCK-1] |=
1424 (1 << RTAX_MTU);
1425 }
1426 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1427 dst_set_expires(&rth->u.dst,
1428 ip_rt_mtu_expires);
1429 }
1430 est_mtu = mtu;
1431 }
1432 }
1433 }
1434 rcu_read_unlock();
1435 }
1436 return est_mtu ? : new_mtu;
1437}
1438
1439static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1440{
1441 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1442 !(dst_metric_locked(dst, RTAX_MTU))) {
1443 if (mtu < ip_rt_min_pmtu) {
1444 mtu = ip_rt_min_pmtu;
1445 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1446 }
1447 dst->metrics[RTAX_MTU-1] = mtu;
1448 dst_set_expires(dst, ip_rt_mtu_expires);
1449 }
1450}
1451
1452static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1453{
1454 return NULL;
1455}
1456
1457static void ipv4_dst_destroy(struct dst_entry *dst)
1458{
1459 struct rtable *rt = (struct rtable *) dst;
1460 struct inet_peer *peer = rt->peer;
1461 struct in_device *idev = rt->idev;
1462
1463 if (peer) {
1464 rt->peer = NULL;
1465 inet_putpeer(peer);
1466 }
1467
1468 if (idev) {
1469 rt->idev = NULL;
1470 in_dev_put(idev);
1471 }
1472}
1473
1474static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1475 int how)
1476{
1477 struct rtable *rt = (struct rtable *) dst;
1478 struct in_device *idev = rt->idev;
1479 if (dev != &loopback_dev && idev && idev->dev == dev) {
1480 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1481 if (loopback_idev) {
1482 rt->idev = loopback_idev;
1483 in_dev_put(idev);
1484 }
1485 }
1486}
1487
1488static void ipv4_link_failure(struct sk_buff *skb)
1489{
1490 struct rtable *rt;
1491
1492 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1493
1494 rt = (struct rtable *) skb->dst;
1495 if (rt)
1496 dst_set_expires(&rt->u.dst, 0);
1497}
1498
1499static int ip_rt_bug(struct sk_buff *skb)
1500{
1501 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1502 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1503 skb->dev ? skb->dev->name : "?");
1504 kfree_skb(skb);
1505 return 0;
1506}
1507
1508/*
 1509 We do not cache the source address of the outgoing interface,
 1510 because it is used only by IP RR, TS and SRR options,
 1511 so it stays out of the fast path.
1512
1513 BTW remember: "addr" is allowed to be not aligned
1514 in IP options!
1515 */
1516
1517void ip_rt_get_source(u8 *addr, struct rtable *rt)
1518{
1519 u32 src;
1520 struct fib_result res;
1521
1522 if (rt->fl.iif == 0)
1523 src = rt->rt_src;
1524 else if (fib_lookup(&rt->fl, &res) == 0) {
1525 src = FIB_RES_PREFSRC(res);
1526 fib_res_put(&res);
1527 } else
1528 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1529 RT_SCOPE_UNIVERSE);
1530 memcpy(addr, &src, 4);
1531}
1532
1533#ifdef CONFIG_NET_CLS_ROUTE
1534static void set_class_tag(struct rtable *rt, u32 tag)
1535{
1536 if (!(rt->u.dst.tclassid & 0xFFFF))
1537 rt->u.dst.tclassid |= tag & 0xFFFF;
1538 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1539 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1540}
1541#endif
1542
1543static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1544{
1545 struct fib_info *fi = res->fi;
1546
1547 if (fi) {
1548 if (FIB_RES_GW(*res) &&
1549 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1550 rt->rt_gateway = FIB_RES_GW(*res);
1551 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1552 sizeof(rt->u.dst.metrics));
1553 if (fi->fib_mtu == 0) {
1554 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1555 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1556 rt->rt_gateway != rt->rt_dst &&
1557 rt->u.dst.dev->mtu > 576)
1558 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1559 }
1560#ifdef CONFIG_NET_CLS_ROUTE
1561 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1562#endif
1563 } else
1564 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1565
1566 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1567 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1568 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1569 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1570 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1571 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1572 ip_rt_min_advmss);
1573 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1574 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1575
1576#ifdef CONFIG_NET_CLS_ROUTE
1577#ifdef CONFIG_IP_MULTIPLE_TABLES
1578 set_class_tag(rt, fib_rules_tclass(res));
1579#endif
1580 set_class_tag(rt, itag);
1581#endif
1582 rt->rt_type = res->type;
1583}
1584
1585static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1586 u8 tos, struct net_device *dev, int our)
1587{
1588 unsigned hash;
1589 struct rtable *rth;
1590 u32 spec_dst;
1591 struct in_device *in_dev = in_dev_get(dev);
1592 u32 itag = 0;
1593
1594 /* Primary sanity checks. */
1595
1596 if (in_dev == NULL)
1597 return -EINVAL;
1598
1599 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1600 skb->protocol != htons(ETH_P_IP))
1601 goto e_inval;
1602
1603 if (ZERONET(saddr)) {
1604 if (!LOCAL_MCAST(daddr))
1605 goto e_inval;
1606 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1607 } else if (fib_validate_source(saddr, 0, tos, 0,
1608 dev, &spec_dst, &itag) < 0)
1609 goto e_inval;
1610
1611 rth = dst_alloc(&ipv4_dst_ops);
1612 if (!rth)
1613 goto e_nobufs;
1614
1615 rth->u.dst.output= ip_rt_bug;
1616
1617 atomic_set(&rth->u.dst.__refcnt, 1);
1618 rth->u.dst.flags= DST_HOST;
1619 if (in_dev->cnf.no_policy)
1620 rth->u.dst.flags |= DST_NOPOLICY;
1621 rth->fl.fl4_dst = daddr;
1622 rth->rt_dst = daddr;
1623 rth->fl.fl4_tos = tos;
1624#ifdef CONFIG_IP_ROUTE_FWMARK
1625 rth->fl.fl4_fwmark= skb->nfmark;
1626#endif
1627 rth->fl.fl4_src = saddr;
1628 rth->rt_src = saddr;
1629#ifdef CONFIG_NET_CLS_ROUTE
1630 rth->u.dst.tclassid = itag;
1631#endif
1632 rth->rt_iif =
1633 rth->fl.iif = dev->ifindex;
1634 rth->u.dst.dev = &loopback_dev;
1635 dev_hold(rth->u.dst.dev);
1636 rth->idev = in_dev_get(rth->u.dst.dev);
1637 rth->fl.oif = 0;
1638 rth->rt_gateway = daddr;
1639 rth->rt_spec_dst= spec_dst;
1640 rth->rt_type = RTN_MULTICAST;
1641 rth->rt_flags = RTCF_MULTICAST;
1642 if (our) {
1643 rth->u.dst.input= ip_local_deliver;
1644 rth->rt_flags |= RTCF_LOCAL;
1645 }
1646
1647#ifdef CONFIG_IP_MROUTE
1648 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1649 rth->u.dst.input = ip_mr_input;
1650#endif
1651 RT_CACHE_STAT_INC(in_slow_mc);
1652
1653 in_dev_put(in_dev);
 1654 hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5));
1655 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1656
1657e_nobufs:
1658 in_dev_put(in_dev);
1659 return -ENOBUFS;
1660
1661e_inval:
1662 in_dev_put(in_dev);
1663 return -EINVAL;
1664}
1665
1666
1667static void ip_handle_martian_source(struct net_device *dev,
1668 struct in_device *in_dev,
1669 struct sk_buff *skb,
1670 u32 daddr,
1671 u32 saddr)
1672{
1673 RT_CACHE_STAT_INC(in_martian_src);
1674#ifdef CONFIG_IP_ROUTE_VERBOSE
1675 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1676 /*
1677 * RFC1812 recommendation, if source is martian,
1678 * the only hint is MAC header.
1679 */
1680 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1681 "%u.%u.%u.%u, on dev %s\n",
1682 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
 1683 if (dev->hard_header_len && skb->mac.raw) {
1684 int i;
1685 unsigned char *p = skb->mac.raw;
1686 printk(KERN_WARNING "ll header: ");
1687 for (i = 0; i < dev->hard_header_len; i++, p++) {
1688 printk("%02x", *p);
1689 if (i < (dev->hard_header_len - 1))
1690 printk(":");
1691 }
1692 printk("\n");
1693 }
1694 }
1695#endif
1696}
1697
1698static inline int __mkroute_input(struct sk_buff *skb,
1699 struct fib_result* res,
1700 struct in_device *in_dev,
1701 u32 daddr, u32 saddr, u32 tos,
1702 struct rtable **result)
1703{
1704
1705 struct rtable *rth;
1706 int err;
1707 struct in_device *out_dev;
1708 unsigned flags = 0;
1709 u32 spec_dst, itag;
1710
1711 /* get a working reference to the output device */
1712 out_dev = in_dev_get(FIB_RES_DEV(*res));
1713 if (out_dev == NULL) {
1714 if (net_ratelimit())
1715 printk(KERN_CRIT "Bug in ip_route_input" \
1716 "_slow(). Please, report\n");
1717 return -EINVAL;
1718 }
1719
1720
1721 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1722 in_dev->dev, &spec_dst, &itag);
1723 if (err < 0) {
1724 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1725 saddr);
1726
1727 err = -EINVAL;
1728 goto cleanup;
1729 }
1730
1731 if (err)
1732 flags |= RTCF_DIRECTSRC;
1733
1734 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1735 (IN_DEV_SHARED_MEDIA(out_dev) ||
1736 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1737 flags |= RTCF_DOREDIRECT;
1738
1739 if (skb->protocol != htons(ETH_P_IP)) {
1740 /* Not IP (i.e. ARP). Do not create route, if it is
1741 * invalid for proxy arp. DNAT routes are always valid.
1742 */
1743 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1744 err = -EINVAL;
1745 goto cleanup;
1746 }
1747 }
1748
1749
1750 rth = dst_alloc(&ipv4_dst_ops);
1751 if (!rth) {
1752 err = -ENOBUFS;
1753 goto cleanup;
1754 }
1755
 1756 atomic_set(&rth->u.dst.__refcnt, 1);
1757 rth->u.dst.flags= DST_HOST;
1758#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1759 if (res->fi->fib_nhs > 1)
1760 rth->u.dst.flags |= DST_BALANCED;
1761#endif
1762 if (in_dev->cnf.no_policy)
1763 rth->u.dst.flags |= DST_NOPOLICY;
1764 if (in_dev->cnf.no_xfrm)
1765 rth->u.dst.flags |= DST_NOXFRM;
1766 rth->fl.fl4_dst = daddr;
1767 rth->rt_dst = daddr;
1768 rth->fl.fl4_tos = tos;
1769#ifdef CONFIG_IP_ROUTE_FWMARK
1770 rth->fl.fl4_fwmark= skb->nfmark;
1771#endif
1772 rth->fl.fl4_src = saddr;
1773 rth->rt_src = saddr;
1774 rth->rt_gateway = daddr;
1775 rth->rt_iif =
1776 rth->fl.iif = in_dev->dev->ifindex;
1777 rth->u.dst.dev = (out_dev)->dev;
1778 dev_hold(rth->u.dst.dev);
1779 rth->idev = in_dev_get(rth->u.dst.dev);
1780 rth->fl.oif = 0;
1781 rth->rt_spec_dst= spec_dst;
1782
1783 rth->u.dst.input = ip_forward;
1784 rth->u.dst.output = ip_output;
1785
1786 rt_set_nexthop(rth, res, itag);
1787
1788 rth->rt_flags = flags;
1789
1790 *result = rth;
1791 err = 0;
1792 cleanup:
1793 /* release the working reference to the output device */
1794 in_dev_put(out_dev);
1795 return err;
1796}
1797
1798static inline int ip_mkroute_input_def(struct sk_buff *skb,
1799 struct fib_result* res,
1800 const struct flowi *fl,
1801 struct in_device *in_dev,
1802 u32 daddr, u32 saddr, u32 tos)
1803{
 1804 struct rtable* rth = NULL;
1805 int err;
1806 unsigned hash;
1807
1808#ifdef CONFIG_IP_ROUTE_MULTIPATH
1809 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1810 fib_select_multipath(fl, res);
1811#endif
1812
1813 /* create a routing cache entry */
1814 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1815 if (err)
1816 return err;
1da177e4
LT
1817
1818 /* put it into the cache */
 1819 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
1820 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1821}
1822
1823static inline int ip_mkroute_input(struct sk_buff *skb,
1824 struct fib_result* res,
1825 const struct flowi *fl,
1826 struct in_device *in_dev,
1827 u32 daddr, u32 saddr, u32 tos)
1828{
1829#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1830 struct rtable* rth = NULL, *rtres;
1831 unsigned char hop, hopcount;
1832 int err = -EINVAL;
1833 unsigned int hash;
1834
1835 if (res->fi)
1836 hopcount = res->fi->fib_nhs;
1837 else
1838 hopcount = 1;
1839
1840 /* distinguish between multipath and singlepath */
1841 if (hopcount < 2)
1842 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1843 saddr, tos);
1844
1845 /* add all alternatives to the routing cache */
1846 for (hop = 0; hop < hopcount; hop++) {
1847 res->nh_sel = hop;
1848
1849 /* put reference to previous result */
1850 if (hop)
1851 ip_rt_put(rtres);
1852
1853 /* create a routing cache entry */
1854 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1855 &rth);
1856 if (err)
1857 return err;
1858
1859 /* put it into the cache */
 1860 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
 1861 err = rt_intern_hash(hash, rth, &rtres);
1862 if (err)
1863 return err;
1864
1865 /* forward hop information to multipath impl. */
1866 multipath_set_nhinfo(rth,
1867 FIB_RES_NETWORK(*res),
1868 FIB_RES_NETMASK(*res),
1869 res->prefixlen,
1870 &FIB_RES_NH(*res));
 1871 }
 1872 skb->dst = &rtres->u.dst;
1873 return err;
1874#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1875 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1876#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1877}
1878
1879
1880/*
 1881 * NOTE. We drop all packets that have a local source
 1882 * address, because every properly looped-back packet
 1883 * must have the correct destination already attached by the output routine.
 1884 *
 1885 * Such an approach solves two big problems:
 1886 * 1. Non-simplex devices are handled properly.
 1887 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1888 */
1889
1890static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1891 u8 tos, struct net_device *dev)
1892{
1893 struct fib_result res;
1894 struct in_device *in_dev = in_dev_get(dev);
1895 struct flowi fl = { .nl_u = { .ip4_u =
1896 { .daddr = daddr,
1897 .saddr = saddr,
1898 .tos = tos,
1899 .scope = RT_SCOPE_UNIVERSE,
1900#ifdef CONFIG_IP_ROUTE_FWMARK
1901 .fwmark = skb->nfmark
1902#endif
1903 } },
1904 .iif = dev->ifindex };
1905 unsigned flags = 0;
1906 u32 itag = 0;
1907 struct rtable * rth;
1908 unsigned hash;
1909 u32 spec_dst;
1910 int err = -EINVAL;
1911 int free_res = 0;
1912
1913 /* IP on this device is disabled. */
1914
1915 if (!in_dev)
1916 goto out;
1917
 1918 /* Check for the weirdest martians, which cannot be detected
 1919 by fib_lookup.
1920 */
1921
1922 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1923 goto martian_source;
1924
1925 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1926 goto brd_input;
1927
1928 /* Accept zero addresses only to limited broadcast;
 1929 * I do not even know whether to fix it or not. Waiting for complaints :-)
1930 */
1931 if (ZERONET(saddr))
1932 goto martian_source;
1933
1934 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1935 goto martian_destination;
1936
1937 /*
1938 * Now we are ready to route packet.
1939 */
1940 if ((err = fib_lookup(&fl, &res)) != 0) {
1941 if (!IN_DEV_FORWARD(in_dev))
 1942 goto e_hostunreach;
1943 goto no_route;
1944 }
1945 free_res = 1;
1946
1947 RT_CACHE_STAT_INC(in_slow_tot);
1948
1949 if (res.type == RTN_BROADCAST)
1950 goto brd_input;
1951
1952 if (res.type == RTN_LOCAL) {
1953 int result;
1954 result = fib_validate_source(saddr, daddr, tos,
1955 loopback_dev.ifindex,
1956 dev, &spec_dst, &itag);
1957 if (result < 0)
1958 goto martian_source;
1959 if (result)
1960 flags |= RTCF_DIRECTSRC;
1961 spec_dst = daddr;
1962 goto local_input;
1963 }
1964
1965 if (!IN_DEV_FORWARD(in_dev))
 1966 goto e_hostunreach;
1967 if (res.type != RTN_UNICAST)
1968 goto martian_destination;
1969
1970 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1971 if (err == -ENOBUFS)
1972 goto e_nobufs;
1973 if (err == -EINVAL)
1974 goto e_inval;
1975
1976done:
1977 in_dev_put(in_dev);
1978 if (free_res)
1979 fib_res_put(&res);
1980out: return err;
1981
1982brd_input:
1983 if (skb->protocol != htons(ETH_P_IP))
1984 goto e_inval;
1985
1986 if (ZERONET(saddr))
1987 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1988 else {
1989 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1990 &itag);
1991 if (err < 0)
1992 goto martian_source;
1993 if (err)
1994 flags |= RTCF_DIRECTSRC;
1995 }
1996 flags |= RTCF_BROADCAST;
1997 res.type = RTN_BROADCAST;
1998 RT_CACHE_STAT_INC(in_brd);
1999
2000local_input:
2001 rth = dst_alloc(&ipv4_dst_ops);
2002 if (!rth)
2003 goto e_nobufs;
2004
2005 rth->u.dst.output= ip_rt_bug;
2006
2007 atomic_set(&rth->u.dst.__refcnt, 1);
2008 rth->u.dst.flags= DST_HOST;
2009 if (in_dev->cnf.no_policy)
2010 rth->u.dst.flags |= DST_NOPOLICY;
2011 rth->fl.fl4_dst = daddr;
2012 rth->rt_dst = daddr;
2013 rth->fl.fl4_tos = tos;
2014#ifdef CONFIG_IP_ROUTE_FWMARK
2015 rth->fl.fl4_fwmark= skb->nfmark;
2016#endif
2017 rth->fl.fl4_src = saddr;
2018 rth->rt_src = saddr;
2019#ifdef CONFIG_NET_CLS_ROUTE
2020 rth->u.dst.tclassid = itag;
2021#endif
2022 rth->rt_iif =
2023 rth->fl.iif = dev->ifindex;
2024 rth->u.dst.dev = &loopback_dev;
2025 dev_hold(rth->u.dst.dev);
2026 rth->idev = in_dev_get(rth->u.dst.dev);
2027 rth->rt_gateway = daddr;
2028 rth->rt_spec_dst= spec_dst;
2029 rth->u.dst.input= ip_local_deliver;
2030 rth->rt_flags = flags|RTCF_LOCAL;
2031 if (res.type == RTN_UNREACHABLE) {
2032 rth->u.dst.input= ip_error;
2033 rth->u.dst.error= -err;
2034 rth->rt_flags &= ~RTCF_LOCAL;
2035 }
2036 rth->rt_type = res.type;
2037 hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5));
2038 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2039 goto done;
2040
2041no_route:
2042 RT_CACHE_STAT_INC(in_no_route);
2043 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2044 res.type = RTN_UNREACHABLE;
2045 goto local_input;
2046
2047 /*
2048 * Do not cache martian addresses: they should be logged (RFC1812)
2049 */
2050martian_destination:
2051 RT_CACHE_STAT_INC(in_martian_dst);
2052#ifdef CONFIG_IP_ROUTE_VERBOSE
2053 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2054 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2055 "%u.%u.%u.%u, dev %s\n",
2056 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2057#endif
2058
2059e_hostunreach:
2060 err = -EHOSTUNREACH;
2061 goto done;
2062
2063e_inval:
2064 err = -EINVAL;
2065 goto done;
2066
2067e_nobufs:
2068 err = -ENOBUFS;
2069 goto done;
2070
2071martian_source:
2072 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2073 goto e_inval;
2074}
2075
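/*
 * Fast path for input routing: look the flow up in rt_hash_table under
 * rcu_read_lock(), keyed on daddr/saddr/iif (plus tos and fwmark). On a
 * hit the cached dst is attached to the skb; on a miss, multicast
 * destinations get special treatment below and everything else falls
 * through to ip_route_input_slow().
 */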
2076int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2077 u8 tos, struct net_device *dev)
2078{
2079 struct rtable * rth;
2080 unsigned hash;
2081 int iif = dev->ifindex;
2082
2083 tos &= IPTOS_RT_MASK;
2084 hash = rt_hash_code(daddr, saddr ^ (iif << 5));
2085
2086 rcu_read_lock();
2087 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2088 rth = rcu_dereference(rth->u.rt_next)) {
2089 if (rth->fl.fl4_dst == daddr &&
2090 rth->fl.fl4_src == saddr &&
2091 rth->fl.iif == iif &&
2092 rth->fl.oif == 0 &&
2093#ifdef CONFIG_IP_ROUTE_FWMARK
2094 rth->fl.fl4_fwmark == skb->nfmark &&
2095#endif
2096 rth->fl.fl4_tos == tos) {
2097 rth->u.dst.lastuse = jiffies;
2098 dst_hold(&rth->u.dst);
2099 rth->u.dst.__use++;
2100 RT_CACHE_STAT_INC(in_hit);
2101 rcu_read_unlock();
2102 skb->dst = (struct dst_entry*)rth;
2103 return 0;
2104 }
2105 RT_CACHE_STAT_INC(in_hlist_search);
2106 }
2107 rcu_read_unlock();
2108
2109 /* Multicast recognition logic is moved from the route cache to here.
2110 The problem was that too many Ethernet cards have broken/missing
2111 hardware multicast filters :-( As a result, a host on a multicast
2112 network acquires a lot of useless route cache entries, e.g. for
2113 SDR messages from all over the world. Now we try to get rid of them.
2114 Really, provided the software IP multicast filter is organized
2115 reasonably (at least, hashed), it does not result in a slowdown
2116 compared with route cache reject entries.
2117 Note that multicast routers are not affected, because a
2118 route cache entry is created for them eventually.
2119 */
2120 if (MULTICAST(daddr)) {
2121 struct in_device *in_dev;
2122
2123 rcu_read_lock();
2124 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2125 int our = ip_check_mc(in_dev, daddr, saddr,
2126 skb->nh.iph->protocol);
2127 if (our
2128#ifdef CONFIG_IP_MROUTE
2129 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2130#endif
2131 ) {
2132 rcu_read_unlock();
2133 return ip_route_input_mc(skb, daddr, saddr,
2134 tos, dev, our);
2135 }
2136 }
2137 rcu_read_unlock();
2138 return -EINVAL;
2139 }
2140 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2141}
2142
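/*
 * Build (but do not hash) an output rtable for the fib_result the caller
 * resolved: classify the destination as broadcast/multicast/unicast,
 * take a working reference on the output inet device, fill in the flow
 * keys and dst handlers (ip_output, or ip_mc_output for non-loopback
 * multicast/broadcast, ip_local_deliver for local routes), then finish
 * with rt_set_nexthop(). The caller is responsible for rt_intern_hash().
 */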
2143static inline int __mkroute_output(struct rtable **result,
2144 struct fib_result* res,
2145 const struct flowi *fl,
2146 const struct flowi *oldflp,
2147 struct net_device *dev_out,
2148 unsigned flags)
2149{
2150 struct rtable *rth;
2151 struct in_device *in_dev;
2152 u32 tos = RT_FL_TOS(oldflp);
2153 int err = 0;
2154
2155 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2156 return -EINVAL;
2157
2158 if (fl->fl4_dst == 0xFFFFFFFF)
2159 res->type = RTN_BROADCAST;
2160 else if (MULTICAST(fl->fl4_dst))
2161 res->type = RTN_MULTICAST;
2162 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2163 return -EINVAL;
2164
2165 if (dev_out->flags & IFF_LOOPBACK)
2166 flags |= RTCF_LOCAL;
2167
2168 /* get work reference to inet device */
2169 in_dev = in_dev_get(dev_out);
2170 if (!in_dev)
2171 return -EINVAL;
2172
2173 if (res->type == RTN_BROADCAST) {
2174 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2175 if (res->fi) {
2176 fib_info_put(res->fi);
2177 res->fi = NULL;
2178 }
2179 } else if (res->type == RTN_MULTICAST) {
2180 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2181 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2182 oldflp->proto))
2183 flags &= ~RTCF_LOCAL;
2184 /* If a multicast route does not exist, use
2185 the default one, but do not gateway in this case.
2186 Yes, it is a hack.
2187 */
2188 if (res->fi && res->prefixlen < 4) {
2189 fib_info_put(res->fi);
2190 res->fi = NULL;
2191 }
2192 }
2193
2194
2195 rth = dst_alloc(&ipv4_dst_ops);
2196 if (!rth) {
2197 err = -ENOBUFS;
2198 goto cleanup;
2199 }
2200
2201 atomic_set(&rth->u.dst.__refcnt, 1);
2202 rth->u.dst.flags= DST_HOST;
2203#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2204 if (res->fi) {
2205 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2206 if (res->fi->fib_nhs > 1)
2207 rth->u.dst.flags |= DST_BALANCED;
2208 }
2209#endif
2210 if (in_dev->cnf.no_xfrm)
2211 rth->u.dst.flags |= DST_NOXFRM;
2212 if (in_dev->cnf.no_policy)
2213 rth->u.dst.flags |= DST_NOPOLICY;
2214
2215 rth->fl.fl4_dst = oldflp->fl4_dst;
2216 rth->fl.fl4_tos = tos;
2217 rth->fl.fl4_src = oldflp->fl4_src;
2218 rth->fl.oif = oldflp->oif;
2219#ifdef CONFIG_IP_ROUTE_FWMARK
2220 rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2221#endif
2222 rth->rt_dst = fl->fl4_dst;
2223 rth->rt_src = fl->fl4_src;
2224 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2225 /* get references to the devices that are to be held by the routing
2226 cache entry */
2227 rth->u.dst.dev = dev_out;
2228 dev_hold(dev_out);
2229 rth->idev = in_dev_get(dev_out);
2230 rth->rt_gateway = fl->fl4_dst;
2231 rth->rt_spec_dst= fl->fl4_src;
2232
2233 rth->u.dst.output=ip_output;
2234
2235 RT_CACHE_STAT_INC(out_slow_tot);
2236
2237 if (flags & RTCF_LOCAL) {
2238 rth->u.dst.input = ip_local_deliver;
2239 rth->rt_spec_dst = fl->fl4_dst;
2240 }
2241 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2242 rth->rt_spec_dst = fl->fl4_src;
2243 if (flags & RTCF_LOCAL &&
2244 !(dev_out->flags & IFF_LOOPBACK)) {
2245 rth->u.dst.output = ip_mc_output;
2246 RT_CACHE_STAT_INC(out_slow_mc);
2247 }
2248#ifdef CONFIG_IP_MROUTE
2249 if (res->type == RTN_MULTICAST) {
2250 if (IN_DEV_MFORWARD(in_dev) &&
2251 !LOCAL_MCAST(oldflp->fl4_dst)) {
2252 rth->u.dst.input = ip_mr_input;
2253 rth->u.dst.output = ip_mc_output;
2254 }
2255 }
2256#endif
2257 }
2258
2259 rt_set_nexthop(rth, res, 0);
2260
2261 rth->rt_flags = flags;
2262
2263 *result = rth;
2264 cleanup:
2265 /* release work reference to inet device */
2266 in_dev_put(in_dev);
2267
2268 return err;
2269}
2270
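/*
 * Default (single next hop) output path: build one rtable via
 * __mkroute_output() and intern it into the cache, hashing on the
 * original flow's destination, source and oif.
 */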
2271static inline int ip_mkroute_output_def(struct rtable **rp,
2272 struct fib_result* res,
2273 const struct flowi *fl,
2274 const struct flowi *oldflp,
2275 struct net_device *dev_out,
2276 unsigned flags)
2277{
2278 struct rtable *rth = NULL;
2279 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2280 unsigned hash;
2281 if (err == 0) {
2282 hash = rt_hash_code(oldflp->fl4_dst,
2283 oldflp->fl4_src ^ (oldflp->oif << 5));
2284 err = rt_intern_hash(hash, rth, rp);
2285 }
2286
2287 return err;
2288}
2289
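/*
 * With CONFIG_IP_ROUTE_MULTIPATH_CACHED a multipath fib_info gets one
 * cache entry per next hop: each hop is built, interned and described to
 * the multipath implementation via multipath_set_nhinfo(). Without that
 * option, or for a single next hop, this simply delegates to
 * ip_mkroute_output_def().
 */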
2290static inline int ip_mkroute_output(struct rtable** rp,
2291 struct fib_result* res,
2292 const struct flowi *fl,
2293 const struct flowi *oldflp,
2294 struct net_device *dev_out,
2295 unsigned flags)
2296{
2297#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2298 unsigned char hop;
2299 unsigned hash;
2300 int err = -EINVAL;
2301 struct rtable *rth = NULL;
2302
2303 if (res->fi && res->fi->fib_nhs > 1) {
2304 unsigned char hopcount = res->fi->fib_nhs;
2305
2306 for (hop = 0; hop < hopcount; hop++) {
2307 struct net_device *dev2nexthop;
2308
2309 res->nh_sel = hop;
2310
2311 /* hold a work reference to the output device */
2312 dev2nexthop = FIB_RES_DEV(*res);
2313 dev_hold(dev2nexthop);
2314
2315 /* put reference to previous result */
2316 if (hop)
2317 ip_rt_put(*rp);
2318
2319 err = __mkroute_output(&rth, res, fl, oldflp,
2320 dev2nexthop, flags);
2321
2322 if (err != 0)
2323 goto cleanup;
2324
2325 hash = rt_hash_code(oldflp->fl4_dst,
2326 oldflp->fl4_src ^
2327 (oldflp->oif << 5));
2328 err = rt_intern_hash(hash, rth, rp);
2329
2330 /* forward hop information to multipath impl. */
2331 multipath_set_nhinfo(rth,
2332 FIB_RES_NETWORK(*res),
2333 FIB_RES_NETMASK(*res),
2334 res->prefixlen,
2335 &FIB_RES_NH(*res));
2336 cleanup:
2337 /* release work reference to output device */
2338 dev_put(dev2nexthop);
2339
2340 if (err != 0)
2341 return err;
2342 }
2343 return err;
2344 } else {
2345 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2346 flags);
2347 }
2348#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2349 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2350#endif
2351}
2352
2353/*
2354 * Major route resolver routine.
2355 */
2356
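/*
 * Resolution proceeds roughly as follows: validate the requested source
 * address and map it to a device, honour an explicit oif, fill in any
 * missing source/destination (falling back to the loopback route),
 * consult the FIB, pick a multipath or default route when applicable,
 * and finally hand the result to ip_mkroute_output().
 */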
2357static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2358{
2359 u32 tos = RT_FL_TOS(oldflp);
2360 struct flowi fl = { .nl_u = { .ip4_u =
2361 { .daddr = oldflp->fl4_dst,
2362 .saddr = oldflp->fl4_src,
2363 .tos = tos & IPTOS_RT_MASK,
2364 .scope = ((tos & RTO_ONLINK) ?
2365 RT_SCOPE_LINK :
2366 RT_SCOPE_UNIVERSE),
2367#ifdef CONFIG_IP_ROUTE_FWMARK
2368 .fwmark = oldflp->fl4_fwmark
2369#endif
2370 } },
2371 .iif = loopback_dev.ifindex,
2372 .oif = oldflp->oif };
2373 struct fib_result res;
2374 unsigned flags = 0;
2375 struct net_device *dev_out = NULL;
2376 int free_res = 0;
2377 int err;
2378
2379
2380 res.fi = NULL;
2381#ifdef CONFIG_IP_MULTIPLE_TABLES
2382 res.r = NULL;
2383#endif
2384
2385 if (oldflp->fl4_src) {
2386 err = -EINVAL;
2387 if (MULTICAST(oldflp->fl4_src) ||
2388 BADCLASS(oldflp->fl4_src) ||
2389 ZERONET(oldflp->fl4_src))
2390 goto out;
2391
2392 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2393 dev_out = ip_dev_find(oldflp->fl4_src);
2394 if (dev_out == NULL)
2395 goto out;
2396
2397 /* I removed the check for oif == dev_out->oif here.
2398 It was wrong for two reasons:
2399 1. ip_dev_find(saddr) can return the wrong iface if saddr is
2400 assigned to multiple interfaces.
2401 2. Moreover, we are allowed to send packets with the saddr
2402 of another iface. --ANK
2403 */
2404
2405 if (oldflp->oif == 0
2406 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2407 /* Special hack: the user can direct multicasts
2408 and limited broadcast via the necessary interface
2409 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2410 This hack is not just for fun, it allows
2411 vic, vat and friends to work.
2412 They bind the socket to loopback, set ttl to zero
2413 and expect that it will work.
2414 From the viewpoint of the routing cache they are broken,
2415 because we are not allowed to build a multicast path
2416 with a loopback source addr (look, the routing cache
2417 cannot know that ttl is zero, so that the packet
2418 will not leave this host and the route is valid).
2419 Luckily, this hack is a good workaround.
2420 */
2421
2422 fl.oif = dev_out->ifindex;
2423 goto make_route;
2424 }
2425 if (dev_out)
2426 dev_put(dev_out);
2427 dev_out = NULL;
2428 }
2429
2430
2431 if (oldflp->oif) {
2432 dev_out = dev_get_by_index(oldflp->oif);
2433 err = -ENODEV;
2434 if (dev_out == NULL)
2435 goto out;
2436
2437 /* RACE: Check return value of inet_select_addr instead. */
2438 if (__in_dev_get_rtnl(dev_out) == NULL) {
2439 dev_put(dev_out);
2440 goto out; /* Wrong error code */
2441 }
2442
2443 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2444 if (!fl.fl4_src)
2445 fl.fl4_src = inet_select_addr(dev_out, 0,
2446 RT_SCOPE_LINK);
2447 goto make_route;
2448 }
2449 if (!fl.fl4_src) {
2450 if (MULTICAST(oldflp->fl4_dst))
2451 fl.fl4_src = inet_select_addr(dev_out, 0,
2452 fl.fl4_scope);
2453 else if (!oldflp->fl4_dst)
2454 fl.fl4_src = inet_select_addr(dev_out, 0,
2455 RT_SCOPE_HOST);
2456 }
2457 }
2458
2459 if (!fl.fl4_dst) {
2460 fl.fl4_dst = fl.fl4_src;
2461 if (!fl.fl4_dst)
2462 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2463 if (dev_out)
2464 dev_put(dev_out);
2465 dev_out = &loopback_dev;
2466 dev_hold(dev_out);
2467 fl.oif = loopback_dev.ifindex;
2468 res.type = RTN_LOCAL;
2469 flags |= RTCF_LOCAL;
2470 goto make_route;
2471 }
2472
2473 if (fib_lookup(&fl, &res)) {
2474 res.fi = NULL;
2475 if (oldflp->oif) {
2476 /* Apparently, the routing tables are wrong. Assume
2477 that the destination is on link.
2478
2479 WHY? DW.
2480 Because we are allowed to send to an iface
2481 even if it has NO routes and NO assigned
2482 addresses. When oif is specified, the routing
2483 tables are looked up with only one purpose:
2484 to catch whether the destination is gatewayed, rather than
2485 direct. Moreover, if MSG_DONTROUTE is set,
2486 we send the packet, ignoring both routing tables
2487 and ifaddr state. --ANK
2488
2489
2490 We could do this even if oif is unknown,
2491 likely as in IPv6, but we do not.
2492 */
2493
2494 if (fl.fl4_src == 0)
2495 fl.fl4_src = inet_select_addr(dev_out, 0,
2496 RT_SCOPE_LINK);
2497 res.type = RTN_UNICAST;
2498 goto make_route;
2499 }
2500 if (dev_out)
2501 dev_put(dev_out);
2502 err = -ENETUNREACH;
2503 goto out;
2504 }
2505 free_res = 1;
2506
2507 if (res.type == RTN_LOCAL) {
2508 if (!fl.fl4_src)
2509 fl.fl4_src = fl.fl4_dst;
2510 if (dev_out)
2511 dev_put(dev_out);
2512 dev_out = &loopback_dev;
2513 dev_hold(dev_out);
2514 fl.oif = dev_out->ifindex;
2515 if (res.fi)
2516 fib_info_put(res.fi);
2517 res.fi = NULL;
2518 flags |= RTCF_LOCAL;
2519 goto make_route;
2520 }
2521
2522#ifdef CONFIG_IP_ROUTE_MULTIPATH
2523 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2524 fib_select_multipath(&fl, &res);
2525 else
2526#endif
2527 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2528 fib_select_default(&fl, &res);
2529
2530 if (!fl.fl4_src)
2531 fl.fl4_src = FIB_RES_PREFSRC(res);
2532
2533 if (dev_out)
2534 dev_put(dev_out);
2535 dev_out = FIB_RES_DEV(res);
2536 dev_hold(dev_out);
2537 fl.oif = dev_out->ifindex;
2538
2539
2540make_route:
2541 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2542
2543
2544 if (free_res)
2545 fib_res_put(&res);
2546 if (dev_out)
2547 dev_put(dev_out);
2548out: return err;
2549}
2550
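/*
 * Output fast path: scan the cache chain under rcu_read_lock_bh(),
 * matching on dst, src, oif, fwmark and tos (iif must be 0 for output
 * routes). Cached multipath routes are resolved via
 * multipath_select_route(). A miss falls back to ip_route_output_slow().
 */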
2551int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2552{
2553 unsigned hash;
2554 struct rtable *rth;
2555
2556 hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5));
2557
2558 rcu_read_lock_bh();
2559 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2560 rth = rcu_dereference(rth->u.rt_next)) {
2561 if (rth->fl.fl4_dst == flp->fl4_dst &&
2562 rth->fl.fl4_src == flp->fl4_src &&
2563 rth->fl.iif == 0 &&
2564 rth->fl.oif == flp->oif &&
2565#ifdef CONFIG_IP_ROUTE_FWMARK
2566 rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2567#endif
2568 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2569 (IPTOS_RT_MASK | RTO_ONLINK))) {
2570
2571 /* check for multipath routes and choose one if
2572 * necessary
2573 */
2574 if (multipath_select_route(flp, rth, rp)) {
2575 dst_hold(&(*rp)->u.dst);
2576 RT_CACHE_STAT_INC(out_hit);
2577 rcu_read_unlock_bh();
2578 return 0;
2579 }
2580
2581 rth->u.dst.lastuse = jiffies;
2582 dst_hold(&rth->u.dst);
2583 rth->u.dst.__use++;
2584 RT_CACHE_STAT_INC(out_hit);
2585 rcu_read_unlock_bh();
2586 *rp = rth;
2587 return 0;
2588 }
2589 RT_CACHE_STAT_INC(out_hlist_search);
2590 }
2591 rcu_read_unlock_bh();
2592
2593 return ip_route_output_slow(rp, flp);
2594}
2595
2596EXPORT_SYMBOL_GPL(__ip_route_output_key);
2597
2598int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2599{
2600 int err;
2601
2602 if ((err = __ip_route_output_key(rp, flp)) != 0)
2603 return err;
2604
2605 if (flp->proto) {
2606 if (!flp->fl4_src)
2607 flp->fl4_src = (*rp)->rt_src;
2608 if (!flp->fl4_dst)
2609 flp->fl4_dst = (*rp)->rt_dst;
2610 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2611 }
2612
2613 return 0;
2614}
2615
2616EXPORT_SYMBOL_GPL(ip_route_output_flow);
2617
2618int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2619{
2620 return ip_route_output_flow(rp, flp, NULL, 0);
2621}
2622
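/*
 * Fill a netlink message describing this cached route: an rtmsg header
 * followed by RTA_DST/RTA_SRC/RTA_OIF/RTA_GATEWAY/RTA_PREFSRC attributes,
 * the metrics and an RTA_CACHEINFO block. For multicast input routes,
 * ipmr_get_route() may be asked to resolve the forwarding information.
 */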
2623static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2624 int nowait, unsigned int flags)
2625{
2626 struct rtable *rt = (struct rtable*)skb->dst;
2627 struct rtmsg *r;
2628 struct nlmsghdr *nlh;
2629 unsigned char *b = skb->tail;
2630 struct rta_cacheinfo ci;
2631#ifdef CONFIG_IP_MROUTE
2632 struct rtattr *eptr;
2633#endif
2634 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2635 r = NLMSG_DATA(nlh);
2636 r->rtm_family = AF_INET;
2637 r->rtm_dst_len = 32;
2638 r->rtm_src_len = 0;
2639 r->rtm_tos = rt->fl.fl4_tos;
2640 r->rtm_table = RT_TABLE_MAIN;
2641 r->rtm_type = rt->rt_type;
2642 r->rtm_scope = RT_SCOPE_UNIVERSE;
2643 r->rtm_protocol = RTPROT_UNSPEC;
2644 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2645 if (rt->rt_flags & RTCF_NOTIFY)
2646 r->rtm_flags |= RTM_F_NOTIFY;
2647 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2648 if (rt->fl.fl4_src) {
2649 r->rtm_src_len = 32;
2650 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2651 }
2652 if (rt->u.dst.dev)
2653 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2654#ifdef CONFIG_NET_CLS_ROUTE
2655 if (rt->u.dst.tclassid)
2656 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2657#endif
2658#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2659 if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2660 __u32 alg = rt->rt_multipath_alg;
2661
2662 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2663 }
2664#endif
2665 if (rt->fl.iif)
2666 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2667 else if (rt->rt_src != rt->fl.fl4_src)
2668 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2669 if (rt->rt_dst != rt->rt_gateway)
2670 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2671 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2672 goto rtattr_failure;
2673 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2674 ci.rta_used = rt->u.dst.__use;
2675 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2676 if (rt->u.dst.expires)
2677 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2678 else
2679 ci.rta_expires = 0;
2680 ci.rta_error = rt->u.dst.error;
2681 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
2682 if (rt->peer) {
2683 ci.rta_id = rt->peer->ip_id_count;
2684 if (rt->peer->tcp_ts_stamp) {
2685 ci.rta_ts = rt->peer->tcp_ts;
2686 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2687 }
2688 }
2689#ifdef CONFIG_IP_MROUTE
2690 eptr = (struct rtattr*)skb->tail;
2691#endif
2692 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2693 if (rt->fl.iif) {
2694#ifdef CONFIG_IP_MROUTE
2695 u32 dst = rt->rt_dst;
2696
2697 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2698 ipv4_devconf.mc_forwarding) {
2699 int err = ipmr_get_route(skb, r, nowait);
2700 if (err <= 0) {
2701 if (!nowait) {
2702 if (err == 0)
2703 return 0;
2704 goto nlmsg_failure;
2705 } else {
2706 if (err == -EMSGSIZE)
2707 goto nlmsg_failure;
2708 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2709 }
2710 }
2711 } else
2712#endif
2713 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2714 }
2715
2716 nlh->nlmsg_len = skb->tail - b;
2717 return skb->len;
2718
2719nlmsg_failure:
2720rtattr_failure:
2721 skb_trim(skb, b - skb->data);
2722 return -1;
2723}
2724
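/*
 * RTM_GETROUTE handler: build a dummy skb, then either simulate input
 * routing with ip_route_input() when RTA_IIF was supplied, or perform an
 * output lookup with ip_route_output_key(). The resulting route is
 * serialized by rt_fill_info() and unicast back to the requester.
 */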
2725int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2726{
2727 struct rtattr **rta = arg;
2728 struct rtmsg *rtm = NLMSG_DATA(nlh);
2729 struct rtable *rt = NULL;
2730 u32 dst = 0;
2731 u32 src = 0;
2732 int iif = 0;
2733 int err = -ENOBUFS;
2734 struct sk_buff *skb;
2735
2736 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2737 if (!skb)
2738 goto out;
2739
2740 /* Reserve room for dummy headers; this skb can pass
2741 through a good chunk of the routing engine.
2742 */
2743 skb->mac.raw = skb->nh.raw = skb->data;
2744
2745 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2746 skb->nh.iph->protocol = IPPROTO_ICMP;
2747 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2748
2749 if (rta[RTA_SRC - 1])
2750 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2751 if (rta[RTA_DST - 1])
2752 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2753 if (rta[RTA_IIF - 1])
2754 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2755
2756 if (iif) {
2757 struct net_device *dev = __dev_get_by_index(iif);
2758 err = -ENODEV;
2759 if (!dev)
2760 goto out_free;
2761 skb->protocol = htons(ETH_P_IP);
2762 skb->dev = dev;
2763 local_bh_disable();
2764 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2765 local_bh_enable();
2766 rt = (struct rtable*)skb->dst;
2767 if (!err && rt->u.dst.error)
2768 err = -rt->u.dst.error;
2769 } else {
2770 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2771 .saddr = src,
2772 .tos = rtm->rtm_tos } } };
2773 int oif = 0;
2774 if (rta[RTA_OIF - 1])
2775 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2776 fl.oif = oif;
2777 err = ip_route_output_key(&rt, &fl);
2778 }
2779 if (err)
2780 goto out_free;
2781
2782 skb->dst = &rt->u.dst;
2783 if (rtm->rtm_flags & RTM_F_NOTIFY)
2784 rt->rt_flags |= RTCF_NOTIFY;
2785
2786 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2787
2788 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2789 RTM_NEWROUTE, 0, 0);
2790 if (!err)
2791 goto out_free;
2792 if (err < 0) {
2793 err = -EMSGSIZE;
2794 goto out_free;
2795 }
2796
2797 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2798 if (err > 0)
2799 err = 0;
2800out: return err;
2801
2802out_free:
2803 kfree_skb(skb);
2804 goto out;
2805}
2806
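/*
 * Netlink dump callback: walk every hash bucket and emit one
 * RTM_NEWROUTE message per cached entry, using cb->args[0]/[1] to
 * remember the bucket and index at which a full skb forced us to stop.
 */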
2807int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2808{
2809 struct rtable *rt;
2810 int h, s_h;
2811 int idx, s_idx;
2812
2813 s_h = cb->args[0];
2814 s_idx = idx = cb->args[1];
2815 for (h = 0; h <= rt_hash_mask; h++) {
2816 if (h < s_h) continue;
2817 if (h > s_h)
2818 s_idx = 0;
2819 rcu_read_lock_bh();
2820 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2821 rt = rcu_dereference(rt->u.rt_next), idx++) {
2822 if (idx < s_idx)
2823 continue;
2824 skb->dst = dst_clone(&rt->u.dst);
2825 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2826 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2827 1, NLM_F_MULTI) <= 0) {
2828 dst_release(xchg(&skb->dst, NULL));
2829 rcu_read_unlock_bh();
2830 goto done;
2831 }
2832 dst_release(xchg(&skb->dst, NULL));
2833 }
2834 rcu_read_unlock_bh();
2835 }
2836
2837done:
2838 cb->args[0] = h;
2839 cb->args[1] = idx;
2840 return skb->len;
2841}
2842
2843void ip_rt_multicast_event(struct in_device *in_dev)
2844{
2845 rt_cache_flush(0);
2846}
2847
2848#ifdef CONFIG_SYSCTL
2849static int flush_delay;
2850
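/*
 * Writing to the "flush" sysctl stores the value in flush_delay and
 * triggers rt_cache_flush() with it; the entry is write-only (mode 0200)
 * and reads fail with -EINVAL. Assuming the table is registered under
 * /proc/sys/net/ipv4/route as in mainline, a flush can be requested with
 * for example:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 */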
2851static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2852 struct file *filp, void __user *buffer,
2853 size_t *lenp, loff_t *ppos)
2854{
2855 if (write) {
2856 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2857 rt_cache_flush(flush_delay);
2858 return 0;
2859 }
2860
2861 return -EINVAL;
2862}
2863
2864static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2865 int __user *name,
2866 int nlen,
2867 void __user *oldval,
2868 size_t __user *oldlenp,
2869 void __user *newval,
2870 size_t newlen,
2871 void **context)
2872{
2873 int delay;
2874 if (newlen != sizeof(int))
2875 return -EINVAL;
2876 if (get_user(delay, (int __user *)newval))
2877 return -EFAULT;
2878 rt_cache_flush(delay);
2879 return 0;
2880}
2881
2882ctl_table ipv4_route_table[] = {
2883 {
2884 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2885 .procname = "flush",
2886 .data = &flush_delay,
2887 .maxlen = sizeof(int),
2888 .mode = 0200,
2889 .proc_handler = &ipv4_sysctl_rtcache_flush,
2890 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2891 },
2892 {
2893 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2894 .procname = "min_delay",
2895 .data = &ip_rt_min_delay,
2896 .maxlen = sizeof(int),
2897 .mode = 0644,
2898 .proc_handler = &proc_dointvec_jiffies,
2899 .strategy = &sysctl_jiffies,
2900 },
2901 {
2902 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2903 .procname = "max_delay",
2904 .data = &ip_rt_max_delay,
2905 .maxlen = sizeof(int),
2906 .mode = 0644,
2907 .proc_handler = &proc_dointvec_jiffies,
2908 .strategy = &sysctl_jiffies,
2909 },
2910 {
2911 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2912 .procname = "gc_thresh",
2913 .data = &ipv4_dst_ops.gc_thresh,
2914 .maxlen = sizeof(int),
2915 .mode = 0644,
2916 .proc_handler = &proc_dointvec,
2917 },
2918 {
2919 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2920 .procname = "max_size",
2921 .data = &ip_rt_max_size,
2922 .maxlen = sizeof(int),
2923 .mode = 0644,
2924 .proc_handler = &proc_dointvec,
2925 },
2926 {
2927 /* Deprecated. Use gc_min_interval_ms */
2928
2929 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2930 .procname = "gc_min_interval",
2931 .data = &ip_rt_gc_min_interval,
2932 .maxlen = sizeof(int),
2933 .mode = 0644,
2934 .proc_handler = &proc_dointvec_jiffies,
2935 .strategy = &sysctl_jiffies,
2936 },
2937 {
2938 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2939 .procname = "gc_min_interval_ms",
2940 .data = &ip_rt_gc_min_interval,
2941 .maxlen = sizeof(int),
2942 .mode = 0644,
2943 .proc_handler = &proc_dointvec_ms_jiffies,
2944 .strategy = &sysctl_ms_jiffies,
2945 },
2946 {
2947 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2948 .procname = "gc_timeout",
2949 .data = &ip_rt_gc_timeout,
2950 .maxlen = sizeof(int),
2951 .mode = 0644,
2952 .proc_handler = &proc_dointvec_jiffies,
2953 .strategy = &sysctl_jiffies,
2954 },
2955 {
2956 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2957 .procname = "gc_interval",
2958 .data = &ip_rt_gc_interval,
2959 .maxlen = sizeof(int),
2960 .mode = 0644,
2961 .proc_handler = &proc_dointvec_jiffies,
2962 .strategy = &sysctl_jiffies,
2963 },
2964 {
2965 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2966 .procname = "redirect_load",
2967 .data = &ip_rt_redirect_load,
2968 .maxlen = sizeof(int),
2969 .mode = 0644,
2970 .proc_handler = &proc_dointvec,
2971 },
2972 {
2973 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2974 .procname = "redirect_number",
2975 .data = &ip_rt_redirect_number,
2976 .maxlen = sizeof(int),
2977 .mode = 0644,
2978 .proc_handler = &proc_dointvec,
2979 },
2980 {
2981 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2982 .procname = "redirect_silence",
2983 .data = &ip_rt_redirect_silence,
2984 .maxlen = sizeof(int),
2985 .mode = 0644,
2986 .proc_handler = &proc_dointvec,
2987 },
2988 {
2989 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2990 .procname = "error_cost",
2991 .data = &ip_rt_error_cost,
2992 .maxlen = sizeof(int),
2993 .mode = 0644,
2994 .proc_handler = &proc_dointvec,
2995 },
2996 {
2997 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2998 .procname = "error_burst",
2999 .data = &ip_rt_error_burst,
3000 .maxlen = sizeof(int),
3001 .mode = 0644,
3002 .proc_handler = &proc_dointvec,
3003 },
3004 {
3005 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3006 .procname = "gc_elasticity",
3007 .data = &ip_rt_gc_elasticity,
3008 .maxlen = sizeof(int),
3009 .mode = 0644,
3010 .proc_handler = &proc_dointvec,
3011 },
3012 {
3013 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3014 .procname = "mtu_expires",
3015 .data = &ip_rt_mtu_expires,
3016 .maxlen = sizeof(int),
3017 .mode = 0644,
3018 .proc_handler = &proc_dointvec_jiffies,
3019 .strategy = &sysctl_jiffies,
3020 },
3021 {
3022 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3023 .procname = "min_pmtu",
3024 .data = &ip_rt_min_pmtu,
3025 .maxlen = sizeof(int),
3026 .mode = 0644,
3027 .proc_handler = &proc_dointvec,
3028 },
3029 {
3030 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3031 .procname = "min_adv_mss",
3032 .data = &ip_rt_min_advmss,
3033 .maxlen = sizeof(int),
3034 .mode = 0644,
3035 .proc_handler = &proc_dointvec,
3036 },
3037 {
3038 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3039 .procname = "secret_interval",
3040 .data = &ip_rt_secret_interval,
3041 .maxlen = sizeof(int),
3042 .mode = 0644,
3043 .proc_handler = &proc_dointvec_jiffies,
3044 .strategy = &sysctl_jiffies,
3045 },
3046 { .ctl_name = 0 }
3047};
3048#endif
3049
3050#ifdef CONFIG_NET_CLS_ROUTE
3051struct ip_rt_acct *ip_rt_acct;
3052
3053/* This code sucks. But you should have seen it before! --RR */
3054
3055/* IP route accounting ptr for this logical cpu number. */
3056#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
3057
3058#ifdef CONFIG_PROC_FS
3059static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3060 int length, int *eof, void *data)
3061{
3062 unsigned int i;
3063
3064 if ((offset & 3) || (length & 3))
3065 return -EIO;
3066
3067 if (offset >= sizeof(struct ip_rt_acct) * 256) {
3068 *eof = 1;
3069 return 0;
3070 }
3071
3072 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3073 length = sizeof(struct ip_rt_acct) * 256 - offset;
3074 *eof = 1;
3075 }
3076
3077 offset /= sizeof(u32);
3078
3079 if (length > 0) {
3080 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3081 u32 *dst = (u32 *) buffer;
3082
3083 /* Copy first cpu. */
3084 *start = buffer;
3085 memcpy(dst, src, length);
3086
3087 /* Add the other cpus in, one int at a time */
3088 for_each_possible_cpu(i) {
3089 unsigned int j;
3090
3091 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3092
3093 for (j = 0; j < length/4; j++)
3094 dst[j] += src[j];
3095 }
3096 }
3097 return length;
3098}
3099#endif /* CONFIG_PROC_FS */
3100#endif /* CONFIG_NET_CLS_ROUTE */
3101
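/*
 * "rhash_entries=" on the kernel command line overrides the size of the
 * route cache hash table passed to alloc_large_system_hash() below,
 * e.g. booting with rhash_entries=65536 (an illustrative value).
 */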
3102static __initdata unsigned long rhash_entries;
3103static int __init set_rhash_entries(char *str)
3104{
3105 if (!str)
3106 return 0;
3107 rhash_entries = simple_strtoul(str, &str, 0);
3108 return 1;
3109}
3110__setup("rhash_entries=", set_rhash_entries);
3111
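/*
 * Boot-time initialisation: seed rt_hash_rnd, allocate the per-CPU
 * accounting area (CONFIG_NET_CLS_ROUTE), create the ip_dst_cache slab
 * and the route cache hash table, start the flush/expire/secret-rebuild
 * timers, and register the /proc entries and xfrm hooks when configured.
 */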
3112int __init ip_rt_init(void)
3113{
3114 int rc = 0;
3115
3116 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3117 (jiffies ^ (jiffies >> 7)));
3118
3119#ifdef CONFIG_NET_CLS_ROUTE
3120 {
3121 int order;
3122 for (order = 0;
3123 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3124 /* NOTHING */;
3125 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3126 if (!ip_rt_acct)
3127 panic("IP: failed to allocate ip_rt_acct\n");
3128 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3129 }
3130#endif
3131
3132 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3133 sizeof(struct rtable),
3134 0, SLAB_HWCACHE_ALIGN,
3135 NULL, NULL);
3136
3137 if (!ipv4_dst_ops.kmem_cachep)
3138 panic("IP: failed to allocate ip_dst_cache\n");
3139
3140 rt_hash_table = (struct rt_hash_bucket *)
3141 alloc_large_system_hash("IP route cache",
3142 sizeof(struct rt_hash_bucket),
3143 rhash_entries,
3144 (num_physpages >= 128 * 1024) ?
3145 15 : 17,
3146 HASH_HIGHMEM,
3147 &rt_hash_log,
3148 &rt_hash_mask,
3149 0);
3150 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3151 rt_hash_lock_init();
3152
3153 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3154 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3155
3156 devinet_init();
3157 ip_fib_init();
3158
3159 init_timer(&rt_flush_timer);
3160 rt_flush_timer.function = rt_run_flush;
3161 init_timer(&rt_periodic_timer);
3162 rt_periodic_timer.function = rt_check_expire;
3163 init_timer(&rt_secret_timer);
3164 rt_secret_timer.function = rt_secret_rebuild;
3165
3166 /* All the timers started at system startup tend
3167 to synchronize. Perturb them a bit.
3168 */
3169 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3170 ip_rt_gc_interval;
3171 add_timer(&rt_periodic_timer);
3172
3173 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3174 ip_rt_secret_interval;
3175 add_timer(&rt_secret_timer);
3176
3177#ifdef CONFIG_PROC_FS
3178 {
3179 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3180 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3181 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3182 proc_net_stat))) {
3183 return -ENOMEM;
3184 }
3185 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3186 }
3187#ifdef CONFIG_NET_CLS_ROUTE
3188 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3189#endif
3190#endif
3191#ifdef CONFIG_XFRM
3192 xfrm_init();
3193 xfrm4_init();
3194#endif
3195 return rc;
3196}
3197
3198EXPORT_SYMBOL(__ip_select_ident);
3199EXPORT_SYMBOL(ip_route_input);
3200EXPORT_SYMBOL(ip_route_output_key);