[NET]: Hashed spinlocks in net/ipv4/route.c
net/ipv4/route.c
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9 *
 10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 *
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
 36 * Alan Cox : Aligned routing errors more closely with BSD,
 37 * though our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
41 *
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
 57 * Eric Dumazet : hashed spinlocks
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
65#include <linux/config.h>
66#include <linux/module.h>
67#include <asm/uaccess.h>
68#include <asm/system.h>
69#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
72#include <linux/sched.h>
73#include <linux/mm.h>
74#include <linux/string.h>
75#include <linux/socket.h>
76#include <linux/sockios.h>
77#include <linux/errno.h>
78#include <linux/in.h>
79#include <linux/inet.h>
80#include <linux/netdevice.h>
81#include <linux/proc_fs.h>
82#include <linux/init.h>
83#include <linux/skbuff.h>
84#include <linux/rtnetlink.h>
85#include <linux/inetdevice.h>
86#include <linux/igmp.h>
87#include <linux/pkt_sched.h>
88#include <linux/mroute.h>
89#include <linux/netfilter_ipv4.h>
90#include <linux/random.h>
91#include <linux/jhash.h>
92#include <linux/rcupdate.h>
93#include <linux/times.h>
94#include <net/protocol.h>
95#include <net/ip.h>
96#include <net/route.h>
97#include <net/inetpeer.h>
98#include <net/sock.h>
99#include <net/ip_fib.h>
100#include <net/arp.h>
101#include <net/tcp.h>
102#include <net/icmp.h>
103#include <net/xfrm.h>
104#include <net/ip_mp_alg.h>
105#ifdef CONFIG_SYSCTL
106#include <linux/sysctl.h>
107#endif
108
109#define RT_FL_TOS(oldflp) \
110 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
111
112#define IP_MAX_MTU 0xFFF0
113
114#define RT_GC_TIMEOUT (300*HZ)
115
116static int ip_rt_min_delay = 2 * HZ;
117static int ip_rt_max_delay = 10 * HZ;
118static int ip_rt_max_size;
119static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
120static int ip_rt_gc_interval = 60 * HZ;
121static int ip_rt_gc_min_interval = HZ / 2;
122static int ip_rt_redirect_number = 9;
123static int ip_rt_redirect_load = HZ / 50;
124static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
125static int ip_rt_error_cost = HZ;
126static int ip_rt_error_burst = 5 * HZ;
127static int ip_rt_gc_elasticity = 8;
128static int ip_rt_mtu_expires = 10 * 60 * HZ;
129static int ip_rt_min_pmtu = 512 + 20 + 20;
130static int ip_rt_min_advmss = 256;
131static int ip_rt_secret_interval = 10 * 60 * HZ;
132static unsigned long rt_deadline;
133
134#define RTprint(a...) printk(KERN_DEBUG a)
135
136static struct timer_list rt_flush_timer;
137static struct timer_list rt_periodic_timer;
138static struct timer_list rt_secret_timer;
139
140/*
141 * Interface to generic destination cache.
142 */
143
144static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
145static void ipv4_dst_destroy(struct dst_entry *dst);
146static void ipv4_dst_ifdown(struct dst_entry *dst,
147 struct net_device *dev, int how);
148static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
149static void ipv4_link_failure(struct sk_buff *skb);
150static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
151static int rt_garbage_collect(void);
152
153
154static struct dst_ops ipv4_dst_ops = {
155 .family = AF_INET,
156 .protocol = __constant_htons(ETH_P_IP),
157 .gc = rt_garbage_collect,
158 .check = ipv4_dst_check,
159 .destroy = ipv4_dst_destroy,
160 .ifdown = ipv4_dst_ifdown,
161 .negative_advice = ipv4_negative_advice,
162 .link_failure = ipv4_link_failure,
163 .update_pmtu = ip_rt_update_pmtu,
164 .entry_size = sizeof(struct rtable),
165};
166
167#define ECN_OR_COST(class) TC_PRIO_##class
168
169__u8 ip_tos2prio[16] = {
170 TC_PRIO_BESTEFFORT,
171 ECN_OR_COST(FILLER),
172 TC_PRIO_BESTEFFORT,
173 ECN_OR_COST(BESTEFFORT),
174 TC_PRIO_BULK,
175 ECN_OR_COST(BULK),
176 TC_PRIO_BULK,
177 ECN_OR_COST(BULK),
178 TC_PRIO_INTERACTIVE,
179 ECN_OR_COST(INTERACTIVE),
180 TC_PRIO_INTERACTIVE,
181 ECN_OR_COST(INTERACTIVE),
182 TC_PRIO_INTERACTIVE_BULK,
183 ECN_OR_COST(INTERACTIVE_BULK),
184 TC_PRIO_INTERACTIVE_BULK,
185 ECN_OR_COST(INTERACTIVE_BULK)
186};
187
188
189/*
190 * Route cache.
191 */
192
 193/* The locking scheme is rather straightforward:
194 *
195 * 1) Read-Copy Update protects the buckets of the central route hash.
196 * 2) Only writers remove entries, and they hold the lock
197 * as they look at rtable reference counts.
198 * 3) Only readers acquire references to rtable entries,
199 * they do so with atomic increments and with the
200 * lock held.
201 */
202
203struct rt_hash_bucket {
204 struct rtable *chain;
205};
206#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
207/*
 208 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
 209 * The size of this table is a power of two and depends on the number of CPUs.
210 */
211#if NR_CPUS >= 32
212#define RT_HASH_LOCK_SZ 4096
213#elif NR_CPUS >= 16
214#define RT_HASH_LOCK_SZ 2048
215#elif NR_CPUS >= 8
216#define RT_HASH_LOCK_SZ 1024
217#elif NR_CPUS >= 4
218#define RT_HASH_LOCK_SZ 512
219#else
220#define RT_HASH_LOCK_SZ 256
221#endif
222
223static spinlock_t *rt_hash_locks;
224# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
225# define rt_hash_lock_init() { \
226 int i; \
227 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
228 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
229 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
230 spin_lock_init(&rt_hash_locks[i]); \
231 }
232#else
233# define rt_hash_lock_addr(slot) NULL
234# define rt_hash_lock_init()
235#endif
236
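To make the sharing scheme above concrete, here is a minimal userspace model of the same idea: a hash table whose buckets borrow locks from a much smaller, power-of-two lock array. It is only a sketch — pthread spinlocks stand in for the kernel's spinlock_t, readers are not shown because the kernel readers use RCU rather than these locks, and names such as demo_lock_addr are illustrative, not kernel identifiers.

#include <pthread.h>

#define DEMO_HASH_SIZE 1024   /* number of hash buckets (power of two) */
#define DEMO_LOCK_SZ     64   /* far fewer locks than buckets (power of two) */

struct demo_entry  { struct demo_entry *next; unsigned key; };
struct demo_bucket { struct demo_entry *chain; };

static struct demo_bucket demo_table[DEMO_HASH_SIZE];
static pthread_spinlock_t demo_locks[DEMO_LOCK_SZ];

/* Same mapping idea as rt_hash_lock_addr(): the bucket index is masked down
 * to a lock slot, so buckets share locks but contention stays spread out. */
static pthread_spinlock_t *demo_lock_addr(unsigned slot)
{
	return &demo_locks[slot & (DEMO_LOCK_SZ - 1)];
}

static void demo_locks_init(void)
{
	for (int i = 0; i < DEMO_LOCK_SZ; i++)
		pthread_spin_init(&demo_locks[i], PTHREAD_PROCESS_PRIVATE);
}

/* Writers (insert/remove) take the shared lock covering their bucket. */
static void demo_insert(struct demo_entry *e)
{
	unsigned slot = e->key & (DEMO_HASH_SIZE - 1);

	pthread_spin_lock(demo_lock_addr(slot));
	e->next = demo_table[slot].chain;
	demo_table[slot].chain = e;
	pthread_spin_unlock(demo_lock_addr(slot));
}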
237static struct rt_hash_bucket *rt_hash_table;
238static unsigned rt_hash_mask;
239static int rt_hash_log;
240static unsigned int rt_hash_rnd;
241
242struct rt_cache_stat *rt_cache_stat;
243
244static int rt_intern_hash(unsigned hash, struct rtable *rth,
245 struct rtable **res);
246
247static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
248{
249 return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
250 & rt_hash_mask);
251}
252
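For reference, the input and output paths derive the bucket from the flow key in the same way: the source key is perturbed with the interface index (saddr ^ (ifindex << 5)) before hashing, and the random rt_hash_rnd seed is mixed in so placement in the chains is not easily predictable. A rough userspace sketch of that shape follows; demo_mix is a placeholder mixer, not the kernel's jhash_3words.

#include <stdint.h>

/* Placeholder for jhash_3words(): any decent 3-word integer mixer will do. */
static uint32_t demo_mix(uint32_t a, uint32_t b, uint32_t c, uint32_t seed)
{
	uint32_t h = seed ^ a;

	h = (h ^ b) * 0x9e3779b1u;
	h ^= h >> 16;
	h = (h ^ c) * 0x85ebca6bu;
	h ^= h >> 13;
	return h;
}

/* Mirrors how rt_hash_code() is called: tos comes from the packet, ifindex
 * from the input (or output) device, and hash_mask keeps the result in range. */
static unsigned demo_bucket(uint32_t daddr, uint32_t saddr, int ifindex,
			    uint8_t tos, uint32_t seed, unsigned hash_mask)
{
	return demo_mix(daddr, saddr ^ ((uint32_t)ifindex << 5), tos, seed)
		& hash_mask;
}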
253#ifdef CONFIG_PROC_FS
254struct rt_cache_iter_state {
255 int bucket;
256};
257
258static struct rtable *rt_cache_get_first(struct seq_file *seq)
259{
260 struct rtable *r = NULL;
261 struct rt_cache_iter_state *st = seq->private;
262
263 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
264 rcu_read_lock_bh();
265 r = rt_hash_table[st->bucket].chain;
266 if (r)
267 break;
268 rcu_read_unlock_bh();
269 }
270 return r;
271}
272
273static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
274{
275 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
276
277 r = r->u.rt_next;
278 while (!r) {
279 rcu_read_unlock_bh();
280 if (--st->bucket < 0)
281 break;
282 rcu_read_lock_bh();
283 r = rt_hash_table[st->bucket].chain;
284 }
285 return r;
286}
287
288static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
289{
290 struct rtable *r = rt_cache_get_first(seq);
291
292 if (r)
293 while (pos && (r = rt_cache_get_next(seq, r)))
294 --pos;
295 return pos ? NULL : r;
296}
297
298static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
299{
300 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
301}
302
303static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
304{
305 struct rtable *r = NULL;
306
307 if (v == SEQ_START_TOKEN)
308 r = rt_cache_get_first(seq);
309 else
310 r = rt_cache_get_next(seq, v);
311 ++*pos;
312 return r;
313}
314
315static void rt_cache_seq_stop(struct seq_file *seq, void *v)
316{
317 if (v && v != SEQ_START_TOKEN)
318 rcu_read_unlock_bh();
319}
320
321static int rt_cache_seq_show(struct seq_file *seq, void *v)
322{
323 if (v == SEQ_START_TOKEN)
324 seq_printf(seq, "%-127s\n",
325 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
326 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
327 "HHUptod\tSpecDst");
328 else {
329 struct rtable *r = v;
330 char temp[256];
331
332 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
333 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
334 r->u.dst.dev ? r->u.dst.dev->name : "*",
335 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
336 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
337 r->u.dst.__use, 0, (unsigned long)r->rt_src,
338 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
339 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
340 dst_metric(&r->u.dst, RTAX_WINDOW),
341 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
342 dst_metric(&r->u.dst, RTAX_RTTVAR)),
343 r->fl.fl4_tos,
344 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
345 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
346 dev_queue_xmit) : 0,
347 r->rt_spec_dst);
348 seq_printf(seq, "%-127s\n", temp);
349 }
350 return 0;
351}
352
353static struct seq_operations rt_cache_seq_ops = {
354 .start = rt_cache_seq_start,
355 .next = rt_cache_seq_next,
356 .stop = rt_cache_seq_stop,
357 .show = rt_cache_seq_show,
358};
359
360static int rt_cache_seq_open(struct inode *inode, struct file *file)
361{
362 struct seq_file *seq;
363 int rc = -ENOMEM;
364 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
365
366 if (!s)
367 goto out;
368 rc = seq_open(file, &rt_cache_seq_ops);
369 if (rc)
370 goto out_kfree;
371 seq = file->private_data;
372 seq->private = s;
373 memset(s, 0, sizeof(*s));
374out:
375 return rc;
376out_kfree:
377 kfree(s);
378 goto out;
379}
380
381static struct file_operations rt_cache_seq_fops = {
382 .owner = THIS_MODULE,
383 .open = rt_cache_seq_open,
384 .read = seq_read,
385 .llseek = seq_lseek,
386 .release = seq_release_private,
387};
388
389
390static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
391{
392 int cpu;
393
394 if (*pos == 0)
395 return SEQ_START_TOKEN;
396
397 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
398 if (!cpu_possible(cpu))
399 continue;
400 *pos = cpu+1;
401 return per_cpu_ptr(rt_cache_stat, cpu);
402 }
403 return NULL;
404}
405
406static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
407{
408 int cpu;
409
410 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
411 if (!cpu_possible(cpu))
412 continue;
413 *pos = cpu+1;
414 return per_cpu_ptr(rt_cache_stat, cpu);
415 }
416 return NULL;
417
418}
419
420static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
421{
422
423}
424
425static int rt_cpu_seq_show(struct seq_file *seq, void *v)
426{
427 struct rt_cache_stat *st = v;
428
429 if (v == SEQ_START_TOKEN) {
 430 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
431 return 0;
432 }
433
434 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
435 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
436 atomic_read(&ipv4_dst_ops.entries),
437 st->in_hit,
438 st->in_slow_tot,
439 st->in_slow_mc,
440 st->in_no_route,
441 st->in_brd,
442 st->in_martian_dst,
443 st->in_martian_src,
444
445 st->out_hit,
446 st->out_slow_tot,
447 st->out_slow_mc,
448
449 st->gc_total,
450 st->gc_ignored,
451 st->gc_goal_miss,
452 st->gc_dst_overflow,
453 st->in_hlist_search,
454 st->out_hlist_search
455 );
456 return 0;
457}
458
459static struct seq_operations rt_cpu_seq_ops = {
460 .start = rt_cpu_seq_start,
461 .next = rt_cpu_seq_next,
462 .stop = rt_cpu_seq_stop,
463 .show = rt_cpu_seq_show,
464};
465
466
467static int rt_cpu_seq_open(struct inode *inode, struct file *file)
468{
469 return seq_open(file, &rt_cpu_seq_ops);
470}
471
472static struct file_operations rt_cpu_seq_fops = {
473 .owner = THIS_MODULE,
474 .open = rt_cpu_seq_open,
475 .read = seq_read,
476 .llseek = seq_lseek,
477 .release = seq_release,
478};
479
480#endif /* CONFIG_PROC_FS */
481
482static __inline__ void rt_free(struct rtable *rt)
483{
484 multipath_remove(rt);
485 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
486}
487
488static __inline__ void rt_drop(struct rtable *rt)
489{
490 multipath_remove(rt);
491 ip_rt_put(rt);
492 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
493}
494
495static __inline__ int rt_fast_clean(struct rtable *rth)
496{
 497 /* Kill broadcast/multicast entries very aggressively, if they
 498 collide in the hash table with more useful entries */
499 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
500 rth->fl.iif && rth->u.rt_next;
501}
502
503static __inline__ int rt_valuable(struct rtable *rth)
504{
505 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
506 rth->u.dst.expires;
507}
508
509static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
510{
511 unsigned long age;
512 int ret = 0;
513
514 if (atomic_read(&rth->u.dst.__refcnt))
515 goto out;
516
517 ret = 1;
518 if (rth->u.dst.expires &&
519 time_after_eq(jiffies, rth->u.dst.expires))
520 goto out;
521
522 age = jiffies - rth->u.dst.lastuse;
523 ret = 0;
524 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
525 (age <= tmo2 && rt_valuable(rth)))
526 goto out;
527 ret = 1;
528out: return ret;
529}
530
531/* Bits of score are:
532 * 31: very valuable
533 * 30: not quite useless
534 * 29..0: usage counter
535 */
536static inline u32 rt_score(struct rtable *rt)
537{
538 u32 score = jiffies - rt->u.dst.lastuse;
539
540 score = ~score & ~(3<<30);
541
542 if (rt_valuable(rt))
543 score |= (1<<31);
544
545 if (!rt->fl.iif ||
546 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
547 score |= (1<<30);
548
549 return score;
550}
551
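The effect of the bit layout above is that eviction candidates sort first by importance and only then by idle time: bit 31 protects redirected/notified or expiring routes, bit 30 protects output and unicast-forwarding routes, and the low bits make more recently used entries score higher. Below is a small userspace model of the comparison rt_intern_hash() performs while scanning a chain; the struct and its field names are illustrative, not the kernel's.

#include <stdint.h>

struct demo_rt {
	unsigned long idle_age;   /* jiffies since last use */
	int valuable;             /* redirected/notify flag or pending expiry */
	int fast_path;            /* output route or unicast forwarding */
};

static uint32_t demo_rt_score(const struct demo_rt *rt)
{
	/* Older entries get smaller scores; the two top bits are reserved. */
	uint32_t score = ~(uint32_t)rt->idle_age & ~(3u << 30);

	if (rt->valuable)
		score |= 1u << 31;
	if (rt->fast_path)
		score |= 1u << 30;
	return score;
}

/* rt_intern_hash() keeps the entry with the minimum score as the eviction
 * candidate, so an idle broadcast/multicast input route is evicted first. */
static const struct demo_rt *demo_pick_victim(const struct demo_rt *a,
					      const struct demo_rt *b)
{
	return demo_rt_score(a) <= demo_rt_score(b) ? a : b;
}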
552static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
553{
554 return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
555 fl1->oif == fl2->oif &&
556 fl1->iif == fl2->iif;
557}
558
559#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
560static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
561 struct rtable *expentry,
562 int *removed_count)
563{
564 int passedexpired = 0;
565 struct rtable **nextstep = NULL;
566 struct rtable **rthp = chain_head;
567 struct rtable *rth;
568
569 if (removed_count)
570 *removed_count = 0;
571
572 while ((rth = *rthp) != NULL) {
573 if (rth == expentry)
574 passedexpired = 1;
575
576 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
577 compare_keys(&(*rthp)->fl, &expentry->fl)) {
578 if (*rthp == expentry) {
579 *rthp = rth->u.rt_next;
580 continue;
581 } else {
582 *rthp = rth->u.rt_next;
583 rt_free(rth);
584 if (removed_count)
585 ++(*removed_count);
586 }
587 } else {
588 if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
589 passedexpired && !nextstep)
590 nextstep = &rth->u.rt_next;
591
592 rthp = &rth->u.rt_next;
593 }
594 }
595
596 rt_free(expentry);
597 if (removed_count)
598 ++(*removed_count);
599
600 return nextstep;
601}
602#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
603
604
605/* This runs via a timer and thus is always in BH context. */
606static void rt_check_expire(unsigned long dummy)
607{
608 static int rover;
609 int i = rover, t;
610 struct rtable *rth, **rthp;
611 unsigned long now = jiffies;
612
613 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
614 t -= ip_rt_gc_timeout) {
615 unsigned long tmo = ip_rt_gc_timeout;
616
617 i = (i + 1) & rt_hash_mask;
618 rthp = &rt_hash_table[i].chain;
619
 620 spin_lock(rt_hash_lock_addr(i));
621 while ((rth = *rthp) != NULL) {
622 if (rth->u.dst.expires) {
623 /* Entry is expired even if it is in use */
624 if (time_before_eq(now, rth->u.dst.expires)) {
625 tmo >>= 1;
626 rthp = &rth->u.rt_next;
627 continue;
628 }
629 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
630 tmo >>= 1;
631 rthp = &rth->u.rt_next;
632 continue;
633 }
634
635 /* Cleanup aged off entries. */
636#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
637 /* remove all related balanced entries if necessary */
638 if (rth->u.dst.flags & DST_BALANCED) {
639 rthp = rt_remove_balanced_route(
640 &rt_hash_table[i].chain,
641 rth, NULL);
642 if (!rthp)
643 break;
644 } else {
645 *rthp = rth->u.rt_next;
646 rt_free(rth);
647 }
648#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
649 *rthp = rth->u.rt_next;
650 rt_free(rth);
651#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
652 }
 653 spin_unlock(rt_hash_lock_addr(i));
654
655 /* Fallback loop breaker. */
656 if (time_after(jiffies, now))
657 break;
658 }
659 rover = i;
660 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
661}
662
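One detail worth calling out in rt_check_expire() above: every entry that survives the scan halves the grace period (tmo >>= 1), so the deeper a chain grows, the harder its tail is pruned. A standalone sketch of that heuristic on a plain linked list — rt_may_expire() is reduced here to a simple idle-age threshold, and all names are illustrative:

#include <stddef.h>

struct demo_node {
	struct demo_node *next;
	unsigned long age;        /* how long since last use */
	int in_use;               /* reference held elsewhere */
};

/* Prune a chain the way rt_check_expire() does: the allowed idle time starts
 * at gc_timeout and is halved for every entry we decide to keep. */
static void demo_prune_chain(struct demo_node **chain, unsigned long gc_timeout,
			     void (*free_node)(struct demo_node *))
{
	unsigned long tmo = gc_timeout;
	struct demo_node **np = chain, *n;

	while ((n = *np) != NULL) {
		if (n->in_use || n->age <= tmo) {
			tmo >>= 1;        /* kept: later entries get less slack */
			np = &n->next;
			continue;
		}
		*np = n->next;            /* aged out: unlink and free */
		free_node(n);
	}
}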
663/* This can run from both BH and non-BH contexts, the latter
664 * in the case of a forced flush event.
665 */
666static void rt_run_flush(unsigned long dummy)
667{
668 int i;
669 struct rtable *rth, *next;
670
671 rt_deadline = 0;
672
673 get_random_bytes(&rt_hash_rnd, 4);
674
675 for (i = rt_hash_mask; i >= 0; i--) {
 676 spin_lock_bh(rt_hash_lock_addr(i));
677 rth = rt_hash_table[i].chain;
678 if (rth)
679 rt_hash_table[i].chain = NULL;
 680 spin_unlock_bh(rt_hash_lock_addr(i));
681
682 for (; rth; rth = next) {
683 next = rth->u.rt_next;
684 rt_free(rth);
685 }
686 }
687}
688
689static DEFINE_SPINLOCK(rt_flush_lock);
690
691void rt_cache_flush(int delay)
692{
693 unsigned long now = jiffies;
694 int user_mode = !in_softirq();
695
696 if (delay < 0)
697 delay = ip_rt_min_delay;
698
699 /* flush existing multipath state*/
700 multipath_flush();
701
702 spin_lock_bh(&rt_flush_lock);
703
704 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
705 long tmo = (long)(rt_deadline - now);
706
 707 /* If the flush timer is already running
 708 and the flush request is not immediate (delay > 0):
 709
 710 if the deadline has not been reached, prolong the timer to "delay",
 711 otherwise fire it at the deadline.
 712 */
713
714 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
715 tmo = 0;
716
717 if (delay > tmo)
718 delay = tmo;
719 }
720
721 if (delay <= 0) {
722 spin_unlock_bh(&rt_flush_lock);
723 rt_run_flush(0);
724 return;
725 }
726
727 if (rt_deadline == 0)
728 rt_deadline = now + ip_rt_max_delay;
729
730 mod_timer(&rt_flush_timer, now+delay);
731 spin_unlock_bh(&rt_flush_lock);
732}
733
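The timer handling in rt_cache_flush() above boils down to clamping the requested delay against a hard deadline that is armed the first time a delayed flush is requested. Here is a small model of just that arithmetic, with the locking and timer plumbing left out; times are jiffies-like ticks and the parameter names are illustrative.

/* Returns the delay (in ticks) before the flush should fire; 0 means flush
 * immediately. *deadline is armed on the first delayed request and cleared
 * when the flush actually runs, mirroring rt_deadline above. */
static long demo_flush_delay(long requested, unsigned long now,
			     unsigned long *deadline, int timer_was_pending,
			     int user_mode, long min_delay, long max_delay)
{
	long delay = requested < 0 ? min_delay : requested;

	if (timer_was_pending && delay > 0 && *deadline) {
		long tmo = (long)(*deadline - now);

		/* A user-requested flush close to the deadline fires now. */
		if (user_mode && tmo < max_delay - min_delay)
			tmo = 0;
		if (delay > tmo)
			delay = tmo;
	}

	if (delay <= 0) {
		*deadline = 0;            /* rt_run_flush() clears rt_deadline */
		return 0;
	}

	if (*deadline == 0)
		*deadline = now + max_delay;
	return delay;
}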
734static void rt_secret_rebuild(unsigned long dummy)
735{
736 unsigned long now = jiffies;
737
738 rt_cache_flush(0);
739 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
740}
741
742/*
743 Short description of GC goals.
744
 745 We want to build an algorithm which will keep the routing cache
 746 at some equilibrium point, where the number of aged-off entries
 747 is kept approximately equal to newly generated ones.
 748
 749 The current expiration strength is the variable "expire".
 750 We try to adjust it dynamically, so that when the network
 751 is idle expire is large enough to keep enough warm entries,
 752 and when load increases it shrinks to limit the cache size.
753 */
754
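The first half of rt_garbage_collect() below is just arithmetic on the current entry count: it derives how many entries to reap ("goal") and drags the target size ("equilibrium") toward the observed load. The following is a simplified model of that computation, without the hash walk or the locking; hash_buckets corresponds to rt_hash_mask + 1 and elasticity_slots to ip_rt_gc_elasticity << rt_hash_log.

/* Compute how many cache entries one GC pass should try to free. */
static int demo_gc_goal(int entries, int gc_thresh, int elasticity_slots,
			unsigned hash_buckets, int *equilibrium)
{
	int goal = entries - elasticity_slots;

	if (goal <= 0) {
		/* Plenty of room: keep the target at least at gc_thresh. */
		if (*equilibrium < gc_thresh)
			*equilibrium = gc_thresh;
		goal = entries - *equilibrium;
		if (goal > 0) {
			*equilibrium += goal / 2 < (int)hash_buckets
					? goal / 2 : (int)hash_buckets;
			goal = entries - *equilibrium;
		}
	} else {
		/* Dangerous area: shrink hard, at least one entry per bucket. */
		goal = goal / 2 > (int)hash_buckets ? goal / 2 : (int)hash_buckets;
		*equilibrium = entries - goal;
	}
	return goal;
}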
755static int rt_garbage_collect(void)
756{
757 static unsigned long expire = RT_GC_TIMEOUT;
758 static unsigned long last_gc;
759 static int rover;
760 static int equilibrium;
761 struct rtable *rth, **rthp;
762 unsigned long now = jiffies;
763 int goal;
764
765 /*
766 * Garbage collection is pretty expensive,
767 * do not make it too frequently.
768 */
769
770 RT_CACHE_STAT_INC(gc_total);
771
772 if (now - last_gc < ip_rt_gc_min_interval &&
773 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
774 RT_CACHE_STAT_INC(gc_ignored);
775 goto out;
776 }
777
778 /* Calculate number of entries, which we want to expire now. */
779 goal = atomic_read(&ipv4_dst_ops.entries) -
780 (ip_rt_gc_elasticity << rt_hash_log);
781 if (goal <= 0) {
782 if (equilibrium < ipv4_dst_ops.gc_thresh)
783 equilibrium = ipv4_dst_ops.gc_thresh;
784 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
785 if (goal > 0) {
786 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
787 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
788 }
789 } else {
790 /* We are in dangerous area. Try to reduce cache really
791 * aggressively.
792 */
793 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
794 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
795 }
796
797 if (now - last_gc >= ip_rt_gc_min_interval)
798 last_gc = now;
799
800 if (goal <= 0) {
801 equilibrium += goal;
802 goto work_done;
803 }
804
805 do {
806 int i, k;
807
808 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
809 unsigned long tmo = expire;
810
811 k = (k + 1) & rt_hash_mask;
812 rthp = &rt_hash_table[k].chain;
 813 spin_lock_bh(rt_hash_lock_addr(k));
814 while ((rth = *rthp) != NULL) {
815 if (!rt_may_expire(rth, tmo, expire)) {
816 tmo >>= 1;
817 rthp = &rth->u.rt_next;
818 continue;
819 }
820#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
821 /* remove all related balanced entries
822 * if necessary
823 */
824 if (rth->u.dst.flags & DST_BALANCED) {
825 int r;
826
827 rthp = rt_remove_balanced_route(
828 &rt_hash_table[i].chain,
829 rth,
830 &r);
831 goal -= r;
832 if (!rthp)
833 break;
834 } else {
835 *rthp = rth->u.rt_next;
836 rt_free(rth);
837 goal--;
838 }
839#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
840 *rthp = rth->u.rt_next;
841 rt_free(rth);
842 goal--;
843#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
844 }
 845 spin_unlock_bh(rt_hash_lock_addr(k));
846 if (goal <= 0)
847 break;
848 }
849 rover = k;
850
851 if (goal <= 0)
852 goto work_done;
853
 854 /* Goal is not achieved. We stop the process if:
 855
 856 - expire has been reduced to zero; otherwise, expire is halved.
 857 - the table is not full.
 858 - we are called from interrupt context.
 859 - the jiffies check is just a fallback/debug loop breaker.
 860 We will not spin here for a long time in any case.
 861 */
862
863 RT_CACHE_STAT_INC(gc_goal_miss);
864
865 if (expire == 0)
866 break;
867
868 expire >>= 1;
869#if RT_CACHE_DEBUG >= 2
870 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
871 atomic_read(&ipv4_dst_ops.entries), goal, i);
872#endif
873
874 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
875 goto out;
876 } while (!in_softirq() && time_before_eq(jiffies, now));
877
878 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
879 goto out;
880 if (net_ratelimit())
881 printk(KERN_WARNING "dst cache overflow\n");
882 RT_CACHE_STAT_INC(gc_dst_overflow);
883 return 1;
884
885work_done:
886 expire += ip_rt_gc_min_interval;
887 if (expire > ip_rt_gc_timeout ||
888 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
889 expire = ip_rt_gc_timeout;
890#if RT_CACHE_DEBUG >= 2
891 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
892 atomic_read(&ipv4_dst_ops.entries), goal, rover);
893#endif
894out: return 0;
895}
896
897static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
898{
899 struct rtable *rth, **rthp;
900 unsigned long now;
901 struct rtable *cand, **candp;
902 u32 min_score;
903 int chain_length;
904 int attempts = !in_softirq();
905
906restart:
907 chain_length = 0;
908 min_score = ~(u32)0;
909 cand = NULL;
910 candp = NULL;
911 now = jiffies;
912
913 rthp = &rt_hash_table[hash].chain;
914
 915 spin_lock_bh(rt_hash_lock_addr(hash));
916 while ((rth = *rthp) != NULL) {
917#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
918 if (!(rth->u.dst.flags & DST_BALANCED) &&
919 compare_keys(&rth->fl, &rt->fl)) {
920#else
921 if (compare_keys(&rth->fl, &rt->fl)) {
922#endif
923 /* Put it first */
924 *rthp = rth->u.rt_next;
925 /*
926 * Since lookup is lockfree, the deletion
927 * must be visible to another weakly ordered CPU before
928 * the insertion at the start of the hash chain.
929 */
930 rcu_assign_pointer(rth->u.rt_next,
931 rt_hash_table[hash].chain);
932 /*
933 * Since lookup is lockfree, the update writes
934 * must be ordered for consistency on SMP.
935 */
936 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
937
938 rth->u.dst.__use++;
939 dst_hold(&rth->u.dst);
940 rth->u.dst.lastuse = now;
 941 spin_unlock_bh(rt_hash_lock_addr(hash));
942
943 rt_drop(rt);
944 *rp = rth;
945 return 0;
946 }
947
948 if (!atomic_read(&rth->u.dst.__refcnt)) {
949 u32 score = rt_score(rth);
950
951 if (score <= min_score) {
952 cand = rth;
953 candp = rthp;
954 min_score = score;
955 }
956 }
957
958 chain_length++;
959
960 rthp = &rth->u.rt_next;
961 }
962
963 if (cand) {
 964 /* ip_rt_gc_elasticity used to be the average chain length;
 965 * when it is exceeded, gc becomes really aggressive.
966 *
967 * The second limit is less certain. At the moment it allows
968 * only 2 entries per bucket. We will see.
969 */
970 if (chain_length > ip_rt_gc_elasticity) {
971 *candp = cand->u.rt_next;
972 rt_free(cand);
973 }
974 }
975
976 /* Try to bind route to arp only if it is output
977 route or unicast forwarding path.
978 */
979 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
980 int err = arp_bind_neighbour(&rt->u.dst);
981 if (err) {
 982 spin_unlock_bh(rt_hash_lock_addr(hash));
983
984 if (err != -ENOBUFS) {
985 rt_drop(rt);
986 return err;
987 }
988
989 /* Neighbour tables are full and nothing
990 can be released. Try to shrink route cache,
991 it is most likely it holds some neighbour records.
992 */
993 if (attempts-- > 0) {
994 int saved_elasticity = ip_rt_gc_elasticity;
995 int saved_int = ip_rt_gc_min_interval;
996 ip_rt_gc_elasticity = 1;
997 ip_rt_gc_min_interval = 0;
998 rt_garbage_collect();
999 ip_rt_gc_min_interval = saved_int;
1000 ip_rt_gc_elasticity = saved_elasticity;
1001 goto restart;
1002 }
1003
1004 if (net_ratelimit())
1005 printk(KERN_WARNING "Neighbour table overflow.\n");
1006 rt_drop(rt);
1007 return -ENOBUFS;
1008 }
1009 }
1010
1011 rt->u.rt_next = rt_hash_table[hash].chain;
1012#if RT_CACHE_DEBUG >= 2
1013 if (rt->u.rt_next) {
1014 struct rtable *trt;
1015 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1016 NIPQUAD(rt->rt_dst));
1017 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1018 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1019 printk("\n");
1020 }
1021#endif
1022 rt_hash_table[hash].chain = rt;
 1023 spin_unlock_bh(rt_hash_lock_addr(hash));
1024 *rp = rt;
1025 return 0;
1026}
1027
1028void rt_bind_peer(struct rtable *rt, int create)
1029{
1030 static DEFINE_SPINLOCK(rt_peer_lock);
1031 struct inet_peer *peer;
1032
1033 peer = inet_getpeer(rt->rt_dst, create);
1034
1035 spin_lock_bh(&rt_peer_lock);
1036 if (rt->peer == NULL) {
1037 rt->peer = peer;
1038 peer = NULL;
1039 }
1040 spin_unlock_bh(&rt_peer_lock);
1041 if (peer)
1042 inet_putpeer(peer);
1043}
1044
1045/*
 1046 * Peer allocation may fail only in serious out-of-memory conditions. However
 1047 * we can still generate some output.
 1048 * Random ID selection looks a bit dangerous because we have no chance of
 1049 * selecting an ID that is unique within a reasonable period of time.
 1050 * But a broken packet identifier may be better than no packet at all.
1051 */
1052static void ip_select_fb_ident(struct iphdr *iph)
1053{
1054 static DEFINE_SPINLOCK(ip_fb_id_lock);
1055 static u32 ip_fallback_id;
1056 u32 salt;
1057
1058 spin_lock_bh(&ip_fb_id_lock);
1059 salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1060 iph->id = htons(salt & 0xFFFF);
1061 ip_fallback_id = salt;
1062 spin_unlock_bh(&ip_fb_id_lock);
1063}
1064
1065void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1066{
1067 struct rtable *rt = (struct rtable *) dst;
1068
1069 if (rt) {
1070 if (rt->peer == NULL)
1071 rt_bind_peer(rt, 1);
1072
1073 /* If peer is attached to destination, it is never detached,
 1074 so we do not need to grab a lock to dereference it.
1075 */
1076 if (rt->peer) {
1077 iph->id = htons(inet_getid(rt->peer, more));
1078 return;
1079 }
1080 } else
1081 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1082 __builtin_return_address(0));
1083
1084 ip_select_fb_ident(iph);
1085}
1086
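Taken together, the two functions above implement a simple policy: if a long-lived inet_peer exists for the destination, the IP ID comes from a per-destination counter; otherwise a keyed hash of the destination, folded back into a global state word, produces a fallback ID. A userspace sketch of that policy follows; demo_keyed_hash stands in for secure_ip_id(), and the struct and field names are illustrative.

#include <stdint.h>

struct demo_peer { uint16_t id_counter; };   /* long-lived, per destination */

static uint32_t demo_fallback_state;          /* mirrors ip_fallback_id */

/* Placeholder for secure_ip_id(): any keyed 32-bit hash will do for a sketch. */
static uint32_t demo_keyed_hash(uint32_t x)
{
	x ^= 0x6b43a9b5u;
	x *= 0x9e3779b1u;
	return x ^ (x >> 15);
}

static uint16_t demo_select_ident(struct demo_peer *peer, uint32_t daddr)
{
	if (peer)                                 /* preferred: per-peer counter */
		return peer->id_counter++;

	/* fallback: hash the destination together with the rolling state */
	demo_fallback_state = demo_keyed_hash(demo_fallback_state ^ daddr);
	return (uint16_t)demo_fallback_state;
}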
1087static void rt_del(unsigned hash, struct rtable *rt)
1088{
1089 struct rtable **rthp;
1090
 1091 spin_lock_bh(rt_hash_lock_addr(hash));
1092 ip_rt_put(rt);
1093 for (rthp = &rt_hash_table[hash].chain; *rthp;
1094 rthp = &(*rthp)->u.rt_next)
1095 if (*rthp == rt) {
1096 *rthp = rt->u.rt_next;
1097 rt_free(rt);
1098 break;
1099 }
 1100 spin_unlock_bh(rt_hash_lock_addr(hash));
1101}
1102
1103void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1104 u32 saddr, u8 tos, struct net_device *dev)
1105{
1106 int i, k;
1107 struct in_device *in_dev = in_dev_get(dev);
1108 struct rtable *rth, **rthp;
1109 u32 skeys[2] = { saddr, 0 };
1110 int ikeys[2] = { dev->ifindex, 0 };
1111
1112 tos &= IPTOS_RT_MASK;
1113
1114 if (!in_dev)
1115 return;
1116
1117 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1118 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1119 goto reject_redirect;
1120
1121 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1122 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1123 goto reject_redirect;
1124 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1125 goto reject_redirect;
1126 } else {
1127 if (inet_addr_type(new_gw) != RTN_UNICAST)
1128 goto reject_redirect;
1129 }
1130
1131 for (i = 0; i < 2; i++) {
1132 for (k = 0; k < 2; k++) {
1133 unsigned hash = rt_hash_code(daddr,
1134 skeys[i] ^ (ikeys[k] << 5),
1135 tos);
1136
1137 rthp=&rt_hash_table[hash].chain;
1138
1139 rcu_read_lock();
1140 while ((rth = rcu_dereference(*rthp)) != NULL) {
1141 struct rtable *rt;
1142
1143 if (rth->fl.fl4_dst != daddr ||
1144 rth->fl.fl4_src != skeys[i] ||
1145 rth->fl.fl4_tos != tos ||
1146 rth->fl.oif != ikeys[k] ||
1147 rth->fl.iif != 0) {
1148 rthp = &rth->u.rt_next;
1149 continue;
1150 }
1151
1152 if (rth->rt_dst != daddr ||
1153 rth->rt_src != saddr ||
1154 rth->u.dst.error ||
1155 rth->rt_gateway != old_gw ||
1156 rth->u.dst.dev != dev)
1157 break;
1158
1159 dst_hold(&rth->u.dst);
1160 rcu_read_unlock();
1161
1162 rt = dst_alloc(&ipv4_dst_ops);
1163 if (rt == NULL) {
1164 ip_rt_put(rth);
1165 in_dev_put(in_dev);
1166 return;
1167 }
1168
1169 /* Copy all the information. */
1170 *rt = *rth;
1171 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1172 rt->u.dst.__use = 1;
1173 atomic_set(&rt->u.dst.__refcnt, 1);
1174 rt->u.dst.child = NULL;
1175 if (rt->u.dst.dev)
1176 dev_hold(rt->u.dst.dev);
1177 if (rt->idev)
1178 in_dev_hold(rt->idev);
1179 rt->u.dst.obsolete = 0;
1180 rt->u.dst.lastuse = jiffies;
1181 rt->u.dst.path = &rt->u.dst;
1182 rt->u.dst.neighbour = NULL;
1183 rt->u.dst.hh = NULL;
1184 rt->u.dst.xfrm = NULL;
1185
1186 rt->rt_flags |= RTCF_REDIRECTED;
1187
1188 /* Gateway is different ... */
1189 rt->rt_gateway = new_gw;
1190
1191 /* Redirect received -> path was valid */
1192 dst_confirm(&rth->u.dst);
1193
1194 if (rt->peer)
1195 atomic_inc(&rt->peer->refcnt);
1196
1197 if (arp_bind_neighbour(&rt->u.dst) ||
1198 !(rt->u.dst.neighbour->nud_state &
1199 NUD_VALID)) {
1200 if (rt->u.dst.neighbour)
1201 neigh_event_send(rt->u.dst.neighbour, NULL);
1202 ip_rt_put(rth);
1203 rt_drop(rt);
1204 goto do_next;
1205 }
1206
1207 rt_del(hash, rth);
1208 if (!rt_intern_hash(hash, rt, &rt))
1209 ip_rt_put(rt);
1210 goto do_next;
1211 }
1212 rcu_read_unlock();
1213 do_next:
1214 ;
1215 }
1216 }
1217 in_dev_put(in_dev);
1218 return;
1219
1220reject_redirect:
1221#ifdef CONFIG_IP_ROUTE_VERBOSE
1222 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1223 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1224 "%u.%u.%u.%u ignored.\n"
1225 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
1226 "tos %02x\n",
1227 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1228 NIPQUAD(saddr), NIPQUAD(daddr), tos);
1229#endif
1230 in_dev_put(in_dev);
1231}
1232
1233static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1234{
1235 struct rtable *rt = (struct rtable*)dst;
1236 struct dst_entry *ret = dst;
1237
1238 if (rt) {
1239 if (dst->obsolete) {
1240 ip_rt_put(rt);
1241 ret = NULL;
1242 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1243 rt->u.dst.expires) {
1244 unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1245 rt->fl.fl4_src ^
1246 (rt->fl.oif << 5),
1247 rt->fl.fl4_tos);
1248#if RT_CACHE_DEBUG >= 1
1249 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1250 "%u.%u.%u.%u/%02x dropped\n",
1251 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1252#endif
1253 rt_del(hash, rt);
1254 ret = NULL;
1255 }
1256 }
1257 return ret;
1258}
1259
1260/*
1261 * Algorithm:
1262 * 1. The first ip_rt_redirect_number redirects are sent
1263 * with exponential backoff, then we stop sending them at all,
1264 * assuming that the host ignores our redirects.
1265 * 2. If we did not see packets requiring redirects
1266 * during ip_rt_redirect_silence, we assume that the host
1267 * forgot redirected route and start to send redirects again.
1268 *
1269 * This algorithm is much cheaper and more intelligent than dumb load limiting
1270 * in icmp.c.
1271 *
1272 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1273 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1274 */
1275
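As a quick model of the policy described above and implemented in ip_rt_send_redirect() below: a redirect is sent only while fewer than ip_rt_redirect_number of them have been ignored, successive redirects are spaced exponentially further apart, and a quiet period of ip_rt_redirect_silence resets the state. This standalone sketch reduces the per-destination state to two fields with illustrative names.

struct demo_redirect_state {
	unsigned long rate_last;     /* when we last sent (or last gave up) */
	unsigned int  rate_tokens;   /* redirects already sent and ignored */
};

/* Returns nonzero if a redirect should be emitted at time "now" (in ticks). */
static int demo_should_send_redirect(struct demo_redirect_state *st,
				     unsigned long now,
				     unsigned long load,      /* ip_rt_redirect_load */
				     unsigned long silence,   /* ip_rt_redirect_silence */
				     unsigned int  number)    /* ip_rt_redirect_number */
{
	/* Host seems to have forgotten the redirected route: start over. */
	if (now > st->rate_last + silence)
		st->rate_tokens = 0;

	/* Too many ignored redirects already: stay silent. */
	if (st->rate_tokens >= number) {
		st->rate_last = now;
		return 0;
	}

	/* Exponential backoff: each sent redirect doubles the spacing. */
	if (now > st->rate_last + (load << st->rate_tokens)) {
		st->rate_last = now;
		st->rate_tokens++;
		return 1;
	}
	return 0;
}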
1276void ip_rt_send_redirect(struct sk_buff *skb)
1277{
1278 struct rtable *rt = (struct rtable*)skb->dst;
1279 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1280
1281 if (!in_dev)
1282 return;
1283
1284 if (!IN_DEV_TX_REDIRECTS(in_dev))
1285 goto out;
1286
1287 /* No redirected packets during ip_rt_redirect_silence;
1288 * reset the algorithm.
1289 */
1290 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1291 rt->u.dst.rate_tokens = 0;
1292
1293 /* Too many ignored redirects; do not send anything
1294 * set u.dst.rate_last to the last seen redirected packet.
1295 */
1296 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1297 rt->u.dst.rate_last = jiffies;
1298 goto out;
1299 }
1300
1301 /* Check for load limit; set rate_last to the latest sent
1302 * redirect.
1303 */
1304 if (time_after(jiffies,
1305 (rt->u.dst.rate_last +
1306 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1307 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1308 rt->u.dst.rate_last = jiffies;
1309 ++rt->u.dst.rate_tokens;
1310#ifdef CONFIG_IP_ROUTE_VERBOSE
1311 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1312 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1313 net_ratelimit())
1314 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1315 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1316 NIPQUAD(rt->rt_src), rt->rt_iif,
1317 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1318#endif
1319 }
1320out:
1321 in_dev_put(in_dev);
1322}
1323
1324static int ip_error(struct sk_buff *skb)
1325{
1326 struct rtable *rt = (struct rtable*)skb->dst;
1327 unsigned long now;
1328 int code;
1329
1330 switch (rt->u.dst.error) {
1331 case EINVAL:
1332 default:
1333 goto out;
1334 case EHOSTUNREACH:
1335 code = ICMP_HOST_UNREACH;
1336 break;
1337 case ENETUNREACH:
1338 code = ICMP_NET_UNREACH;
1339 break;
1340 case EACCES:
1341 code = ICMP_PKT_FILTERED;
1342 break;
1343 }
1344
1345 now = jiffies;
1346 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1347 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1348 rt->u.dst.rate_tokens = ip_rt_error_burst;
1349 rt->u.dst.rate_last = now;
1350 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1351 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1352 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1353 }
1354
1355out: kfree_skb(skb);
1356 return 0;
1357}
1358
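The rate limiting in ip_error() above is a small token bucket: tokens accrue one per tick since the last update, are capped at ip_rt_error_burst, and each ICMP error sent costs ip_rt_error_cost. A standalone sketch of just that accounting, with illustrative names:

struct demo_error_bucket {
	unsigned long rate_last;     /* last time we updated the bucket */
	unsigned long rate_tokens;   /* accumulated ticks */
};

/* Returns nonzero if an ICMP error may be sent at time "now" (in ticks). */
static int demo_error_allowed(struct demo_error_bucket *b, unsigned long now,
			      unsigned long burst, unsigned long cost)
{
	b->rate_tokens += now - b->rate_last;
	if (b->rate_tokens > burst)
		b->rate_tokens = burst;
	b->rate_last = now;

	if (b->rate_tokens >= cost) {
		b->rate_tokens -= cost;
		return 1;
	}
	return 0;
}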
1359/*
1360 * The last two values are not from the RFC but
1361 * are needed for AMPRnet AX.25 paths.
1362 */
1363
1364static unsigned short mtu_plateau[] =
1365{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1366
1367static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1368{
1369 int i;
1370
1371 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1372 if (old_mtu > mtu_plateau[i])
1373 return mtu_plateau[i];
1374 return 68;
1375}
1376
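The plateau table is only consulted when the router did not report a usable next-hop MTU; the next value strictly below the failing packet size is tried, with 68 (the minimum IPv4 MTU) as the floor. A tiny standalone usage example, duplicating the table for illustration:

#include <stdio.h>

/* Same plateau list as above (RFC 1191 values plus two AMPRnet AX.25 ones). */
static const unsigned short demo_plateau[] =
	{ 32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static unsigned short demo_guess_mtu(unsigned short old_mtu)
{
	for (unsigned i = 0; i < sizeof(demo_plateau) / sizeof(demo_plateau[0]); i++)
		if (old_mtu > demo_plateau[i])
			return demo_plateau[i];
	return 68;
}

int main(void)
{
	/* A 1500-byte packet that would not fit steps down to the 1492 plateau,
	 * and anything at or below 128 bottoms out at the 68-byte minimum. */
	printf("%u %u %u\n", demo_guess_mtu(1500), demo_guess_mtu(576),
	       demo_guess_mtu(100));            /* prints: 1492 296 68 */
	return 0;
}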
1377unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1378{
1379 int i;
1380 unsigned short old_mtu = ntohs(iph->tot_len);
1381 struct rtable *rth;
1382 u32 skeys[2] = { iph->saddr, 0, };
1383 u32 daddr = iph->daddr;
1384 u8 tos = iph->tos & IPTOS_RT_MASK;
1385 unsigned short est_mtu = 0;
1386
1387 if (ipv4_config.no_pmtu_disc)
1388 return 0;
1389
1390 for (i = 0; i < 2; i++) {
1391 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1392
1393 rcu_read_lock();
1394 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1395 rth = rcu_dereference(rth->u.rt_next)) {
1396 if (rth->fl.fl4_dst == daddr &&
1397 rth->fl.fl4_src == skeys[i] &&
1398 rth->rt_dst == daddr &&
1399 rth->rt_src == iph->saddr &&
1400 rth->fl.fl4_tos == tos &&
1401 rth->fl.iif == 0 &&
1402 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1403 unsigned short mtu = new_mtu;
1404
1405 if (new_mtu < 68 || new_mtu >= old_mtu) {
1406
1407 /* BSD 4.2 compatibility hack :-( */
1408 if (mtu == 0 &&
1409 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1410 old_mtu >= 68 + (iph->ihl << 2))
1411 old_mtu -= iph->ihl << 2;
1412
1413 mtu = guess_mtu(old_mtu);
1414 }
1415 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1416 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1417 dst_confirm(&rth->u.dst);
1418 if (mtu < ip_rt_min_pmtu) {
1419 mtu = ip_rt_min_pmtu;
1420 rth->u.dst.metrics[RTAX_LOCK-1] |=
1421 (1 << RTAX_MTU);
1422 }
1423 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1424 dst_set_expires(&rth->u.dst,
1425 ip_rt_mtu_expires);
1426 }
1427 est_mtu = mtu;
1428 }
1429 }
1430 }
1431 rcu_read_unlock();
1432 }
1433 return est_mtu ? : new_mtu;
1434}
1435
1436static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1437{
1438 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1439 !(dst_metric_locked(dst, RTAX_MTU))) {
1440 if (mtu < ip_rt_min_pmtu) {
1441 mtu = ip_rt_min_pmtu;
1442 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1443 }
1444 dst->metrics[RTAX_MTU-1] = mtu;
1445 dst_set_expires(dst, ip_rt_mtu_expires);
1446 }
1447}
1448
1449static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1450{
1451 return NULL;
1452}
1453
1454static void ipv4_dst_destroy(struct dst_entry *dst)
1455{
1456 struct rtable *rt = (struct rtable *) dst;
1457 struct inet_peer *peer = rt->peer;
1458 struct in_device *idev = rt->idev;
1459
1460 if (peer) {
1461 rt->peer = NULL;
1462 inet_putpeer(peer);
1463 }
1464
1465 if (idev) {
1466 rt->idev = NULL;
1467 in_dev_put(idev);
1468 }
1469}
1470
1471static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1472 int how)
1473{
1474 struct rtable *rt = (struct rtable *) dst;
1475 struct in_device *idev = rt->idev;
1476 if (dev != &loopback_dev && idev && idev->dev == dev) {
1477 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1478 if (loopback_idev) {
1479 rt->idev = loopback_idev;
1480 in_dev_put(idev);
1481 }
1482 }
1483}
1484
1485static void ipv4_link_failure(struct sk_buff *skb)
1486{
1487 struct rtable *rt;
1488
1489 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1490
1491 rt = (struct rtable *) skb->dst;
1492 if (rt)
1493 dst_set_expires(&rt->u.dst, 0);
1494}
1495
1496static int ip_rt_bug(struct sk_buff *skb)
1497{
1498 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1499 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1500 skb->dev ? skb->dev->name : "?");
1501 kfree_skb(skb);
1502 return 0;
1503}
1504
1505/*
 1506 We do not cache the source address of the outgoing interface,
 1507 because it is used only by the IP RR, TS and SRR options,
 1508 so it is out of the fast path.
1509
1510 BTW remember: "addr" is allowed to be not aligned
1511 in IP options!
1512 */
1513
1514void ip_rt_get_source(u8 *addr, struct rtable *rt)
1515{
1516 u32 src;
1517 struct fib_result res;
1518
1519 if (rt->fl.iif == 0)
1520 src = rt->rt_src;
1521 else if (fib_lookup(&rt->fl, &res) == 0) {
1522 src = FIB_RES_PREFSRC(res);
1523 fib_res_put(&res);
1524 } else
1525 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1526 RT_SCOPE_UNIVERSE);
1527 memcpy(addr, &src, 4);
1528}
1529
1530#ifdef CONFIG_NET_CLS_ROUTE
1531static void set_class_tag(struct rtable *rt, u32 tag)
1532{
1533 if (!(rt->u.dst.tclassid & 0xFFFF))
1534 rt->u.dst.tclassid |= tag & 0xFFFF;
1535 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1536 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1537}
1538#endif
1539
1540static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1541{
1542 struct fib_info *fi = res->fi;
1543
1544 if (fi) {
1545 if (FIB_RES_GW(*res) &&
1546 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1547 rt->rt_gateway = FIB_RES_GW(*res);
1548 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1549 sizeof(rt->u.dst.metrics));
1550 if (fi->fib_mtu == 0) {
1551 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1552 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1553 rt->rt_gateway != rt->rt_dst &&
1554 rt->u.dst.dev->mtu > 576)
1555 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1556 }
1557#ifdef CONFIG_NET_CLS_ROUTE
1558 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1559#endif
1560 } else
1561 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1562
1563 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1564 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1565 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1566 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1567 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1568 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1569 ip_rt_min_advmss);
1570 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1571 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1572
1573#ifdef CONFIG_NET_CLS_ROUTE
1574#ifdef CONFIG_IP_MULTIPLE_TABLES
1575 set_class_tag(rt, fib_rules_tclass(res));
1576#endif
1577 set_class_tag(rt, itag);
1578#endif
1579 rt->rt_type = res->type;
1580}
1581
1582static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1583 u8 tos, struct net_device *dev, int our)
1584{
1585 unsigned hash;
1586 struct rtable *rth;
1587 u32 spec_dst;
1588 struct in_device *in_dev = in_dev_get(dev);
1589 u32 itag = 0;
1590
1591 /* Primary sanity checks. */
1592
1593 if (in_dev == NULL)
1594 return -EINVAL;
1595
1596 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1597 skb->protocol != htons(ETH_P_IP))
1598 goto e_inval;
1599
1600 if (ZERONET(saddr)) {
1601 if (!LOCAL_MCAST(daddr))
1602 goto e_inval;
1603 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1604 } else if (fib_validate_source(saddr, 0, tos, 0,
1605 dev, &spec_dst, &itag) < 0)
1606 goto e_inval;
1607
1608 rth = dst_alloc(&ipv4_dst_ops);
1609 if (!rth)
1610 goto e_nobufs;
1611
1612 rth->u.dst.output= ip_rt_bug;
1613
1614 atomic_set(&rth->u.dst.__refcnt, 1);
1615 rth->u.dst.flags= DST_HOST;
1616 if (in_dev->cnf.no_policy)
1617 rth->u.dst.flags |= DST_NOPOLICY;
1618 rth->fl.fl4_dst = daddr;
1619 rth->rt_dst = daddr;
1620 rth->fl.fl4_tos = tos;
1621#ifdef CONFIG_IP_ROUTE_FWMARK
1622 rth->fl.fl4_fwmark= skb->nfmark;
1623#endif
1624 rth->fl.fl4_src = saddr;
1625 rth->rt_src = saddr;
1626#ifdef CONFIG_NET_CLS_ROUTE
1627 rth->u.dst.tclassid = itag;
1628#endif
1629 rth->rt_iif =
1630 rth->fl.iif = dev->ifindex;
1631 rth->u.dst.dev = &loopback_dev;
1632 dev_hold(rth->u.dst.dev);
1633 rth->idev = in_dev_get(rth->u.dst.dev);
1634 rth->fl.oif = 0;
1635 rth->rt_gateway = daddr;
1636 rth->rt_spec_dst= spec_dst;
1637 rth->rt_type = RTN_MULTICAST;
1638 rth->rt_flags = RTCF_MULTICAST;
1639 if (our) {
1640 rth->u.dst.input= ip_local_deliver;
1641 rth->rt_flags |= RTCF_LOCAL;
1642 }
1643
1644#ifdef CONFIG_IP_MROUTE
1645 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1646 rth->u.dst.input = ip_mr_input;
1647#endif
1648 RT_CACHE_STAT_INC(in_slow_mc);
1649
1650 in_dev_put(in_dev);
1651 hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1652 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1653
1654e_nobufs:
1655 in_dev_put(in_dev);
1656 return -ENOBUFS;
1657
1658e_inval:
1659 in_dev_put(in_dev);
1660 return -EINVAL;
1661}
1662
1663
1664static void ip_handle_martian_source(struct net_device *dev,
1665 struct in_device *in_dev,
1666 struct sk_buff *skb,
1667 u32 daddr,
1668 u32 saddr)
1669{
1670 RT_CACHE_STAT_INC(in_martian_src);
1671#ifdef CONFIG_IP_ROUTE_VERBOSE
1672 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1673 /*
1674 * RFC1812 recommendation, if source is martian,
1675 * the only hint is MAC header.
1676 */
1677 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1678 "%u.%u.%u.%u, on dev %s\n",
1679 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1680 if (dev->hard_header_len) {
1681 int i;
1682 unsigned char *p = skb->mac.raw;
1683 printk(KERN_WARNING "ll header: ");
1684 for (i = 0; i < dev->hard_header_len; i++, p++) {
1685 printk("%02x", *p);
1686 if (i < (dev->hard_header_len - 1))
1687 printk(":");
1688 }
1689 printk("\n");
1690 }
1691 }
1692#endif
1693}
1694
1695static inline int __mkroute_input(struct sk_buff *skb,
1696 struct fib_result* res,
1697 struct in_device *in_dev,
1698 u32 daddr, u32 saddr, u32 tos,
1699 struct rtable **result)
1700{
1701
1702 struct rtable *rth;
1703 int err;
1704 struct in_device *out_dev;
1705 unsigned flags = 0;
1706 u32 spec_dst, itag;
1707
1708 /* get a working reference to the output device */
1709 out_dev = in_dev_get(FIB_RES_DEV(*res));
1710 if (out_dev == NULL) {
1711 if (net_ratelimit())
1712 printk(KERN_CRIT "Bug in ip_route_input" \
1713 "_slow(). Please, report\n");
1714 return -EINVAL;
1715 }
1716
1717
1718 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1719 in_dev->dev, &spec_dst, &itag);
1720 if (err < 0) {
1721 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1722 saddr);
1723
1724 err = -EINVAL;
1725 goto cleanup;
1726 }
1727
1728 if (err)
1729 flags |= RTCF_DIRECTSRC;
1730
1731 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1732 (IN_DEV_SHARED_MEDIA(out_dev) ||
1733 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1734 flags |= RTCF_DOREDIRECT;
1735
1736 if (skb->protocol != htons(ETH_P_IP)) {
1737 /* Not IP (i.e. ARP). Do not create route, if it is
1738 * invalid for proxy arp. DNAT routes are always valid.
1739 */
1740 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1741 err = -EINVAL;
1742 goto cleanup;
1743 }
1744 }
1745
1746
1747 rth = dst_alloc(&ipv4_dst_ops);
1748 if (!rth) {
1749 err = -ENOBUFS;
1750 goto cleanup;
1751 }
1752
1753 rth->u.dst.flags= DST_HOST;
1754#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1755 if (res->fi->fib_nhs > 1)
1756 rth->u.dst.flags |= DST_BALANCED;
1757#endif
1758 if (in_dev->cnf.no_policy)
1759 rth->u.dst.flags |= DST_NOPOLICY;
1760 if (in_dev->cnf.no_xfrm)
1761 rth->u.dst.flags |= DST_NOXFRM;
1762 rth->fl.fl4_dst = daddr;
1763 rth->rt_dst = daddr;
1764 rth->fl.fl4_tos = tos;
1765#ifdef CONFIG_IP_ROUTE_FWMARK
1766 rth->fl.fl4_fwmark= skb->nfmark;
1767#endif
1768 rth->fl.fl4_src = saddr;
1769 rth->rt_src = saddr;
1770 rth->rt_gateway = daddr;
1771 rth->rt_iif =
1772 rth->fl.iif = in_dev->dev->ifindex;
1773 rth->u.dst.dev = (out_dev)->dev;
1774 dev_hold(rth->u.dst.dev);
1775 rth->idev = in_dev_get(rth->u.dst.dev);
1776 rth->fl.oif = 0;
1777 rth->rt_spec_dst= spec_dst;
1778
1779 rth->u.dst.input = ip_forward;
1780 rth->u.dst.output = ip_output;
1781
1782 rt_set_nexthop(rth, res, itag);
1783
1784 rth->rt_flags = flags;
1785
1786 *result = rth;
1787 err = 0;
1788 cleanup:
1789 /* release the working reference to the output device */
1790 in_dev_put(out_dev);
1791 return err;
1792}
1793
1794static inline int ip_mkroute_input_def(struct sk_buff *skb,
1795 struct fib_result* res,
1796 const struct flowi *fl,
1797 struct in_device *in_dev,
1798 u32 daddr, u32 saddr, u32 tos)
1799{
 1800 struct rtable* rth = NULL;
1801 int err;
1802 unsigned hash;
1803
1804#ifdef CONFIG_IP_ROUTE_MULTIPATH
1805 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1806 fib_select_multipath(fl, res);
1807#endif
1808
1809 /* create a routing cache entry */
1810 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1811 if (err)
1812 return err;
1813 atomic_set(&rth->u.dst.__refcnt, 1);
1814
1815 /* put it into the cache */
1816 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1817 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1818}
1819
1820static inline int ip_mkroute_input(struct sk_buff *skb,
1821 struct fib_result* res,
1822 const struct flowi *fl,
1823 struct in_device *in_dev,
1824 u32 daddr, u32 saddr, u32 tos)
1825{
1826#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 1827 struct rtable* rth = NULL;
1828 unsigned char hop, hopcount, lasthop;
1829 int err = -EINVAL;
1830 unsigned int hash;
1831
1832 if (res->fi)
1833 hopcount = res->fi->fib_nhs;
1834 else
1835 hopcount = 1;
1836
1837 lasthop = hopcount - 1;
1838
1839 /* distinguish between multipath and singlepath */
1840 if (hopcount < 2)
1841 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1842 saddr, tos);
1843
1844 /* add all alternatives to the routing cache */
1845 for (hop = 0; hop < hopcount; hop++) {
1846 res->nh_sel = hop;
1847
1848 /* create a routing cache entry */
1849 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1850 &rth);
1851 if (err)
1852 return err;
1853
1854 /* put it into the cache */
1855 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1856 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1857 if (err)
1858 return err;
1859
1860 /* forward hop information to multipath impl. */
1861 multipath_set_nhinfo(rth,
1862 FIB_RES_NETWORK(*res),
1863 FIB_RES_NETMASK(*res),
1864 res->prefixlen,
1865 &FIB_RES_NH(*res));
1866
1867 /* only for the last hop the reference count is handled
1868 * outside
1869 */
1870 if (hop == lasthop)
1871 atomic_set(&(skb->dst->__refcnt), 1);
1872 }
1873 return err;
1874#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1875 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1876#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1877}
1878
1879
1880/*
 1881 * NOTE. We drop all packets that have a local source
 1882 * address, because every properly looped-back packet
 1883 * must already have the correct destination attached by the output routine.
 1884 *
 1885 * Such an approach solves two big problems:
 1886 * 1. Non-simplex devices are handled properly.
 1887 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1888 */
1889
1890static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1891 u8 tos, struct net_device *dev)
1892{
1893 struct fib_result res;
1894 struct in_device *in_dev = in_dev_get(dev);
1895 struct flowi fl = { .nl_u = { .ip4_u =
1896 { .daddr = daddr,
1897 .saddr = saddr,
1898 .tos = tos,
1899 .scope = RT_SCOPE_UNIVERSE,
1900#ifdef CONFIG_IP_ROUTE_FWMARK
1901 .fwmark = skb->nfmark
1902#endif
1903 } },
1904 .iif = dev->ifindex };
1905 unsigned flags = 0;
1906 u32 itag = 0;
1907 struct rtable * rth;
1908 unsigned hash;
1909 u32 spec_dst;
1910 int err = -EINVAL;
1911 int free_res = 0;
1912
1913 /* IP on this device is disabled. */
1914
1915 if (!in_dev)
1916 goto out;
1917
 1918 /* Check for the most weird martians, which may not be detected
1919 by fib_lookup.
1920 */
1921
1922 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1923 goto martian_source;
1924
1925 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1926 goto brd_input;
1927
1928 /* Accept zero addresses only to limited broadcast;
 1929 * I do not even know whether to fix it or not. Waiting for complaints :-)
1930 */
1931 if (ZERONET(saddr))
1932 goto martian_source;
1933
1934 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1935 goto martian_destination;
1936
1937 /*
1938 * Now we are ready to route packet.
1939 */
1940 if ((err = fib_lookup(&fl, &res)) != 0) {
1941 if (!IN_DEV_FORWARD(in_dev))
 1942 goto e_hostunreach;
1943 goto no_route;
1944 }
1945 free_res = 1;
1946
1947 RT_CACHE_STAT_INC(in_slow_tot);
1948
1949 if (res.type == RTN_BROADCAST)
1950 goto brd_input;
1951
1952 if (res.type == RTN_LOCAL) {
1953 int result;
1954 result = fib_validate_source(saddr, daddr, tos,
1955 loopback_dev.ifindex,
1956 dev, &spec_dst, &itag);
1957 if (result < 0)
1958 goto martian_source;
1959 if (result)
1960 flags |= RTCF_DIRECTSRC;
1961 spec_dst = daddr;
1962 goto local_input;
1963 }
1964
1965 if (!IN_DEV_FORWARD(in_dev))
 1966 goto e_hostunreach;
1967 if (res.type != RTN_UNICAST)
1968 goto martian_destination;
1969
1970 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1971 if (err == -ENOBUFS)
1972 goto e_nobufs;
1973 if (err == -EINVAL)
1974 goto e_inval;
1975
1976done:
1977 in_dev_put(in_dev);
1978 if (free_res)
1979 fib_res_put(&res);
1980out: return err;
1981
1982brd_input:
1983 if (skb->protocol != htons(ETH_P_IP))
1984 goto e_inval;
1985
1986 if (ZERONET(saddr))
1987 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1988 else {
1989 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1990 &itag);
1991 if (err < 0)
1992 goto martian_source;
1993 if (err)
1994 flags |= RTCF_DIRECTSRC;
1995 }
1996 flags |= RTCF_BROADCAST;
1997 res.type = RTN_BROADCAST;
1998 RT_CACHE_STAT_INC(in_brd);
1999
2000local_input:
2001 rth = dst_alloc(&ipv4_dst_ops);
2002 if (!rth)
2003 goto e_nobufs;
2004
2005 rth->u.dst.output= ip_rt_bug;
2006
2007 atomic_set(&rth->u.dst.__refcnt, 1);
2008 rth->u.dst.flags= DST_HOST;
2009 if (in_dev->cnf.no_policy)
2010 rth->u.dst.flags |= DST_NOPOLICY;
2011 rth->fl.fl4_dst = daddr;
2012 rth->rt_dst = daddr;
2013 rth->fl.fl4_tos = tos;
2014#ifdef CONFIG_IP_ROUTE_FWMARK
2015 rth->fl.fl4_fwmark= skb->nfmark;
2016#endif
2017 rth->fl.fl4_src = saddr;
2018 rth->rt_src = saddr;
2019#ifdef CONFIG_NET_CLS_ROUTE
2020 rth->u.dst.tclassid = itag;
2021#endif
2022 rth->rt_iif =
2023 rth->fl.iif = dev->ifindex;
2024 rth->u.dst.dev = &loopback_dev;
2025 dev_hold(rth->u.dst.dev);
2026 rth->idev = in_dev_get(rth->u.dst.dev);
2027 rth->rt_gateway = daddr;
2028 rth->rt_spec_dst= spec_dst;
2029 rth->u.dst.input= ip_local_deliver;
2030 rth->rt_flags = flags|RTCF_LOCAL;
2031 if (res.type == RTN_UNREACHABLE) {
2032 rth->u.dst.input= ip_error;
2033 rth->u.dst.error= -err;
2034 rth->rt_flags &= ~RTCF_LOCAL;
2035 }
2036 rth->rt_type = res.type;
2037 hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
2038 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2039 goto done;
2040
2041no_route:
2042 RT_CACHE_STAT_INC(in_no_route);
2043 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2044 res.type = RTN_UNREACHABLE;
2045 goto local_input;
2046
2047 /*
2048 * Do not cache martian addresses: they should be logged (RFC1812)
2049 */
2050martian_destination:
2051 RT_CACHE_STAT_INC(in_martian_dst);
2052#ifdef CONFIG_IP_ROUTE_VERBOSE
2053 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2054 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2055 "%u.%u.%u.%u, dev %s\n",
2056 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2057#endif
2058
2059e_hostunreach:
2060 err = -EHOSTUNREACH;
2061 goto done;
2062
2063e_inval:
2064 err = -EINVAL;
2065 goto done;
2066
2067e_nobufs:
2068 err = -ENOBUFS;
2069 goto done;
2070
2071martian_source:
2072 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2073 goto e_inval;
2074}
2075
2076int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2077 u8 tos, struct net_device *dev)
2078{
2079 struct rtable * rth;
2080 unsigned hash;
2081 int iif = dev->ifindex;
2082
2083 tos &= IPTOS_RT_MASK;
2084 hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
2085
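	/*
	 * Fast path: walk the hash chain under rcu_read_lock() looking for
	 * an exact match on (daddr, saddr, iif, fwmark, tos).  A hit bumps
	 * lastuse/__use, takes a reference and attaches the dst to the skb;
	 * a miss falls through to multicast handling or the slow path.
	 */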
2086 rcu_read_lock();
2087 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2088 rth = rcu_dereference(rth->u.rt_next)) {
2089 if (rth->fl.fl4_dst == daddr &&
2090 rth->fl.fl4_src == saddr &&
2091 rth->fl.iif == iif &&
2092 rth->fl.oif == 0 &&
2093#ifdef CONFIG_IP_ROUTE_FWMARK
2094 rth->fl.fl4_fwmark == skb->nfmark &&
2095#endif
2096 rth->fl.fl4_tos == tos) {
2097 rth->u.dst.lastuse = jiffies;
2098 dst_hold(&rth->u.dst);
2099 rth->u.dst.__use++;
2100 RT_CACHE_STAT_INC(in_hit);
2101 rcu_read_unlock();
2102 skb->dst = (struct dst_entry*)rth;
2103 return 0;
2104 }
2105 RT_CACHE_STAT_INC(in_hlist_search);
2106 }
2107 rcu_read_unlock();
2108
2109	/* Multicast recognition logic was moved from the route cache to here.
2110	   The problem was that too many Ethernet cards have broken/missing
2111	   hardware multicast filters :-( As a result, a host on a multicast
2112	   network acquires a lot of useless route cache entries, e.g. from
2113	   SDR messages from all over the world. Now we try to get rid of them.
2114	   Really, provided the software IP multicast filter is organized
2115	   reasonably (at least, hashed), it does not result in a slowdown
2116	   compared with route cache reject entries.
2117	   Note that multicast routers are not affected, because a
2118	   route cache entry is created eventually.
2119 */
2120 if (MULTICAST(daddr)) {
2121 struct in_device *in_dev;
2122
2123 rcu_read_lock();
2124 if ((in_dev = __in_dev_get(dev)) != NULL) {
2125 int our = ip_check_mc(in_dev, daddr, saddr,
2126 skb->nh.iph->protocol);
2127 if (our
2128#ifdef CONFIG_IP_MROUTE
2129 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2130#endif
2131 ) {
2132 rcu_read_unlock();
2133 return ip_route_input_mc(skb, daddr, saddr,
2134 tos, dev, our);
2135 }
2136 }
2137 rcu_read_unlock();
2138 return -EINVAL;
2139 }
2140 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2141}
2142
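/*
 * __mkroute_output() allocates and fills one output route for a resolved
 * fib_result: it classifies limited-broadcast and multicast destinations,
 * rejects a loopback source on a non-loopback device, and wires up
 * ip_output / ip_mc_output (and ip_mr_input for forwarded multicast) as
 * the dst handlers before rt_set_nexthop() copies the metrics.
 */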
2143static inline int __mkroute_output(struct rtable **result,
2144 struct fib_result* res,
2145 const struct flowi *fl,
2146 const struct flowi *oldflp,
2147 struct net_device *dev_out,
2148 unsigned flags)
2149{
2150 struct rtable *rth;
2151 struct in_device *in_dev;
2152 u32 tos = RT_FL_TOS(oldflp);
2153 int err = 0;
2154
2155 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2156 return -EINVAL;
2157
2158 if (fl->fl4_dst == 0xFFFFFFFF)
2159 res->type = RTN_BROADCAST;
2160 else if (MULTICAST(fl->fl4_dst))
2161 res->type = RTN_MULTICAST;
2162 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2163 return -EINVAL;
2164
2165 if (dev_out->flags & IFF_LOOPBACK)
2166 flags |= RTCF_LOCAL;
2167
2168 /* get work reference to inet device */
2169 in_dev = in_dev_get(dev_out);
2170 if (!in_dev)
2171 return -EINVAL;
2172
2173 if (res->type == RTN_BROADCAST) {
2174 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2175 if (res->fi) {
2176 fib_info_put(res->fi);
2177 res->fi = NULL;
2178 }
2179 } else if (res->type == RTN_MULTICAST) {
2180 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2181 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2182 oldflp->proto))
2183 flags &= ~RTCF_LOCAL;
2184		/* If a multicast route does not exist, use
2185		   the default one, but do not gateway in this case.
2186		   Yes, it is a hack.
2187		 */
2188 if (res->fi && res->prefixlen < 4) {
2189 fib_info_put(res->fi);
2190 res->fi = NULL;
2191 }
2192 }
2193
2194
2195 rth = dst_alloc(&ipv4_dst_ops);
2196 if (!rth) {
2197 err = -ENOBUFS;
2198 goto cleanup;
2199 }
2200
2201 rth->u.dst.flags= DST_HOST;
2202#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2203 if (res->fi) {
2204 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2205 if (res->fi->fib_nhs > 1)
2206 rth->u.dst.flags |= DST_BALANCED;
2207 }
2208#endif
2209 if (in_dev->cnf.no_xfrm)
2210 rth->u.dst.flags |= DST_NOXFRM;
2211 if (in_dev->cnf.no_policy)
2212 rth->u.dst.flags |= DST_NOPOLICY;
2213
2214 rth->fl.fl4_dst = oldflp->fl4_dst;
2215 rth->fl.fl4_tos = tos;
2216 rth->fl.fl4_src = oldflp->fl4_src;
2217 rth->fl.oif = oldflp->oif;
2218#ifdef CONFIG_IP_ROUTE_FWMARK
2219 rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2220#endif
2221 rth->rt_dst = fl->fl4_dst;
2222 rth->rt_src = fl->fl4_src;
2223 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2224	/* get references to the devices that are to be held by the routing
2225	   cache entry */
2226 rth->u.dst.dev = dev_out;
2227 dev_hold(dev_out);
2228 rth->idev = in_dev_get(dev_out);
2229 rth->rt_gateway = fl->fl4_dst;
2230 rth->rt_spec_dst= fl->fl4_src;
2231
2232 rth->u.dst.output=ip_output;
2233
2234 RT_CACHE_STAT_INC(out_slow_tot);
2235
2236 if (flags & RTCF_LOCAL) {
2237 rth->u.dst.input = ip_local_deliver;
2238 rth->rt_spec_dst = fl->fl4_dst;
2239 }
2240 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2241 rth->rt_spec_dst = fl->fl4_src;
2242 if (flags & RTCF_LOCAL &&
2243 !(dev_out->flags & IFF_LOOPBACK)) {
2244 rth->u.dst.output = ip_mc_output;
2245 RT_CACHE_STAT_INC(out_slow_mc);
2246 }
2247#ifdef CONFIG_IP_MROUTE
2248 if (res->type == RTN_MULTICAST) {
2249 if (IN_DEV_MFORWARD(in_dev) &&
2250 !LOCAL_MCAST(oldflp->fl4_dst)) {
2251 rth->u.dst.input = ip_mr_input;
2252 rth->u.dst.output = ip_mc_output;
2253 }
2254 }
2255#endif
2256 }
2257
2258 rt_set_nexthop(rth, res, 0);
2259
2260 rth->rt_flags = flags;
2261
2262 *result = rth;
2263 cleanup:
2264 /* release work reference to inet device */
2265 in_dev_put(in_dev);
2266
2267 return err;
2268}
2269
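/*
 * Default (non multipath-cache) output path: build a single route with
 * __mkroute_output() and insert it into the cache, hashed on the caller's
 * original flow key (dst, src ^ (oif << 5), tos).
 */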
2270static inline int ip_mkroute_output_def(struct rtable **rp,
2271 struct fib_result* res,
2272 const struct flowi *fl,
2273 const struct flowi *oldflp,
2274 struct net_device *dev_out,
2275 unsigned flags)
2276{
2277	struct rtable *rth = NULL;
2278 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2279 unsigned hash;
2280 if (err == 0) {
2281 u32 tos = RT_FL_TOS(oldflp);
2282
2283 atomic_set(&rth->u.dst.__refcnt, 1);
2284
2285 hash = rt_hash_code(oldflp->fl4_dst,
2286 oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2287 err = rt_intern_hash(hash, rth, rp);
2288 }
2289
2290 return err;
2291}
2292
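/*
 * With CONFIG_IP_ROUTE_MULTIPATH_CACHED a multipath fib_info gets one
 * cached route per next hop, each annotated via multipath_set_nhinfo() so
 * the configured algorithm can pick among them on later lookups.  Without
 * that option this is simply ip_mkroute_output_def().
 */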
2293static inline int ip_mkroute_output(struct rtable** rp,
2294 struct fib_result* res,
2295 const struct flowi *fl,
2296 const struct flowi *oldflp,
2297 struct net_device *dev_out,
2298 unsigned flags)
2299{
2300#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2301 u32 tos = RT_FL_TOS(oldflp);
2302 unsigned char hop;
2303 unsigned hash;
2304 int err = -EINVAL;
2305	struct rtable *rth = NULL;
2306
2307 if (res->fi && res->fi->fib_nhs > 1) {
2308 unsigned char hopcount = res->fi->fib_nhs;
2309
2310 for (hop = 0; hop < hopcount; hop++) {
2311 struct net_device *dev2nexthop;
2312
2313 res->nh_sel = hop;
2314
2315 /* hold a work reference to the output device */
2316 dev2nexthop = FIB_RES_DEV(*res);
2317 dev_hold(dev2nexthop);
2318
2319 err = __mkroute_output(&rth, res, fl, oldflp,
2320 dev2nexthop, flags);
2321
2322 if (err != 0)
2323 goto cleanup;
2324
2325 hash = rt_hash_code(oldflp->fl4_dst,
2326 oldflp->fl4_src ^
2327 (oldflp->oif << 5), tos);
2328 err = rt_intern_hash(hash, rth, rp);
2329
2330 /* forward hop information to multipath impl. */
2331 multipath_set_nhinfo(rth,
2332 FIB_RES_NETWORK(*res),
2333 FIB_RES_NETMASK(*res),
2334 res->prefixlen,
2335 &FIB_RES_NH(*res));
2336 cleanup:
2337 /* release work reference to output device */
2338 dev_put(dev2nexthop);
2339
2340 if (err != 0)
2341 return err;
2342 }
2343 atomic_set(&(*rp)->u.dst.__refcnt, 1);
2344 return err;
2345 } else {
2346 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2347 flags);
2348 }
2349#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2350 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2351#endif
2352}
2353
2354/*
2355 * Major route resolver routine.
2356 */
2357
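/*
 * Rough flow: validate an explicit source address, honour an explicit
 * output interface, short-circuit local/loopback destinations, consult
 * the FIB, apply multipath or default-route selection, and finally build
 * and hash the cache entry under the make_route label.
 */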
2358static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2359{
2360 u32 tos = RT_FL_TOS(oldflp);
2361 struct flowi fl = { .nl_u = { .ip4_u =
2362 { .daddr = oldflp->fl4_dst,
2363 .saddr = oldflp->fl4_src,
2364 .tos = tos & IPTOS_RT_MASK,
2365 .scope = ((tos & RTO_ONLINK) ?
2366 RT_SCOPE_LINK :
2367 RT_SCOPE_UNIVERSE),
2368#ifdef CONFIG_IP_ROUTE_FWMARK
2369 .fwmark = oldflp->fl4_fwmark
2370#endif
2371 } },
2372 .iif = loopback_dev.ifindex,
2373 .oif = oldflp->oif };
2374 struct fib_result res;
2375 unsigned flags = 0;
2376 struct net_device *dev_out = NULL;
2377 int free_res = 0;
2378 int err;
2379
2380
2381 res.fi = NULL;
2382#ifdef CONFIG_IP_MULTIPLE_TABLES
2383 res.r = NULL;
2384#endif
2385
2386 if (oldflp->fl4_src) {
2387 err = -EINVAL;
2388 if (MULTICAST(oldflp->fl4_src) ||
2389 BADCLASS(oldflp->fl4_src) ||
2390 ZERONET(oldflp->fl4_src))
2391 goto out;
2392
2393 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2394 dev_out = ip_dev_find(oldflp->fl4_src);
2395 if (dev_out == NULL)
2396 goto out;
2397
2398		/* I removed a check for oif == dev_out->oif here.
2399		   It was wrong for two reasons:
2400		   1. ip_dev_find(saddr) can return the wrong iface if saddr
2401		      is assigned to multiple interfaces.
2402		   2. Moreover, we are allowed to send packets with saddr
2403		      of another iface. --ANK
2404		 */
2405
2406 if (oldflp->oif == 0
2407 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2408			/* Special hack: the user can direct multicasts
2409			   and limited broadcast via the necessary interface
2410			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2411			   This hack is not just for fun; it allows
2412			   vic, vat and friends to work.
2413			   They bind the socket to loopback, set ttl to zero
2414			   and expect that it will work.
2415			   From the viewpoint of the routing cache they are broken,
2416			   because we are not allowed to build a multicast path
2417			   with a loopback source addr (look, the routing cache
2418			   cannot know that ttl is zero, so the packet
2419			   will not leave this host and the route is valid).
2420			   Luckily, this hack is a good workaround.
2421			 */
2422
2423 fl.oif = dev_out->ifindex;
2424 goto make_route;
2425 }
2426 if (dev_out)
2427 dev_put(dev_out);
2428 dev_out = NULL;
2429 }
2430
2431
2432 if (oldflp->oif) {
2433 dev_out = dev_get_by_index(oldflp->oif);
2434 err = -ENODEV;
2435 if (dev_out == NULL)
2436 goto out;
2437 if (__in_dev_get(dev_out) == NULL) {
2438 dev_put(dev_out);
2439 goto out; /* Wrong error code */
2440 }
2441
2442 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2443 if (!fl.fl4_src)
2444 fl.fl4_src = inet_select_addr(dev_out, 0,
2445 RT_SCOPE_LINK);
2446 goto make_route;
2447 }
2448 if (!fl.fl4_src) {
2449 if (MULTICAST(oldflp->fl4_dst))
2450 fl.fl4_src = inet_select_addr(dev_out, 0,
2451 fl.fl4_scope);
2452 else if (!oldflp->fl4_dst)
2453 fl.fl4_src = inet_select_addr(dev_out, 0,
2454 RT_SCOPE_HOST);
2455 }
2456 }
2457
2458 if (!fl.fl4_dst) {
2459 fl.fl4_dst = fl.fl4_src;
2460 if (!fl.fl4_dst)
2461 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2462 if (dev_out)
2463 dev_put(dev_out);
2464 dev_out = &loopback_dev;
2465 dev_hold(dev_out);
2466 fl.oif = loopback_dev.ifindex;
2467 res.type = RTN_LOCAL;
2468 flags |= RTCF_LOCAL;
2469 goto make_route;
2470 }
2471
2472 if (fib_lookup(&fl, &res)) {
2473 res.fi = NULL;
2474 if (oldflp->oif) {
2475			/* Apparently, the routing tables are wrong. Assume
2476			   that the destination is on link.
2477
2478			   WHY? DW.
2479			   Because we are allowed to send to an iface
2480			   even if it has NO routes and NO assigned
2481			   addresses. When oif is specified, the routing
2482			   tables are looked up with only one purpose:
2483			   to catch whether the destination is gatewayed, rather
2484			   than direct. Moreover, if MSG_DONTROUTE is set,
2485			   we send the packet, ignoring both routing tables
2486			   and ifaddr state. --ANK
2487
2488
2489			   We could do it even if oif is unknown,
2490			   likely IPv6, but we do not.
2491 */
2492
2493 if (fl.fl4_src == 0)
2494 fl.fl4_src = inet_select_addr(dev_out, 0,
2495 RT_SCOPE_LINK);
2496 res.type = RTN_UNICAST;
2497 goto make_route;
2498 }
2499 if (dev_out)
2500 dev_put(dev_out);
2501 err = -ENETUNREACH;
2502 goto out;
2503 }
2504 free_res = 1;
2505
2506 if (res.type == RTN_LOCAL) {
2507 if (!fl.fl4_src)
2508 fl.fl4_src = fl.fl4_dst;
2509 if (dev_out)
2510 dev_put(dev_out);
2511 dev_out = &loopback_dev;
2512 dev_hold(dev_out);
2513 fl.oif = dev_out->ifindex;
2514 if (res.fi)
2515 fib_info_put(res.fi);
2516 res.fi = NULL;
2517 flags |= RTCF_LOCAL;
2518 goto make_route;
2519 }
2520
2521#ifdef CONFIG_IP_ROUTE_MULTIPATH
2522 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2523 fib_select_multipath(&fl, &res);
2524 else
2525#endif
2526 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2527 fib_select_default(&fl, &res);
2528
2529 if (!fl.fl4_src)
2530 fl.fl4_src = FIB_RES_PREFSRC(res);
2531
2532 if (dev_out)
2533 dev_put(dev_out);
2534 dev_out = FIB_RES_DEV(res);
2535 dev_hold(dev_out);
2536 fl.oif = dev_out->ifindex;
2537
2538
2539make_route:
2540 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2541
2542
2543 if (free_res)
2544 fib_res_put(&res);
2545 if (dev_out)
2546 dev_put(dev_out);
2547out: return err;
2548}
2549
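/*
 * Cached output lookup: an entry must match dst, src, oif and fwmark
 * exactly and agree on the tos bits covered by IPTOS_RT_MASK as well as
 * RTO_ONLINK; anything else falls back to ip_route_output_slow().
 */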
2550int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2551{
2552 unsigned hash;
2553 struct rtable *rth;
2554
2555 hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2556
2557 rcu_read_lock_bh();
2558 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2559 rth = rcu_dereference(rth->u.rt_next)) {
2560 if (rth->fl.fl4_dst == flp->fl4_dst &&
2561 rth->fl.fl4_src == flp->fl4_src &&
2562 rth->fl.iif == 0 &&
2563 rth->fl.oif == flp->oif &&
2564#ifdef CONFIG_IP_ROUTE_FWMARK
2565 rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2566#endif
2567 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2568 (IPTOS_RT_MASK | RTO_ONLINK))) {
2569
2570 /* check for multipath routes and choose one if
2571 * necessary
2572 */
2573 if (multipath_select_route(flp, rth, rp)) {
2574 dst_hold(&(*rp)->u.dst);
2575 RT_CACHE_STAT_INC(out_hit);
2576 rcu_read_unlock_bh();
2577 return 0;
2578 }
2579
2580 rth->u.dst.lastuse = jiffies;
2581 dst_hold(&rth->u.dst);
2582 rth->u.dst.__use++;
2583 RT_CACHE_STAT_INC(out_hit);
2584 rcu_read_unlock_bh();
2585 *rp = rth;
2586 return 0;
2587 }
2588 RT_CACHE_STAT_INC(out_hlist_search);
2589 }
2590 rcu_read_unlock_bh();
2591
2592 return ip_route_output_slow(rp, flp);
2593}
2594
2595int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2596{
2597 int err;
2598
2599 if ((err = __ip_route_output_key(rp, flp)) != 0)
2600 return err;
2601
2602 if (flp->proto) {
2603 if (!flp->fl4_src)
2604 flp->fl4_src = (*rp)->rt_src;
2605 if (!flp->fl4_dst)
2606 flp->fl4_dst = (*rp)->rt_dst;
2607 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2608 }
2609
2610 return 0;
2611}
2612
2613int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2614{
2615 return ip_route_output_flow(rp, flp, NULL, 0);
2616}
2617
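/*
 * rt_fill_info() converts a cached rtable into an RTM_NEWROUTE netlink
 * message (destination, source, interfaces, gateway, metrics and cache
 * statistics, plus a multicast-forwarding lookup for input routes).  It
 * is shared by inet_rtm_getroute() and ip_rt_dump().
 */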
2618static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2619			int nowait, unsigned int flags)
2620{
2621 struct rtable *rt = (struct rtable*)skb->dst;
2622 struct rtmsg *r;
2623 struct nlmsghdr *nlh;
2624 unsigned char *b = skb->tail;
2625 struct rta_cacheinfo ci;
2626#ifdef CONFIG_IP_MROUTE
2627 struct rtattr *eptr;
2628#endif
2629	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2630	r = NLMSG_DATA(nlh);
2631 r->rtm_family = AF_INET;
2632 r->rtm_dst_len = 32;
2633 r->rtm_src_len = 0;
2634 r->rtm_tos = rt->fl.fl4_tos;
2635 r->rtm_table = RT_TABLE_MAIN;
2636 r->rtm_type = rt->rt_type;
2637 r->rtm_scope = RT_SCOPE_UNIVERSE;
2638 r->rtm_protocol = RTPROT_UNSPEC;
2639 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2640 if (rt->rt_flags & RTCF_NOTIFY)
2641 r->rtm_flags |= RTM_F_NOTIFY;
2642 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2643 if (rt->fl.fl4_src) {
2644 r->rtm_src_len = 32;
2645 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2646 }
2647 if (rt->u.dst.dev)
2648 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2649#ifdef CONFIG_NET_CLS_ROUTE
2650 if (rt->u.dst.tclassid)
2651 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2652#endif
2653#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2654 if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2655 __u32 alg = rt->rt_multipath_alg;
2656
2657 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2658 }
2659#endif
2660 if (rt->fl.iif)
2661 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2662 else if (rt->rt_src != rt->fl.fl4_src)
2663 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2664 if (rt->rt_dst != rt->rt_gateway)
2665 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2666 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2667 goto rtattr_failure;
2668 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2669 ci.rta_used = rt->u.dst.__use;
2670 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2671 if (rt->u.dst.expires)
2672 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2673 else
2674 ci.rta_expires = 0;
2675 ci.rta_error = rt->u.dst.error;
2676 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
2677 if (rt->peer) {
2678 ci.rta_id = rt->peer->ip_id_count;
2679 if (rt->peer->tcp_ts_stamp) {
2680 ci.rta_ts = rt->peer->tcp_ts;
2681 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2682 }
2683 }
2684#ifdef CONFIG_IP_MROUTE
2685 eptr = (struct rtattr*)skb->tail;
2686#endif
2687 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2688 if (rt->fl.iif) {
2689#ifdef CONFIG_IP_MROUTE
2690 u32 dst = rt->rt_dst;
2691
2692 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2693 ipv4_devconf.mc_forwarding) {
2694 int err = ipmr_get_route(skb, r, nowait);
2695 if (err <= 0) {
2696 if (!nowait) {
2697 if (err == 0)
2698 return 0;
2699 goto nlmsg_failure;
2700 } else {
2701 if (err == -EMSGSIZE)
2702 goto nlmsg_failure;
2703 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2704 }
2705 }
2706 } else
2707#endif
2708 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2709 }
2710
2711 nlh->nlmsg_len = skb->tail - b;
2712 return skb->len;
2713
2714nlmsg_failure:
2715rtattr_failure:
2716 skb_trim(skb, b - skb->data);
2717 return -1;
2718}
2719
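/*
 * RTM_GETROUTE handler: allocate a dummy skb, resolve the route either
 * with ip_route_input() when an input interface is supplied (simulating
 * a received packet) or with ip_route_output_key() otherwise, then
 * unicast the rt_fill_info() answer back to the requester.
 */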
2720int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2721{
2722 struct rtattr **rta = arg;
2723 struct rtmsg *rtm = NLMSG_DATA(nlh);
2724 struct rtable *rt = NULL;
2725 u32 dst = 0;
2726 u32 src = 0;
2727 int iif = 0;
2728 int err = -ENOBUFS;
2729 struct sk_buff *skb;
2730
2731 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2732 if (!skb)
2733 goto out;
2734
2735	/* Reserve room for dummy headers; this skb can pass
2736	   through a good chunk of the routing engine.
2737	 */
2738 skb->mac.raw = skb->data;
2739 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2740
2741 if (rta[RTA_SRC - 1])
2742 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2743 if (rta[RTA_DST - 1])
2744 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2745 if (rta[RTA_IIF - 1])
2746 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2747
2748 if (iif) {
2749 struct net_device *dev = __dev_get_by_index(iif);
2750 err = -ENODEV;
2751 if (!dev)
2752 goto out_free;
2753 skb->protocol = htons(ETH_P_IP);
2754 skb->dev = dev;
2755 local_bh_disable();
2756 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2757 local_bh_enable();
2758 rt = (struct rtable*)skb->dst;
2759 if (!err && rt->u.dst.error)
2760 err = -rt->u.dst.error;
2761 } else {
2762 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2763 .saddr = src,
2764 .tos = rtm->rtm_tos } } };
2765 int oif = 0;
2766 if (rta[RTA_OIF - 1])
2767 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2768 fl.oif = oif;
2769 err = ip_route_output_key(&rt, &fl);
2770 }
2771 if (err)
2772 goto out_free;
2773
2774 skb->dst = &rt->u.dst;
2775 if (rtm->rtm_flags & RTM_F_NOTIFY)
2776 rt->rt_flags |= RTCF_NOTIFY;
2777
2778 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2779
2780 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2781				RTM_NEWROUTE, 0, 0);
2782 if (!err)
2783 goto out_free;
2784 if (err < 0) {
2785 err = -EMSGSIZE;
2786 goto out_free;
2787 }
2788
2789 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2790 if (err > 0)
2791 err = 0;
2792out: return err;
2793
2794out_free:
2795 kfree_skb(skb);
2796 goto out;
2797}
2798
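/*
 * Netlink dump of the whole route cache: walk every hash bucket under
 * rcu_read_lock_bh() and emit one RTM_NEWROUTE message per entry, saving
 * the current bucket and index in cb->args so the dump can be resumed.
 */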
2799int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2800{
2801 struct rtable *rt;
2802 int h, s_h;
2803 int idx, s_idx;
2804
2805 s_h = cb->args[0];
2806 s_idx = idx = cb->args[1];
2807 for (h = 0; h <= rt_hash_mask; h++) {
2808 if (h < s_h) continue;
2809 if (h > s_h)
2810 s_idx = 0;
2811 rcu_read_lock_bh();
2812 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2813 rt = rcu_dereference(rt->u.rt_next), idx++) {
2814 if (idx < s_idx)
2815 continue;
2816 skb->dst = dst_clone(&rt->u.dst);
2817 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2818 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2819 1, NLM_F_MULTI) <= 0) {
2820 dst_release(xchg(&skb->dst, NULL));
2821 rcu_read_unlock_bh();
2822 goto done;
2823 }
2824 dst_release(xchg(&skb->dst, NULL));
2825 }
2826 rcu_read_unlock_bh();
2827 }
2828
2829done:
2830 cb->args[0] = h;
2831 cb->args[1] = idx;
2832 return skb->len;
2833}
2834
2835void ip_rt_multicast_event(struct in_device *in_dev)
2836{
2837 rt_cache_flush(0);
2838}
2839
2840#ifdef CONFIG_SYSCTL
2841static int flush_delay;
2842
2843static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2844 struct file *filp, void __user *buffer,
2845 size_t *lenp, loff_t *ppos)
2846{
2847 if (write) {
2848 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2849 rt_cache_flush(flush_delay);
2850 return 0;
2851 }
2852
2853 return -EINVAL;
2854}
2855
2856static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2857 int __user *name,
2858 int nlen,
2859 void __user *oldval,
2860 size_t __user *oldlenp,
2861 void __user *newval,
2862 size_t newlen,
2863 void **context)
2864{
2865 int delay;
2866 if (newlen != sizeof(int))
2867 return -EINVAL;
2868 if (get_user(delay, (int __user *)newval))
2869 return -EFAULT;
2870 rt_cache_flush(delay);
2871 return 0;
2872}
2873
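/*
 * /proc/sys/net/ipv4/route/ tunables.  Note that "flush" is write-only
 * (mode 0200): writing a delay triggers rt_cache_flush(), while reading
 * it returns -EINVAL.
 */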
2874ctl_table ipv4_route_table[] = {
2875 {
2876 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2877 .procname = "flush",
2878 .data = &flush_delay,
2879 .maxlen = sizeof(int),
2880		.mode		= 0200,
2881 .proc_handler = &ipv4_sysctl_rtcache_flush,
2882 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2883 },
2884 {
2885 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2886 .procname = "min_delay",
2887 .data = &ip_rt_min_delay,
2888 .maxlen = sizeof(int),
2889 .mode = 0644,
2890 .proc_handler = &proc_dointvec_jiffies,
2891 .strategy = &sysctl_jiffies,
2892 },
2893 {
2894 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2895 .procname = "max_delay",
2896 .data = &ip_rt_max_delay,
2897 .maxlen = sizeof(int),
2898 .mode = 0644,
2899 .proc_handler = &proc_dointvec_jiffies,
2900 .strategy = &sysctl_jiffies,
2901 },
2902 {
2903 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2904 .procname = "gc_thresh",
2905 .data = &ipv4_dst_ops.gc_thresh,
2906 .maxlen = sizeof(int),
2907 .mode = 0644,
2908 .proc_handler = &proc_dointvec,
2909 },
2910 {
2911 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2912 .procname = "max_size",
2913 .data = &ip_rt_max_size,
2914 .maxlen = sizeof(int),
2915 .mode = 0644,
2916 .proc_handler = &proc_dointvec,
2917 },
2918 {
2919 /* Deprecated. Use gc_min_interval_ms */
2920
2921 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2922 .procname = "gc_min_interval",
2923 .data = &ip_rt_gc_min_interval,
2924 .maxlen = sizeof(int),
2925 .mode = 0644,
2926 .proc_handler = &proc_dointvec_jiffies,
2927 .strategy = &sysctl_jiffies,
2928 },
2929 {
2930 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2931 .procname = "gc_min_interval_ms",
2932 .data = &ip_rt_gc_min_interval,
2933 .maxlen = sizeof(int),
2934 .mode = 0644,
2935 .proc_handler = &proc_dointvec_ms_jiffies,
2936 .strategy = &sysctl_ms_jiffies,
2937 },
2938 {
2939 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2940 .procname = "gc_timeout",
2941 .data = &ip_rt_gc_timeout,
2942 .maxlen = sizeof(int),
2943 .mode = 0644,
2944 .proc_handler = &proc_dointvec_jiffies,
2945 .strategy = &sysctl_jiffies,
2946 },
2947 {
2948 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2949 .procname = "gc_interval",
2950 .data = &ip_rt_gc_interval,
2951 .maxlen = sizeof(int),
2952 .mode = 0644,
2953 .proc_handler = &proc_dointvec_jiffies,
2954 .strategy = &sysctl_jiffies,
2955 },
2956 {
2957 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2958 .procname = "redirect_load",
2959 .data = &ip_rt_redirect_load,
2960 .maxlen = sizeof(int),
2961 .mode = 0644,
2962 .proc_handler = &proc_dointvec,
2963 },
2964 {
2965 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2966 .procname = "redirect_number",
2967 .data = &ip_rt_redirect_number,
2968 .maxlen = sizeof(int),
2969 .mode = 0644,
2970 .proc_handler = &proc_dointvec,
2971 },
2972 {
2973 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2974 .procname = "redirect_silence",
2975 .data = &ip_rt_redirect_silence,
2976 .maxlen = sizeof(int),
2977 .mode = 0644,
2978 .proc_handler = &proc_dointvec,
2979 },
2980 {
2981 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2982 .procname = "error_cost",
2983 .data = &ip_rt_error_cost,
2984 .maxlen = sizeof(int),
2985 .mode = 0644,
2986 .proc_handler = &proc_dointvec,
2987 },
2988 {
2989 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2990 .procname = "error_burst",
2991 .data = &ip_rt_error_burst,
2992 .maxlen = sizeof(int),
2993 .mode = 0644,
2994 .proc_handler = &proc_dointvec,
2995 },
2996 {
2997 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2998 .procname = "gc_elasticity",
2999 .data = &ip_rt_gc_elasticity,
3000 .maxlen = sizeof(int),
3001 .mode = 0644,
3002 .proc_handler = &proc_dointvec,
3003 },
3004 {
3005 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3006 .procname = "mtu_expires",
3007 .data = &ip_rt_mtu_expires,
3008 .maxlen = sizeof(int),
3009 .mode = 0644,
3010 .proc_handler = &proc_dointvec_jiffies,
3011 .strategy = &sysctl_jiffies,
3012 },
3013 {
3014 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3015 .procname = "min_pmtu",
3016 .data = &ip_rt_min_pmtu,
3017 .maxlen = sizeof(int),
3018 .mode = 0644,
3019 .proc_handler = &proc_dointvec,
3020 },
3021 {
3022 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3023 .procname = "min_adv_mss",
3024 .data = &ip_rt_min_advmss,
3025 .maxlen = sizeof(int),
3026 .mode = 0644,
3027 .proc_handler = &proc_dointvec,
3028 },
3029 {
3030 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3031 .procname = "secret_interval",
3032 .data = &ip_rt_secret_interval,
3033 .maxlen = sizeof(int),
3034 .mode = 0644,
3035 .proc_handler = &proc_dointvec_jiffies,
3036 .strategy = &sysctl_jiffies,
3037 },
3038 { .ctl_name = 0 }
3039};
3040#endif
3041
3042#ifdef CONFIG_NET_CLS_ROUTE
3043struct ip_rt_acct *ip_rt_acct;
3044
3045/* This code sucks. But you should have seen it before! --RR */
3046
3047/* IP route accounting ptr for this logical cpu number. */
3048#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
3049
3050#ifdef CONFIG_PROC_FS
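/*
 * ip_rt_acct_read() backs /proc/net/rt_acct: it copies the 256-entry
 * accounting table of cpu 0 into the caller's buffer and then adds in
 * the corresponding u32 counters of every other cpu, word by word.
 */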
3051static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3052 int length, int *eof, void *data)
3053{
3054 unsigned int i;
3055
3056 if ((offset & 3) || (length & 3))
3057 return -EIO;
3058
3059 if (offset >= sizeof(struct ip_rt_acct) * 256) {
3060 *eof = 1;
3061 return 0;
3062 }
3063
3064 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3065 length = sizeof(struct ip_rt_acct) * 256 - offset;
3066 *eof = 1;
3067 }
3068
3069 offset /= sizeof(u32);
3070
3071 if (length > 0) {
3072 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3073 u32 *dst = (u32 *) buffer;
3074
3075 /* Copy first cpu. */
3076 *start = buffer;
3077 memcpy(dst, src, length);
3078
3079 /* Add the other cpus in, one int at a time */
3080 for_each_cpu(i) {
3081 unsigned int j;
3082
3083 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3084
3085 for (j = 0; j < length/4; j++)
3086 dst[j] += src[j];
3087 }
3088 }
3089 return length;
3090}
3091#endif /* CONFIG_PROC_FS */
3092#endif /* CONFIG_NET_CLS_ROUTE */
3093
3094static __initdata unsigned long rhash_entries;
3095static int __init set_rhash_entries(char *str)
3096{
3097 if (!str)
3098 return 0;
3099 rhash_entries = simple_strtoul(str, &str, 0);
3100 return 1;
3101}
3102__setup("rhash_entries=", set_rhash_entries);
3103
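/*
 * ip_rt_init() sizes the route cache hash table from available memory
 * (overridable with the rhash_entries= boot parameter), rounds the bucket
 * count down to a power of two, initialises the per-bucket locks and
 * per-cpu statistics, and arms the periodic-expire and secret-rebuild
 * timers (the flush timer is only initialised here).
 */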
3104int __init ip_rt_init(void)
3105{
3106	int order, goal, rc = 0;
3107
3108 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3109 (jiffies ^ (jiffies >> 7)));
3110
3111#ifdef CONFIG_NET_CLS_ROUTE
3112 for (order = 0;
3113 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3114 /* NOTHING */;
3115 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3116 if (!ip_rt_acct)
3117 panic("IP: failed to allocate ip_rt_acct\n");
3118 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3119#endif
3120
3121 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3122 sizeof(struct rtable),
3123 0, SLAB_HWCACHE_ALIGN,
3124 NULL, NULL);
3125
3126 if (!ipv4_dst_ops.kmem_cachep)
3127 panic("IP: failed to allocate ip_dst_cache\n");
3128
3129 goal = num_physpages >> (26 - PAGE_SHIFT);
3130 if (rhash_entries)
3131 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
3132 for (order = 0; (1UL << order) < goal; order++)
3133 /* NOTHING */;
3134
3135 do {
3136 rt_hash_mask = (1UL << order) * PAGE_SIZE /
3137 sizeof(struct rt_hash_bucket);
3138 while (rt_hash_mask & (rt_hash_mask - 1))
3139 rt_hash_mask--;
3140 rt_hash_table = (struct rt_hash_bucket *)
3141 __get_free_pages(GFP_ATOMIC, order);
3142 } while (rt_hash_table == NULL && --order > 0);
3143
3144 if (!rt_hash_table)
3145 panic("Failed to allocate IP route cache hash table\n");
3146
3147 printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
3148 rt_hash_mask,
3149 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
3150
3151 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
3152 /* NOTHING */;
3153
3154 rt_hash_mask--;
3155 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3156 rt_hash_lock_init();
3157
3158 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3159 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3160
3161 rt_cache_stat = alloc_percpu(struct rt_cache_stat);
3162 if (!rt_cache_stat)
3163 return -ENOMEM;
3164
3165 devinet_init();
3166 ip_fib_init();
3167
3168 init_timer(&rt_flush_timer);
3169 rt_flush_timer.function = rt_run_flush;
3170 init_timer(&rt_periodic_timer);
3171 rt_periodic_timer.function = rt_check_expire;
3172 init_timer(&rt_secret_timer);
3173 rt_secret_timer.function = rt_secret_rebuild;
3174
3175	/* All the timers started at system startup tend
3176	   to synchronize. Perturb them a bit.
3177	 */
3178 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3179 ip_rt_gc_interval;
3180 add_timer(&rt_periodic_timer);
3181
3182 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3183 ip_rt_secret_interval;
3184 add_timer(&rt_secret_timer);
3185
3186#ifdef CONFIG_PROC_FS
3187 {
3188 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3189 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3190 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3191 proc_net_stat))) {
3192 free_percpu(rt_cache_stat);
3193 return -ENOMEM;
3194 }
3195 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3196 }
3197#ifdef CONFIG_NET_CLS_ROUTE
3198 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3199#endif
3200#endif
3201#ifdef CONFIG_XFRM
3202 xfrm_init();
3203 xfrm4_init();
3204#endif
3205 return rc;
3206}
3207
3208EXPORT_SYMBOL(__ip_select_ident);
3209EXPORT_SYMBOL(ip_route_input);
3210EXPORT_SYMBOL(ip_route_output_key);