net-next-2.6.git: net/ipv4/route.c
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
39 *
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
65#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
71#include <linux/mm.h>
72#include <linux/bootmem.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/workqueue.h>
83#include <linux/skbuff.h>
84#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
93#include <linux/slab.h>
94#include <net/dst.h>
95#include <net/net_namespace.h>
96#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
106#include <net/netevent.h>
107#include <net/rtnetlink.h>
108#ifdef CONFIG_SYSCTL
109#include <linux/sysctl.h>
110#endif
111
112#define RT_FL_TOS(oldflp) \
113 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114
115#define IP_MAX_MTU 0xFFF0
116
117#define RT_GC_TIMEOUT (300*HZ)
118
119static int ip_rt_max_size;
120static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
121static int ip_rt_gc_interval __read_mostly = 60 * HZ;
122static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
123static int ip_rt_redirect_number __read_mostly = 9;
124static int ip_rt_redirect_load __read_mostly = HZ / 50;
125static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126static int ip_rt_error_cost __read_mostly = HZ;
127static int ip_rt_error_burst __read_mostly = 5 * HZ;
128static int ip_rt_gc_elasticity __read_mostly = 8;
129static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
130static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131static int ip_rt_min_advmss __read_mostly = 256;
132static int rt_chain_length_max __read_mostly = 20;
133
134static struct delayed_work expires_work;
135static unsigned long expires_ljiffies;
136
137/*
138 * Interface to generic destination cache.
139 */
140
141static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142static void ipv4_dst_destroy(struct dst_entry *dst);
143static void ipv4_dst_ifdown(struct dst_entry *dst,
144 struct net_device *dev, int how);
145static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146static void ipv4_link_failure(struct sk_buff *skb);
147static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148static int rt_garbage_collect(struct dst_ops *ops);
149
150
151static struct dst_ops ipv4_dst_ops = {
152 .family = AF_INET,
153 .protocol = cpu_to_be16(ETH_P_IP),
154 .gc = rt_garbage_collect,
155 .check = ipv4_dst_check,
156 .destroy = ipv4_dst_destroy,
157 .ifdown = ipv4_dst_ifdown,
158 .negative_advice = ipv4_negative_advice,
159 .link_failure = ipv4_link_failure,
160 .update_pmtu = ip_rt_update_pmtu,
161 .local_out = __ip_local_out,
162 .entries = ATOMIC_INIT(0),
163};
164
165#define ECN_OR_COST(class) TC_PRIO_##class
166
167const __u8 ip_tos2prio[16] = {
168 TC_PRIO_BESTEFFORT,
169 ECN_OR_COST(FILLER),
170 TC_PRIO_BESTEFFORT,
171 ECN_OR_COST(BESTEFFORT),
172 TC_PRIO_BULK,
173 ECN_OR_COST(BULK),
174 TC_PRIO_BULK,
175 ECN_OR_COST(BULK),
176 TC_PRIO_INTERACTIVE,
177 ECN_OR_COST(INTERACTIVE),
178 TC_PRIO_INTERACTIVE,
179 ECN_OR_COST(INTERACTIVE),
180 TC_PRIO_INTERACTIVE_BULK,
181 ECN_OR_COST(INTERACTIVE_BULK),
182 TC_PRIO_INTERACTIVE_BULK,
183 ECN_OR_COST(INTERACTIVE_BULK)
184};
185
186
187/*
188 * Route cache.
189 */
190
191/* The locking scheme is rather straightforward:
192 *
193 * 1) Read-Copy Update protects the buckets of the central route hash.
194 * 2) Only writers remove entries, and they hold the lock
195 * as they look at rtable reference counts.
196 * 3) Only readers acquire references to rtable entries,
197 * they do so with atomic increments and with the
198 * lock held.
199 */
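/*
 * A condensed, illustrative sketch of how those rules play out later in
 * this file (this is a reading of the lookup/eviction patterns used below,
 * not a separate API; "flp" is just a hypothetical flow key):
 *
 *	// reader: lockless lookup under the RCU BH read lock
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference_bh(rth->u.dst.rt_next)) {
 *		if (compare_keys(&rth->fl, &flp)) {
 *			dst_use(&rth->u.dst, jiffies);	// atomic refcount bump
 *			break;
 *		}
 *	}
 *	rcu_read_unlock_bh();
 *
 *	// writer: unlink under the per-bucket spinlock, free via RCU
 *	spin_lock_bh(rt_hash_lock_addr(hash));
 *	*rthp = rth->u.dst.rt_next;
 *	rt_free(rth);		// defers the free to call_rcu_bh()
 *	spin_unlock_bh(rt_hash_lock_addr(hash));
 */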
200
201struct rt_hash_bucket {
202 struct rtable *chain;
203};
204
205#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
206 defined(CONFIG_PROVE_LOCKING)
207/*
208 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
209 * The size of this table is a power of two and depends on the number of CPUs.
210 * (On lockdep we have a quite big spinlock_t, so keep the size down there.)
211 */
212#ifdef CONFIG_LOCKDEP
213# define RT_HASH_LOCK_SZ 256
214#else
215# if NR_CPUS >= 32
216# define RT_HASH_LOCK_SZ 4096
217# elif NR_CPUS >= 16
218# define RT_HASH_LOCK_SZ 2048
219# elif NR_CPUS >= 8
220# define RT_HASH_LOCK_SZ 1024
221# elif NR_CPUS >= 4
222# define RT_HASH_LOCK_SZ 512
223# else
224# define RT_HASH_LOCK_SZ 256
225# endif
226#endif
227
228static spinlock_t *rt_hash_locks;
229# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
230
231static __init void rt_hash_lock_init(void)
232{
233 int i;
234
235 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
236 GFP_KERNEL);
237 if (!rt_hash_locks)
238 panic("IP: failed to allocate rt_hash_locks\n");
239
240 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
241 spin_lock_init(&rt_hash_locks[i]);
242}
243#else
244# define rt_hash_lock_addr(slot) NULL
245
246static inline void rt_hash_lock_init(void)
247{
248}
249#endif
250
251static struct rt_hash_bucket *rt_hash_table __read_mostly;
252static unsigned rt_hash_mask __read_mostly;
253static unsigned int rt_hash_log __read_mostly;
254
255static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
256#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
257
258static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
259 int genid)
260{
261 return jhash_3words((__force u32)daddr, (__force u32)saddr,
262 idx, genid)
263 & rt_hash_mask;
264}
265
266static inline int rt_genid(struct net *net)
267{
268 return atomic_read(&net->ipv4.rt_genid);
269}
270
271#ifdef CONFIG_PROC_FS
272struct rt_cache_iter_state {
273 struct seq_net_private p;
274 int bucket;
275 int genid;
276};
277
278static struct rtable *rt_cache_get_first(struct seq_file *seq)
279{
280 struct rt_cache_iter_state *st = seq->private;
281 struct rtable *r = NULL;
282
283 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
284 if (!rt_hash_table[st->bucket].chain)
285 continue;
286 rcu_read_lock_bh();
287 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
288 while (r) {
289 if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
290 r->rt_genid == st->genid)
291 return r;
292 r = rcu_dereference_bh(r->u.dst.rt_next);
293 }
294 rcu_read_unlock_bh();
295 }
296 return r;
297}
298
299static struct rtable *__rt_cache_get_next(struct seq_file *seq,
300 struct rtable *r)
301{
302 struct rt_cache_iter_state *st = seq->private;
303
304 r = r->u.dst.rt_next;
305 while (!r) {
306 rcu_read_unlock_bh();
307 do {
308 if (--st->bucket < 0)
309 return NULL;
310 } while (!rt_hash_table[st->bucket].chain);
311 rcu_read_lock_bh();
312 r = rt_hash_table[st->bucket].chain;
313 }
314 return rcu_dereference_bh(r);
315}
316
317static struct rtable *rt_cache_get_next(struct seq_file *seq,
318 struct rtable *r)
319{
320 struct rt_cache_iter_state *st = seq->private;
321 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
322 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
323 continue;
324 if (r->rt_genid == st->genid)
325 break;
326 }
327 return r;
328}
329
330static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
331{
332 struct rtable *r = rt_cache_get_first(seq);
333
334 if (r)
335 while (pos && (r = rt_cache_get_next(seq, r)))
336 --pos;
337 return pos ? NULL : r;
338}
339
340static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
341{
342 struct rt_cache_iter_state *st = seq->private;
343 if (*pos)
344 return rt_cache_get_idx(seq, *pos - 1);
345 st->genid = rt_genid(seq_file_net(seq));
346 return SEQ_START_TOKEN;
347}
348
349static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
350{
351 struct rtable *r;
352
353 if (v == SEQ_START_TOKEN)
354 r = rt_cache_get_first(seq);
355 else
356 r = rt_cache_get_next(seq, v);
357 ++*pos;
358 return r;
359}
360
361static void rt_cache_seq_stop(struct seq_file *seq, void *v)
362{
363 if (v && v != SEQ_START_TOKEN)
364 rcu_read_unlock_bh();
365}
366
367static int rt_cache_seq_show(struct seq_file *seq, void *v)
368{
369 if (v == SEQ_START_TOKEN)
370 seq_printf(seq, "%-127s\n",
371 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
372 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
373 "HHUptod\tSpecDst");
374 else {
375 struct rtable *r = v;
376 int len;
377
378 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
379 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
380 r->u.dst.dev ? r->u.dst.dev->name : "*",
381 (__force u32)r->rt_dst,
382 (__force u32)r->rt_gateway,
383 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
384 r->u.dst.__use, 0, (__force u32)r->rt_src,
385 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
386 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
387 dst_metric(&r->u.dst, RTAX_WINDOW),
388 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
389 dst_metric(&r->u.dst, RTAX_RTTVAR)),
390 r->fl.fl4_tos,
391 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
392 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
393 dev_queue_xmit) : 0,
394 r->rt_spec_dst, &len);
395
396 seq_printf(seq, "%*s\n", 127 - len, "");
397 }
398 return 0;
399}
400
401static const struct seq_operations rt_cache_seq_ops = {
402 .start = rt_cache_seq_start,
403 .next = rt_cache_seq_next,
404 .stop = rt_cache_seq_stop,
405 .show = rt_cache_seq_show,
406};
407
408static int rt_cache_seq_open(struct inode *inode, struct file *file)
409{
410 return seq_open_net(inode, file, &rt_cache_seq_ops,
411 sizeof(struct rt_cache_iter_state));
412}
413
414static const struct file_operations rt_cache_seq_fops = {
415 .owner = THIS_MODULE,
416 .open = rt_cache_seq_open,
417 .read = seq_read,
418 .llseek = seq_lseek,
419 .release = seq_release_net,
420};
421
422
423static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
424{
425 int cpu;
426
427 if (*pos == 0)
428 return SEQ_START_TOKEN;
429
430 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
431 if (!cpu_possible(cpu))
432 continue;
433 *pos = cpu+1;
434 return &per_cpu(rt_cache_stat, cpu);
435 }
436 return NULL;
437}
438
439static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
440{
441 int cpu;
442
443 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
444 if (!cpu_possible(cpu))
445 continue;
446 *pos = cpu+1;
447 return &per_cpu(rt_cache_stat, cpu);
448 }
449 return NULL;
450
451}
452
453static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
454{
455
456}
457
458static int rt_cpu_seq_show(struct seq_file *seq, void *v)
459{
460 struct rt_cache_stat *st = v;
461
462 if (v == SEQ_START_TOKEN) {
463 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
464 return 0;
465 }
466
467 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
468 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
469 atomic_read(&ipv4_dst_ops.entries),
470 st->in_hit,
471 st->in_slow_tot,
472 st->in_slow_mc,
473 st->in_no_route,
474 st->in_brd,
475 st->in_martian_dst,
476 st->in_martian_src,
477
478 st->out_hit,
479 st->out_slow_tot,
480 st->out_slow_mc,
481
482 st->gc_total,
483 st->gc_ignored,
484 st->gc_goal_miss,
485 st->gc_dst_overflow,
486 st->in_hlist_search,
487 st->out_hlist_search
488 );
489 return 0;
490}
491
492static const struct seq_operations rt_cpu_seq_ops = {
493 .start = rt_cpu_seq_start,
494 .next = rt_cpu_seq_next,
495 .stop = rt_cpu_seq_stop,
496 .show = rt_cpu_seq_show,
497};
498
499
500static int rt_cpu_seq_open(struct inode *inode, struct file *file)
501{
502 return seq_open(file, &rt_cpu_seq_ops);
503}
504
505static const struct file_operations rt_cpu_seq_fops = {
506 .owner = THIS_MODULE,
507 .open = rt_cpu_seq_open,
508 .read = seq_read,
509 .llseek = seq_lseek,
510 .release = seq_release,
511};
512
513#ifdef CONFIG_NET_CLS_ROUTE
514static int rt_acct_proc_show(struct seq_file *m, void *v)
515{
516 struct ip_rt_acct *dst, *src;
517 unsigned int i, j;
518
519 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
520 if (!dst)
521 return -ENOMEM;
522
523 for_each_possible_cpu(i) {
524 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
525 for (j = 0; j < 256; j++) {
526 dst[j].o_bytes += src[j].o_bytes;
527 dst[j].o_packets += src[j].o_packets;
528 dst[j].i_bytes += src[j].i_bytes;
529 dst[j].i_packets += src[j].i_packets;
530 }
531 }
532
533 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
534 kfree(dst);
535 return 0;
536}
537
538static int rt_acct_proc_open(struct inode *inode, struct file *file)
539{
540 return single_open(file, rt_acct_proc_show, NULL);
541}
542
543static const struct file_operations rt_acct_proc_fops = {
544 .owner = THIS_MODULE,
545 .open = rt_acct_proc_open,
546 .read = seq_read,
547 .llseek = seq_lseek,
548 .release = single_release,
549};
550#endif
551
552static int __net_init ip_rt_do_proc_init(struct net *net)
553{
554 struct proc_dir_entry *pde;
555
556 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
557 &rt_cache_seq_fops);
558 if (!pde)
559 goto err1;
560
561 pde = proc_create("rt_cache", S_IRUGO,
562 net->proc_net_stat, &rt_cpu_seq_fops);
563 if (!pde)
564 goto err2;
565
566#ifdef CONFIG_NET_CLS_ROUTE
567 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
568 if (!pde)
569 goto err3;
570#endif
571 return 0;
572
573#ifdef CONFIG_NET_CLS_ROUTE
574err3:
575 remove_proc_entry("rt_cache", net->proc_net_stat);
576#endif
577err2:
578 remove_proc_entry("rt_cache", net->proc_net);
579err1:
580 return -ENOMEM;
581}
582
583static void __net_exit ip_rt_do_proc_exit(struct net *net)
584{
585 remove_proc_entry("rt_cache", net->proc_net_stat);
586 remove_proc_entry("rt_cache", net->proc_net);
587#ifdef CONFIG_NET_CLS_ROUTE
588 remove_proc_entry("rt_acct", net->proc_net);
589#endif
590}
591
592static struct pernet_operations ip_rt_proc_ops __net_initdata = {
593 .init = ip_rt_do_proc_init,
594 .exit = ip_rt_do_proc_exit,
595};
596
597static int __init ip_rt_proc_init(void)
598{
599 return register_pernet_subsys(&ip_rt_proc_ops);
600}
601
602#else
603static inline int ip_rt_proc_init(void)
604{
605 return 0;
606}
607#endif /* CONFIG_PROC_FS */
608
609static inline void rt_free(struct rtable *rt)
610{
611 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
612}
613
614static inline void rt_drop(struct rtable *rt)
615{
616 ip_rt_put(rt);
617 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
618}
619
620static inline int rt_fast_clean(struct rtable *rth)
621{
622 /* Kill broadcast/multicast entries very aggressively, if they
623 collide in the hash table with more useful entries */
624 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
625 rth->fl.iif && rth->u.dst.rt_next;
626}
627
628static inline int rt_valuable(struct rtable *rth)
629{
630 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
631 rth->u.dst.expires;
632}
633
634static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
635{
636 unsigned long age;
637 int ret = 0;
638
639 if (atomic_read(&rth->u.dst.__refcnt))
640 goto out;
641
642 ret = 1;
643 if (rth->u.dst.expires &&
644 time_after_eq(jiffies, rth->u.dst.expires))
645 goto out;
646
647 age = jiffies - rth->u.dst.lastuse;
648 ret = 0;
649 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
650 (age <= tmo2 && rt_valuable(rth)))
651 goto out;
652 ret = 1;
653out: return ret;
654}
655
656/* Bits of score are:
657 * 31: very valuable
658 * 30: not quite useless
659 * 29..0: usage counter
660 */
661static inline u32 rt_score(struct rtable *rt)
662{
663 u32 score = jiffies - rt->u.dst.lastuse;
664
665 score = ~score & ~(3<<30);
666
667 if (rt_valuable(rt))
668 score |= (1<<31);
669
670 if (!rt->fl.iif ||
671 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
672 score |= (1<<30);
673
674 return score;
675}
676
677static inline bool rt_caching(const struct net *net)
678{
679 return net->ipv4.current_rt_cache_rebuild_count <=
680 net->ipv4.sysctl_rt_cache_rebuild_count;
681}
682
683static inline bool compare_hash_inputs(const struct flowi *fl1,
684 const struct flowi *fl2)
685{
686 return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
687 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
688 (fl1->iif ^ fl2->iif)) == 0);
689}
690
691static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
692{
693 return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
694 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
695 (fl1->mark ^ fl2->mark) |
696 (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
697 (fl1->oif ^ fl2->oif) |
698 (fl1->iif ^ fl2->iif)) == 0;
699}
700
701static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
702{
703 return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
704}
705
706static inline int rt_is_expired(struct rtable *rth)
707{
708 return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
709}
710
711/*
712 * Perform a full scan of the hash table and free all entries.
713 * Can be called by a softirq or a process.
714 * In the latter case, we want to reschedule if necessary.
715 */
716static void rt_do_flush(int process_context)
717{
718 unsigned int i;
719 struct rtable *rth, *next;
720 struct rtable * tail;
721
722 for (i = 0; i <= rt_hash_mask; i++) {
723 if (process_context && need_resched())
724 cond_resched();
725 rth = rt_hash_table[i].chain;
726 if (!rth)
727 continue;
728
729 spin_lock_bh(rt_hash_lock_addr(i));
730#ifdef CONFIG_NET_NS
731 {
732 struct rtable ** prev, * p;
733
734 rth = rt_hash_table[i].chain;
735
736 /* defer releasing the head of the list until after spin_unlock */
737 for (tail = rth; tail; tail = tail->u.dst.rt_next)
738 if (!rt_is_expired(tail))
739 break;
740 if (rth != tail)
741 rt_hash_table[i].chain = tail;
742
743 /* call rt_free on entries after the tail requiring flush */
744 prev = &rt_hash_table[i].chain;
745 for (p = *prev; p; p = next) {
746 next = p->u.dst.rt_next;
747 if (!rt_is_expired(p)) {
748 prev = &p->u.dst.rt_next;
749 } else {
750 *prev = next;
751 rt_free(p);
752 }
753 }
754 }
755#else
756 rth = rt_hash_table[i].chain;
757 rt_hash_table[i].chain = NULL;
758 tail = NULL;
759#endif
760 spin_unlock_bh(rt_hash_lock_addr(i));
761
762 for (; rth != tail; rth = next) {
763 next = rth->u.dst.rt_next;
764 rt_free(rth);
765 }
766 }
767}
768
769/*
770 * While freeing expired entries, we compute average chain length
771 * and standard deviation, using fixed-point arithmetic.
772 * This is to obtain an estimate of rt_chain_length_max:
773 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
774 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
775 */
776
777#define FRACT_BITS 3
778#define ONE (1UL << FRACT_BITS)
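/*
 * Worked example of the fixed-point bookkeeping (values chosen purely for
 * illustration): with FRACT_BITS = 3, each distinct entry counted by
 * has_noalias() contributes ONE = 8, so a measured average chain length of
 * 2.5 entries shows up as avg = 20, and a standard deviation of one entry
 * as sd = 8.  Then
 *
 *	rt_chain_length_max = max(ip_rt_gc_elasticity,
 *				  (avg + 4 * sd) >> FRACT_BITS)
 *			    = max(8, (20 + 32) >> 3) = max(8, 6) = 8
 *
 * i.e. with short, well-behaved chains the limit simply stays at the
 * elasticity floor; it only grows once avg + 4*sd exceeds it.
 */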
779
780/*
781 * Given a hash chain and an item in this hash chain,
782 * find whether a previous entry has the same hash inputs
783 * (but differs on tos, mark or oif).
784 * Returns 0 if an alias is found.
785 * Returns ONE if rth has no alias before itself.
786 */
787static int has_noalias(const struct rtable *head, const struct rtable *rth)
788{
789 const struct rtable *aux = head;
790
791 while (aux != rth) {
792 if (compare_hash_inputs(&aux->fl, &rth->fl))
793 return 0;
794 aux = aux->u.dst.rt_next;
795 }
796 return ONE;
797}
798
799static void rt_check_expire(void)
800{
801 static unsigned int rover;
802 unsigned int i = rover, goal;
803 struct rtable *rth, **rthp;
804 unsigned long samples = 0;
805 unsigned long sum = 0, sum2 = 0;
806 unsigned long delta;
807 u64 mult;
808
809 delta = jiffies - expires_ljiffies;
810 expires_ljiffies = jiffies;
811 mult = ((u64)delta) << rt_hash_log;
812 if (ip_rt_gc_timeout > 1)
813 do_div(mult, ip_rt_gc_timeout);
814 goal = (unsigned int)mult;
815 if (goal > rt_hash_mask)
816 goal = rt_hash_mask + 1;
817 for (; goal > 0; goal--) {
818 unsigned long tmo = ip_rt_gc_timeout;
819 unsigned long length;
820
821 i = (i + 1) & rt_hash_mask;
822 rthp = &rt_hash_table[i].chain;
823
824 if (need_resched())
825 cond_resched();
826
827 samples++;
828
829 if (*rthp == NULL)
830 continue;
831 length = 0;
832 spin_lock_bh(rt_hash_lock_addr(i));
833 while ((rth = *rthp) != NULL) {
834 prefetch(rth->u.dst.rt_next);
835 if (rt_is_expired(rth)) {
836 *rthp = rth->u.dst.rt_next;
837 rt_free(rth);
838 continue;
839 }
840 if (rth->u.dst.expires) {
841 /* Entry is expired even if it is in use */
842 if (time_before_eq(jiffies, rth->u.dst.expires)) {
843nofree:
844 tmo >>= 1;
845 rthp = &rth->u.dst.rt_next;
846 /*
847 * We only count entries on
848 * a chain with equal hash inputs once,
849 * so that entries for different QoS
850 * levels and other non-hash-input
851 * attributes don't unfairly skew
852 * the length computation.
853 */
854 length += has_noalias(rt_hash_table[i].chain, rth);
855 continue;
856 }
857 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
858 goto nofree;
859
860 /* Cleanup aged off entries. */
861 *rthp = rth->u.dst.rt_next;
862 rt_free(rth);
863 }
864 spin_unlock_bh(rt_hash_lock_addr(i));
865 sum += length;
866 sum2 += length*length;
867 }
868 if (samples) {
869 unsigned long avg = sum / samples;
870 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
871 rt_chain_length_max = max_t(unsigned long,
872 ip_rt_gc_elasticity,
873 (avg + 4*sd) >> FRACT_BITS);
874 }
875 rover = i;
876}
877
878/*
879 * rt_worker_func() is run in process context.
880 * We call rt_check_expire() to scan part of the hash table.
881 */
882static void rt_worker_func(struct work_struct *work)
883{
884 rt_check_expire();
885 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
886}
887
888/*
889 * Perturbation of rt_genid by a small quantity [1..256].
890 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
891 * many times (2^24) without handing out a recently used rt_genid.
892 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
893 */
894static void rt_cache_invalidate(struct net *net)
895{
896 unsigned char shuffle;
897
898 get_random_bytes(&shuffle, sizeof(shuffle));
899 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
900}
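/*
 * Rough sketch of how an invalidation propagates (all helpers below are
 * defined in this file): bumping rt_genid never walks the hash table;
 * stale entries are detected by rt_is_expired() and reaped lazily.
 *
 *	rt_cache_invalidate(net);	// rt_genid += random [1..256]
 *	...
 *	if (rt_is_expired(rth))		// rth->rt_genid != rt_genid(net)
 *		rt_free(rth);		// done by rt_check_expire(),
 *					// rt_do_flush() or rt_intern_hash()
 */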
901
902/*
903 * delay < 0 : invalidate cache (fast : entries will be deleted later)
904 * delay >= 0 : invalidate & flush cache (can be long)
905 */
906void rt_cache_flush(struct net *net, int delay)
907{
908 rt_cache_invalidate(net);
909 if (delay >= 0)
910 rt_do_flush(!in_softirq());
911}
912
913/* Flush previous cache invalidated entries from the cache */
914void rt_cache_flush_batch(void)
915{
916 rt_do_flush(!in_softirq());
917}
918
919static void rt_emergency_hash_rebuild(struct net *net)
920{
921 if (net_ratelimit())
922 printk(KERN_WARNING "Route hash chain too long!\n");
923 rt_cache_invalidate(net);
924}
925
926/*
927 Short description of GC goals.
928
929 We want to build an algorithm which keeps the routing cache
930 at some equilibrium point, where the number of aged-off entries
931 is kept approximately equal to the number of newly generated ones.
932
933 The current expiration strength is the variable "expire".
934 We try to adjust it dynamically, so that when the network
935 is idle, expire is large enough to keep enough warm entries,
936 and when load increases it shrinks to limit the cache size.
937 */
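/*
 * Worked example of the goal computation below (the hash table size is
 * chosen at boot, so these numbers are purely illustrative): assuming
 * rt_hash_log = 17 (128K buckets) and the default ip_rt_gc_elasticity of 8,
 * the comfortable ceiling is 8 << 17 = 1048576 cached routes.  With, say,
 * 1050000 entries we get goal = 1050000 - 1048576 = 1424 entries to evict
 * in this pass; below the ceiling, goal goes negative and the code instead
 * trims towards the equilibrium target (at least ipv4_dst_ops.gc_thresh).
 */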
938
939static int rt_garbage_collect(struct dst_ops *ops)
940{
941 static unsigned long expire = RT_GC_TIMEOUT;
942 static unsigned long last_gc;
943 static int rover;
944 static int equilibrium;
945 struct rtable *rth, **rthp;
946 unsigned long now = jiffies;
947 int goal;
948
949 /*
950 * Garbage collection is pretty expensive,
951 * do not make it too frequently.
952 */
953
954 RT_CACHE_STAT_INC(gc_total);
955
956 if (now - last_gc < ip_rt_gc_min_interval &&
957 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
958 RT_CACHE_STAT_INC(gc_ignored);
959 goto out;
960 }
961
962 /* Calculate number of entries, which we want to expire now. */
963 goal = atomic_read(&ipv4_dst_ops.entries) -
964 (ip_rt_gc_elasticity << rt_hash_log);
965 if (goal <= 0) {
966 if (equilibrium < ipv4_dst_ops.gc_thresh)
967 equilibrium = ipv4_dst_ops.gc_thresh;
968 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
969 if (goal > 0) {
970 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
971 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
972 }
973 } else {
974 /* We are in a dangerous area. Try to reduce the cache really
975 * aggressively.
976 */
977 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
978 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
979 }
980
981 if (now - last_gc >= ip_rt_gc_min_interval)
982 last_gc = now;
983
984 if (goal <= 0) {
985 equilibrium += goal;
986 goto work_done;
987 }
988
989 do {
990 int i, k;
991
992 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
993 unsigned long tmo = expire;
994
995 k = (k + 1) & rt_hash_mask;
996 rthp = &rt_hash_table[k].chain;
997 spin_lock_bh(rt_hash_lock_addr(k));
998 while ((rth = *rthp) != NULL) {
999 if (!rt_is_expired(rth) &&
1000 !rt_may_expire(rth, tmo, expire)) {
1001 tmo >>= 1;
1002 rthp = &rth->u.dst.rt_next;
1003 continue;
1004 }
1005 *rthp = rth->u.dst.rt_next;
1006 rt_free(rth);
1007 goal--;
1008 }
1009 spin_unlock_bh(rt_hash_lock_addr(k));
1010 if (goal <= 0)
1011 break;
1012 }
1013 rover = k;
1014
1015 if (goal <= 0)
1016 goto work_done;
1017
1018 /* The goal was not achieved. We stop the process if:
1019
1020 - expire has been reduced to zero (otherwise expire is halved),
1021 - the table is not full,
1022 - we are called from interrupt context,
1023 - the jiffies check (just a fallback/debug loop breaker) fires.
1024 We will not spin here for a long time in any case.
1025 */
1026
1027 RT_CACHE_STAT_INC(gc_goal_miss);
1028
1029 if (expire == 0)
1030 break;
1031
1032 expire >>= 1;
1033#if RT_CACHE_DEBUG >= 2
1034 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1035 atomic_read(&ipv4_dst_ops.entries), goal, i);
1036#endif
1037
1038 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1039 goto out;
1040 } while (!in_softirq() && time_before_eq(jiffies, now));
1041
1042 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1043 goto out;
1044 if (net_ratelimit())
1045 printk(KERN_WARNING "dst cache overflow\n");
1046 RT_CACHE_STAT_INC(gc_dst_overflow);
1047 return 1;
1048
1049work_done:
1050 expire += ip_rt_gc_min_interval;
1051 if (expire > ip_rt_gc_timeout ||
1052 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1053 expire = ip_rt_gc_timeout;
1054#if RT_CACHE_DEBUG >= 2
1055 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1056 atomic_read(&ipv4_dst_ops.entries), goal, rover);
1057#endif
1058out: return 0;
1059}
1060
1061/*
1062 * Returns the number of entries in a hash chain that have distinct hash inputs.
1063 */
1064static int slow_chain_length(const struct rtable *head)
1065{
1066 int length = 0;
1067 const struct rtable *rth = head;
1068
1069 while (rth) {
1070 length += has_noalias(head, rth);
1071 rth = rth->u.dst.rt_next;
1072 }
1073 return length >> FRACT_BITS;
1074}
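/*
 * Example: in a chain of five entries where three share the same
 * (daddr, saddr, iif) hash inputs (say, one flow cached under three
 * different TOS values), has_noalias() returns ONE only for the first
 * entry of each group, so the sum is 3 * ONE and slow_chain_length()
 * reports 3.
 */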
1075
1076static int rt_intern_hash(unsigned hash, struct rtable *rt,
1077 struct rtable **rp, struct sk_buff *skb, int ifindex)
1078{
1079 struct rtable *rth, **rthp;
1080 unsigned long now;
1081 struct rtable *cand, **candp;
1082 u32 min_score;
1083 int chain_length;
1084 int attempts = !in_softirq();
1085
1086restart:
1087 chain_length = 0;
1088 min_score = ~(u32)0;
1089 cand = NULL;
1090 candp = NULL;
1091 now = jiffies;
1092
1093 if (!rt_caching(dev_net(rt->u.dst.dev))) {
1094 /*
1095 * If we're not caching, just tell the caller we
1096 * were successful and don't touch the route. The
1097 * caller holds the sole reference to the cache entry, and
1098 * it will be released when the caller is done with it.
1099 * If we drop it here, the callers have no way to resolve routes
1100 * when we're not caching. Instead, just point *rp at rt, so
1101 * the caller gets a single use out of the route.
1102 * Note that we do rt_free on this new route entry, so that
1103 * once its refcount hits zero, we are still able to reap it
1104 * (thanks Alexey).
1105 * Note also that rt_free uses call_rcu. We don't actually
1106 * need RCU protection here; this is just our path to get
1107 * on the route gc list.
1108 */
1109
1110 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1111 int err = arp_bind_neighbour(&rt->u.dst);
1112 if (err) {
1113 if (net_ratelimit())
1114 printk(KERN_WARNING
1115 "Neighbour table failure & not caching routes.\n");
1116 rt_drop(rt);
1117 return err;
1118 }
1119 }
1120
1121 rt_free(rt);
1122 goto skip_hashing;
1123 }
1124
1125 rthp = &rt_hash_table[hash].chain;
1126
1127 spin_lock_bh(rt_hash_lock_addr(hash));
1128 while ((rth = *rthp) != NULL) {
1129 if (rt_is_expired(rth)) {
1130 *rthp = rth->u.dst.rt_next;
1131 rt_free(rth);
1132 continue;
1133 }
1134 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1135 /* Put it first */
1136 *rthp = rth->u.dst.rt_next;
1137 /*
1138 * Since lookup is lockfree, the deletion
1139 * must be visible to another weakly ordered CPU before
1140 * the insertion at the start of the hash chain.
1141 */
1142 rcu_assign_pointer(rth->u.dst.rt_next,
1143 rt_hash_table[hash].chain);
1144 /*
1145 * Since lookup is lockfree, the update writes
1146 * must be ordered for consistency on SMP.
1147 */
1148 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1149
1150 dst_use(&rth->u.dst, now);
1151 spin_unlock_bh(rt_hash_lock_addr(hash));
1152
1153 rt_drop(rt);
1154 if (rp)
1155 *rp = rth;
1156 else
1157 skb_dst_set(skb, &rth->u.dst);
1158 return 0;
1159 }
1160
1161 if (!atomic_read(&rth->u.dst.__refcnt)) {
1162 u32 score = rt_score(rth);
1163
1164 if (score <= min_score) {
1165 cand = rth;
1166 candp = rthp;
1167 min_score = score;
1168 }
1169 }
1170
1171 chain_length++;
1172
1173 rthp = &rth->u.dst.rt_next;
1174 }
1175
1176 if (cand) {
1177 /* ip_rt_gc_elasticity used to be the average chain
1178 * length; when exceeded, gc becomes really aggressive.
1179 *
1180 * The second limit is less certain. At the moment it allows
1181 * only 2 entries per bucket. We will see.
1182 */
1183 if (chain_length > ip_rt_gc_elasticity) {
1184 *candp = cand->u.dst.rt_next;
1185 rt_free(cand);
1186 }
1187 } else {
1188 if (chain_length > rt_chain_length_max &&
1189 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1190 struct net *net = dev_net(rt->u.dst.dev);
1191 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1192 if (!rt_caching(net)) {
1193 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1194 rt->u.dst.dev->name, num);
1195 }
1196 rt_emergency_hash_rebuild(net);
1197 spin_unlock_bh(rt_hash_lock_addr(hash));
1198
1199 hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1200 ifindex, rt_genid(net));
1201 goto restart;
1202 }
1203 }
1204
1205 /* Try to bind the route to ARP only if it is an output
1206 route or on the unicast forwarding path.
1207 */
1208 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1209 int err = arp_bind_neighbour(&rt->u.dst);
1210 if (err) {
1211 spin_unlock_bh(rt_hash_lock_addr(hash));
1212
1213 if (err != -ENOBUFS) {
1214 rt_drop(rt);
1215 return err;
1216 }
1217
1218 /* Neighbour tables are full and nothing
1219 can be released. Try to shrink the route cache;
1220 it most likely holds some neighbour records.
1221 */
1222 if (attempts-- > 0) {
1223 int saved_elasticity = ip_rt_gc_elasticity;
1224 int saved_int = ip_rt_gc_min_interval;
1225 ip_rt_gc_elasticity = 1;
1226 ip_rt_gc_min_interval = 0;
1227 rt_garbage_collect(&ipv4_dst_ops);
1228 ip_rt_gc_min_interval = saved_int;
1229 ip_rt_gc_elasticity = saved_elasticity;
1230 goto restart;
1231 }
1232
1233 if (net_ratelimit())
1234 printk(KERN_WARNING "Neighbour table overflow.\n");
1235 rt_drop(rt);
1236 return -ENOBUFS;
1237 }
1238 }
1239
1240 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1241
1242#if RT_CACHE_DEBUG >= 2
1243 if (rt->u.dst.rt_next) {
1244 struct rtable *trt;
1245 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1246 hash, &rt->rt_dst);
1247 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1248 printk(" . %pI4", &trt->rt_dst);
1249 printk("\n");
1250 }
1251#endif
1252 /*
1253 * Since lookup is lockfree, we must make sure
1254 * previous writes to rt are committed to memory
1255 * before making rt visible to other CPUs.
1256 */
1257 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1258
1259 spin_unlock_bh(rt_hash_lock_addr(hash));
1260
1261skip_hashing:
1262 if (rp)
1263 *rp = rt;
1264 else
1265 skb_dst_set(skb, &rt->u.dst);
1266 return 0;
1267}
1268
1269void rt_bind_peer(struct rtable *rt, int create)
1270{
1271 static DEFINE_SPINLOCK(rt_peer_lock);
1272 struct inet_peer *peer;
1273
1274 peer = inet_getpeer(rt->rt_dst, create);
1275
1276 spin_lock_bh(&rt_peer_lock);
1277 if (rt->peer == NULL) {
1278 rt->peer = peer;
1279 peer = NULL;
1280 }
1281 spin_unlock_bh(&rt_peer_lock);
1282 if (peer)
1283 inet_putpeer(peer);
1284}
1285
1286/*
1287 * Peer allocation may fail only in serious out-of-memory conditions. However,
1288 * we can still generate some output.
1289 * Random ID selection looks a bit dangerous because we have no chance of
1290 * selecting an ID that is unique within a reasonable period of time.
1291 * But a broken packet identifier may be better than no packet at all.
1292 */
1293static void ip_select_fb_ident(struct iphdr *iph)
1294{
1295 static DEFINE_SPINLOCK(ip_fb_id_lock);
1296 static u32 ip_fallback_id;
1297 u32 salt;
1298
1299 spin_lock_bh(&ip_fb_id_lock);
1300 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1301 iph->id = htons(salt & 0xFFFF);
1302 ip_fallback_id = salt;
1303 spin_unlock_bh(&ip_fb_id_lock);
1304}
1305
1306void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1307{
1308 struct rtable *rt = (struct rtable *) dst;
1309
1310 if (rt) {
1311 if (rt->peer == NULL)
1312 rt_bind_peer(rt, 1);
1313
1314 /* If peer is attached to destination, it is never detached,
1315 so we need not grab a lock to dereference it.
1316 */
1317 if (rt->peer) {
1318 iph->id = htons(inet_getid(rt->peer, more));
1319 return;
1320 }
1321 } else
1322 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1323 __builtin_return_address(0));
1324
1325 ip_select_fb_ident(iph);
1326}
1327
1328static void rt_del(unsigned hash, struct rtable *rt)
1329{
1330 struct rtable **rthp, *aux;
1331
1332 rthp = &rt_hash_table[hash].chain;
1333 spin_lock_bh(rt_hash_lock_addr(hash));
1334 ip_rt_put(rt);
1335 while ((aux = *rthp) != NULL) {
1336 if (aux == rt || rt_is_expired(aux)) {
1337 *rthp = aux->u.dst.rt_next;
1338 rt_free(aux);
1339 continue;
1340 }
1341 rthp = &aux->u.dst.rt_next;
1342 }
1343 spin_unlock_bh(rt_hash_lock_addr(hash));
1344}
1345
1346void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1347 __be32 saddr, struct net_device *dev)
1348{
1349 int i, k;
1350 struct in_device *in_dev = in_dev_get(dev);
1351 struct rtable *rth, **rthp;
1352 __be32 skeys[2] = { saddr, 0 };
1353 int ikeys[2] = { dev->ifindex, 0 };
1354 struct netevent_redirect netevent;
1355 struct net *net;
1356
1357 if (!in_dev)
1358 return;
1359
1360 net = dev_net(dev);
1361 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1362 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1363 ipv4_is_zeronet(new_gw))
1364 goto reject_redirect;
1365
1366 if (!rt_caching(net))
1367 goto reject_redirect;
1368
1369 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1370 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1371 goto reject_redirect;
1372 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1373 goto reject_redirect;
1374 } else {
1375 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1376 goto reject_redirect;
1377 }
1378
1379 for (i = 0; i < 2; i++) {
1380 for (k = 0; k < 2; k++) {
1381 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1382 rt_genid(net));
1383
1384 rthp=&rt_hash_table[hash].chain;
1385
1386 rcu_read_lock();
1387 while ((rth = rcu_dereference(*rthp)) != NULL) {
1388 struct rtable *rt;
1389
1390 if (rth->fl.fl4_dst != daddr ||
1391 rth->fl.fl4_src != skeys[i] ||
1392 rth->fl.oif != ikeys[k] ||
1393 rth->fl.iif != 0 ||
1394 rt_is_expired(rth) ||
1395 !net_eq(dev_net(rth->u.dst.dev), net)) {
1396 rthp = &rth->u.dst.rt_next;
1397 continue;
1398 }
1399
1400 if (rth->rt_dst != daddr ||
1401 rth->rt_src != saddr ||
1402 rth->u.dst.error ||
1403 rth->rt_gateway != old_gw ||
1404 rth->u.dst.dev != dev)
1405 break;
1406
1407 dst_hold(&rth->u.dst);
1408 rcu_read_unlock();
1409
1410 rt = dst_alloc(&ipv4_dst_ops);
1411 if (rt == NULL) {
1412 ip_rt_put(rth);
1413 in_dev_put(in_dev);
1414 return;
1415 }
1416
1417 /* Copy all the information. */
1418 *rt = *rth;
1419 rt->u.dst.__use = 1;
1420 atomic_set(&rt->u.dst.__refcnt, 1);
1421 rt->u.dst.child = NULL;
1422 if (rt->u.dst.dev)
1423 dev_hold(rt->u.dst.dev);
1424 if (rt->idev)
1425 in_dev_hold(rt->idev);
1426 rt->u.dst.obsolete = -1;
1427 rt->u.dst.lastuse = jiffies;
1428 rt->u.dst.path = &rt->u.dst;
1429 rt->u.dst.neighbour = NULL;
1430 rt->u.dst.hh = NULL;
1431#ifdef CONFIG_XFRM
1432 rt->u.dst.xfrm = NULL;
1433#endif
1434 rt->rt_genid = rt_genid(net);
1435 rt->rt_flags |= RTCF_REDIRECTED;
1436
1437 /* Gateway is different ... */
1438 rt->rt_gateway = new_gw;
1439
1440 /* Redirect received -> path was valid */
1441 dst_confirm(&rth->u.dst);
1442
1443 if (rt->peer)
1444 atomic_inc(&rt->peer->refcnt);
1445
1446 if (arp_bind_neighbour(&rt->u.dst) ||
1447 !(rt->u.dst.neighbour->nud_state &
1448 NUD_VALID)) {
1449 if (rt->u.dst.neighbour)
1450 neigh_event_send(rt->u.dst.neighbour, NULL);
1451 ip_rt_put(rth);
1452 rt_drop(rt);
1453 goto do_next;
1454 }
1455
1456 netevent.old = &rth->u.dst;
1457 netevent.new = &rt->u.dst;
1458 call_netevent_notifiers(NETEVENT_REDIRECT,
1459 &netevent);
1460
1461 rt_del(hash, rth);
1462 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1463 ip_rt_put(rt);
1464 goto do_next;
1465 }
1466 rcu_read_unlock();
1467 do_next:
1468 ;
1469 }
1470 }
1471 in_dev_put(in_dev);
1472 return;
1473
1474reject_redirect:
1475#ifdef CONFIG_IP_ROUTE_VERBOSE
1476 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1477 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1478 " Advised path = %pI4 -> %pI4\n",
1479 &old_gw, dev->name, &new_gw,
1480 &saddr, &daddr);
1481#endif
1482 in_dev_put(in_dev);
1483}
1484
1485static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1486{
1487 struct rtable *rt = (struct rtable *)dst;
1488 struct dst_entry *ret = dst;
1489
1490 if (rt) {
1491 if (dst->obsolete > 0) {
1492 ip_rt_put(rt);
1493 ret = NULL;
1494 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1495 (rt->u.dst.expires &&
1496 time_after_eq(jiffies, rt->u.dst.expires))) {
1497 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1498 rt->fl.oif,
1499 rt_genid(dev_net(dst->dev)));
1500#if RT_CACHE_DEBUG >= 1
1501 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1502 &rt->rt_dst, rt->fl.fl4_tos);
1503#endif
1504 rt_del(hash, rt);
1505 ret = NULL;
1506 }
1507 }
1508 return ret;
1509}
1510
1511/*
1512 * Algorithm:
1513 * 1. The first ip_rt_redirect_number redirects are sent
1514 * with exponential backoff, then we stop sending them at all,
1515 * assuming that the host ignores our redirects.
1516 * 2. If we did not see packets requiring redirects
1517 * during ip_rt_redirect_silence, we assume that the host
1518 * forgot the redirected route and start sending redirects again.
1519 *
1520 * This algorithm is much cheaper and more intelligent than dumb load limiting
1521 * in icmp.c.
1522 *
1523 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1524 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1525 */
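/*
 * With the default tunables (ip_rt_redirect_load = HZ/50, i.e. about 20ms,
 * ip_rt_redirect_number = 9, ip_rt_redirect_silence about 20s) the code
 * below works out roughly as follows: after a redirect is sent with
 * rate_tokens = k, the next one is allowed only once jiffies passes
 * rate_last + (ip_rt_redirect_load << k), so the gaps double from ~40ms
 * up to ~5s; once 9 redirects have apparently been ignored we stop sending
 * entirely, and rate_tokens is reset only after ip_rt_redirect_silence
 * (~20s) without packets requiring redirects.
 */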
1526
1527void ip_rt_send_redirect(struct sk_buff *skb)
1528{
1529 struct rtable *rt = skb_rtable(skb);
1530 struct in_device *in_dev;
1531 int log_martians;
1532
1533 rcu_read_lock();
1534 in_dev = __in_dev_get_rcu(rt->u.dst.dev);
1535 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1536 rcu_read_unlock();
1537 return;
1538 }
1539 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1540 rcu_read_unlock();
1541
1542 /* No redirected packets during ip_rt_redirect_silence;
1543 * reset the algorithm.
1544 */
1545 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1546 rt->u.dst.rate_tokens = 0;
1547
1548 /* Too many ignored redirects; do not send anything,
1549 * just set u.dst.rate_last to the last seen redirected packet.
1550 */
1551 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1552 rt->u.dst.rate_last = jiffies;
1553 return;
1554 }
1555
1556 /* Check for load limit; set rate_last to the latest sent
1557 * redirect.
1558 */
1559 if (rt->u.dst.rate_tokens == 0 ||
1560 time_after(jiffies,
1561 (rt->u.dst.rate_last +
1562 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1563 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1564 rt->u.dst.rate_last = jiffies;
1565 ++rt->u.dst.rate_tokens;
1566#ifdef CONFIG_IP_ROUTE_VERBOSE
1567 if (log_martians &&
1568 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1569 net_ratelimit())
1570 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1571 &rt->rt_src, rt->rt_iif,
1572 &rt->rt_dst, &rt->rt_gateway);
1573#endif
1574 }
1575}
1576
1577static int ip_error(struct sk_buff *skb)
1578{
1579 struct rtable *rt = skb_rtable(skb);
1580 unsigned long now;
1581 int code;
1582
1583 switch (rt->u.dst.error) {
1584 case EINVAL:
1585 default:
1586 goto out;
1587 case EHOSTUNREACH:
1588 code = ICMP_HOST_UNREACH;
1589 break;
1590 case ENETUNREACH:
1591 code = ICMP_NET_UNREACH;
1592 IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1593 IPSTATS_MIB_INNOROUTES);
1594 break;
1595 case EACCES:
1596 code = ICMP_PKT_FILTERED;
1597 break;
1598 }
1599
1600 now = jiffies;
1601 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1602 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1603 rt->u.dst.rate_tokens = ip_rt_error_burst;
1604 rt->u.dst.rate_last = now;
1605 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1606 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1607 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1608 }
1609
1610out: kfree_skb(skb);
1611 return 0;
1612}
1613
1614/*
1615 * The last two values are not from the RFC but
1616 * are needed for AMPRnet AX.25 paths.
1617 */
1618
1619static const unsigned short mtu_plateau[] =
1620{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1621
1622static inline unsigned short guess_mtu(unsigned short old_mtu)
1623{
1624 int i;
1625
1626 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1627 if (old_mtu > mtu_plateau[i])
1628 return mtu_plateau[i];
1629 return 68;
1630}
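/*
 * Example: when the reported next-hop MTU is unusable and we fall back on
 * this table, guess_mtu(1500) returns 1492 (the next plateau below the
 * original packet size), guess_mtu(296) returns 216, and anything at or
 * below 128 bottoms out at the 68-byte IPv4 minimum.
 */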
1631
1632unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1633 unsigned short new_mtu,
1634 struct net_device *dev)
1635{
1636 int i, k;
1637 unsigned short old_mtu = ntohs(iph->tot_len);
1638 struct rtable *rth;
1639 int ikeys[2] = { dev->ifindex, 0 };
1640 __be32 skeys[2] = { iph->saddr, 0, };
1641 __be32 daddr = iph->daddr;
1642 unsigned short est_mtu = 0;
1643
1644 for (k = 0; k < 2; k++) {
1645 for (i = 0; i < 2; i++) {
1646 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1647 rt_genid(net));
1648
1649 rcu_read_lock();
1650 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1651 rth = rcu_dereference(rth->u.dst.rt_next)) {
1652 unsigned short mtu = new_mtu;
1653
1654 if (rth->fl.fl4_dst != daddr ||
1655 rth->fl.fl4_src != skeys[i] ||
1656 rth->rt_dst != daddr ||
1657 rth->rt_src != iph->saddr ||
1658 rth->fl.oif != ikeys[k] ||
1659 rth->fl.iif != 0 ||
1660 dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1661 !net_eq(dev_net(rth->u.dst.dev), net) ||
1662 rt_is_expired(rth))
1663 continue;
1664
1665 if (new_mtu < 68 || new_mtu >= old_mtu) {
1666
1667 /* BSD 4.2 compatibility hack :-( */
1668 if (mtu == 0 &&
1669 old_mtu >= dst_mtu(&rth->u.dst) &&
1670 old_mtu >= 68 + (iph->ihl << 2))
1671 old_mtu -= iph->ihl << 2;
1672
1673 mtu = guess_mtu(old_mtu);
1674 }
1675 if (mtu <= dst_mtu(&rth->u.dst)) {
1676 if (mtu < dst_mtu(&rth->u.dst)) {
1677 dst_confirm(&rth->u.dst);
1678 if (mtu < ip_rt_min_pmtu) {
1679 mtu = ip_rt_min_pmtu;
1680 rth->u.dst.metrics[RTAX_LOCK-1] |=
1681 (1 << RTAX_MTU);
1682 }
1683 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1684 dst_set_expires(&rth->u.dst,
1685 ip_rt_mtu_expires);
1686 }
1687 est_mtu = mtu;
1688 }
1689 }
1690 rcu_read_unlock();
1691 }
1692 }
1693 return est_mtu ? : new_mtu;
1694}
1695
1696static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1697{
1698 if (dst_mtu(dst) > mtu && mtu >= 68 &&
1699 !(dst_metric_locked(dst, RTAX_MTU))) {
1700 if (mtu < ip_rt_min_pmtu) {
1701 mtu = ip_rt_min_pmtu;
1702 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1703 }
1704 dst->metrics[RTAX_MTU-1] = mtu;
1705 dst_set_expires(dst, ip_rt_mtu_expires);
1706 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1707 }
1708}
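/*
 * Example with the default ip_rt_min_pmtu of 512 + 20 + 20 = 552: if a
 * path reports an MTU of 400, the cached route's MTU metric is raised to
 * 552 and the metric is locked so later reductions are ignored; either
 * way the learned value is kept via dst_set_expires() for
 * ip_rt_mtu_expires (10 minutes by default).
 */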
1709
1710static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1711{
1712 if (rt_is_expired((struct rtable *)dst))
1713 return NULL;
1714 return dst;
1715}
1716
1717static void ipv4_dst_destroy(struct dst_entry *dst)
1718{
1719 struct rtable *rt = (struct rtable *) dst;
1720 struct inet_peer *peer = rt->peer;
1721 struct in_device *idev = rt->idev;
1722
1723 if (peer) {
1724 rt->peer = NULL;
1725 inet_putpeer(peer);
1726 }
1727
1728 if (idev) {
1729 rt->idev = NULL;
1730 in_dev_put(idev);
1731 }
1732}
1733
1734static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1735 int how)
1736{
1737 struct rtable *rt = (struct rtable *) dst;
1738 struct in_device *idev = rt->idev;
1739 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1740 struct in_device *loopback_idev =
1741 in_dev_get(dev_net(dev)->loopback_dev);
1742 if (loopback_idev) {
1743 rt->idev = loopback_idev;
1744 in_dev_put(idev);
1745 }
1746 }
1747}
1748
1749static void ipv4_link_failure(struct sk_buff *skb)
1750{
1751 struct rtable *rt;
1752
1753 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1754
1755 rt = skb_rtable(skb);
1756 if (rt)
1757 dst_set_expires(&rt->u.dst, 0);
1758}
1759
1760static int ip_rt_bug(struct sk_buff *skb)
1761{
1762 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1763 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1764 skb->dev ? skb->dev->name : "?");
1765 kfree_skb(skb);
1766 return 0;
1767}
1768
1769/*
1770 We do not cache the source address of the outgoing interface,
1771 because it is used only by the IP RR, TS and SRR options,
1772 so it is out of the fast path.
1773
1774 BTW remember: "addr" is allowed to be unaligned
1775 in IP options!
1776 */
1777
1778void ip_rt_get_source(u8 *addr, struct rtable *rt)
1779{
1780 __be32 src;
1781 struct fib_result res;
1782
1783 if (rt->fl.iif == 0)
1784 src = rt->rt_src;
1785 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1786 src = FIB_RES_PREFSRC(res);
1787 fib_res_put(&res);
1788 } else
1789 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1790 RT_SCOPE_UNIVERSE);
1791 memcpy(addr, &src, 4);
1792}
1793
1794#ifdef CONFIG_NET_CLS_ROUTE
1795static void set_class_tag(struct rtable *rt, u32 tag)
1796{
1797 if (!(rt->u.dst.tclassid & 0xFFFF))
1798 rt->u.dst.tclassid |= tag & 0xFFFF;
1799 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1800 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1801}
1802#endif
1803
1804static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1805{
1806 struct fib_info *fi = res->fi;
1807
1808 if (fi) {
1809 if (FIB_RES_GW(*res) &&
1810 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1811 rt->rt_gateway = FIB_RES_GW(*res);
1812 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1813 sizeof(rt->u.dst.metrics));
1814 if (fi->fib_mtu == 0) {
1815 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1816 if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1817 rt->rt_gateway != rt->rt_dst &&
1818 rt->u.dst.dev->mtu > 576)
1819 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1820 }
1821#ifdef CONFIG_NET_CLS_ROUTE
1822 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1823#endif
1824 } else
1825 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1826
1827 if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1828 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1829 if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1830 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1831 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1832 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1833 ip_rt_min_advmss);
1834 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1835 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1836
1837#ifdef CONFIG_NET_CLS_ROUTE
1838#ifdef CONFIG_IP_MULTIPLE_TABLES
1839 set_class_tag(rt, fib_rules_tclass(res));
1840#endif
1841 set_class_tag(rt, itag);
1842#endif
1843 rt->rt_type = res->type;
1844}
1845
1846/* called in rcu_read_lock() section */
1847static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1848 u8 tos, struct net_device *dev, int our)
1849{
1850 unsigned int hash;
1851 struct rtable *rth;
1852 __be32 spec_dst;
1853 struct in_device *in_dev = __in_dev_get_rcu(dev);
1854 u32 itag = 0;
1855 int err;
1856
1857 /* Primary sanity checks. */
1858
1859 if (in_dev == NULL)
1860 return -EINVAL;
1861
1862 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1863 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1864 goto e_inval;
1865
1866 if (ipv4_is_zeronet(saddr)) {
1867 if (!ipv4_is_local_multicast(daddr))
1868 goto e_inval;
1869 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1870 } else {
1871 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1872 &itag, 0);
1873 if (err < 0)
1874 goto e_err;
1875 }
1876 rth = dst_alloc(&ipv4_dst_ops);
1877 if (!rth)
1878 goto e_nobufs;
1879
1880 rth->u.dst.output = ip_rt_bug;
1881 rth->u.dst.obsolete = -1;
1882
1883 atomic_set(&rth->u.dst.__refcnt, 1);
1884 rth->u.dst.flags= DST_HOST;
1885 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1886 rth->u.dst.flags |= DST_NOPOLICY;
1887 rth->fl.fl4_dst = daddr;
1888 rth->rt_dst = daddr;
1889 rth->fl.fl4_tos = tos;
1890 rth->fl.mark = skb->mark;
1891 rth->fl.fl4_src = saddr;
1892 rth->rt_src = saddr;
1893#ifdef CONFIG_NET_CLS_ROUTE
1894 rth->u.dst.tclassid = itag;
1895#endif
1896 rth->rt_iif =
1897 rth->fl.iif = dev->ifindex;
1898 rth->u.dst.dev = init_net.loopback_dev;
1899 dev_hold(rth->u.dst.dev);
1900 rth->idev = in_dev_get(rth->u.dst.dev);
1901 rth->fl.oif = 0;
1902 rth->rt_gateway = daddr;
1903 rth->rt_spec_dst= spec_dst;
1904 rth->rt_genid = rt_genid(dev_net(dev));
1905 rth->rt_flags = RTCF_MULTICAST;
1906 rth->rt_type = RTN_MULTICAST;
1907 if (our) {
1908 rth->u.dst.input= ip_local_deliver;
1909 rth->rt_flags |= RTCF_LOCAL;
1910 }
1911
1912#ifdef CONFIG_IP_MROUTE
1913 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1914 rth->u.dst.input = ip_mr_input;
1915#endif
1916 RT_CACHE_STAT_INC(in_slow_mc);
1917
1918 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1919 return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1920
1921e_nobufs:
1922 return -ENOBUFS;
1923e_inval:
1924 return -EINVAL;
1925e_err:
1926 return err;
1927}
1928
1929
1930static void ip_handle_martian_source(struct net_device *dev,
1931 struct in_device *in_dev,
1932 struct sk_buff *skb,
1933 __be32 daddr,
1934 __be32 saddr)
1935{
1936 RT_CACHE_STAT_INC(in_martian_src);
1937#ifdef CONFIG_IP_ROUTE_VERBOSE
1938 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1939 /*
1940 * RFC 1812 recommendation: if the source is martian,
1941 * the only hint is the MAC header.
1942 */
1943 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1944 &daddr, &saddr, dev->name);
1945 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1946 int i;
1947 const unsigned char *p = skb_mac_header(skb);
1948 printk(KERN_WARNING "ll header: ");
1949 for (i = 0; i < dev->hard_header_len; i++, p++) {
1950 printk("%02x", *p);
1951 if (i < (dev->hard_header_len - 1))
1952 printk(":");
1953 }
1954 printk("\n");
1955 }
1956 }
1957#endif
1958}
1959
1960/* called in rcu_read_lock() section */
1961static int __mkroute_input(struct sk_buff *skb,
1962 struct fib_result *res,
1963 struct in_device *in_dev,
1964 __be32 daddr, __be32 saddr, u32 tos,
1965 struct rtable **result)
1966{
1967 struct rtable *rth;
1968 int err;
1969 struct in_device *out_dev;
1970 unsigned int flags = 0;
1971 __be32 spec_dst;
1972 u32 itag;
1973
1974 /* get a working reference to the output device */
1975 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1976 if (out_dev == NULL) {
1977 if (net_ratelimit())
1978 printk(KERN_CRIT "Bug in ip_route_input" \
1979 "_slow(). Please, report\n");
1980 return -EINVAL;
1981 }
1982
1983
1984 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1985 in_dev->dev, &spec_dst, &itag, skb->mark);
1986 if (err < 0) {
1987 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1988 saddr);
1989
1990 goto cleanup;
1991 }
1992
1993 if (err)
1994 flags |= RTCF_DIRECTSRC;
1995
1996 if (out_dev == in_dev && err &&
1997 (IN_DEV_SHARED_MEDIA(out_dev) ||
1998 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1999 flags |= RTCF_DOREDIRECT;
2000
2001 if (skb->protocol != htons(ETH_P_IP)) {
2002 /* Not IP (i.e. ARP). Do not create a route if it is
2003 * invalid for proxy ARP. DNAT routes are always valid.
2004 *
2005 * The proxy ARP feature has been extended to allow ARP
2006 * replies back on the same interface, to support
2007 * private VLAN switch technologies. See arp.c.
2008 */
2009 if (out_dev == in_dev &&
2010 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2011 err = -EINVAL;
2012 goto cleanup;
2013 }
2014 }
2015
2016
2017 rth = dst_alloc(&ipv4_dst_ops);
2018 if (!rth) {
2019 err = -ENOBUFS;
2020 goto cleanup;
2021 }
2022
2023 atomic_set(&rth->u.dst.__refcnt, 1);
2024 rth->u.dst.flags= DST_HOST;
2025 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2026 rth->u.dst.flags |= DST_NOPOLICY;
2027 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2028 rth->u.dst.flags |= DST_NOXFRM;
2029 rth->fl.fl4_dst = daddr;
2030 rth->rt_dst = daddr;
2031 rth->fl.fl4_tos = tos;
2032 rth->fl.mark = skb->mark;
2033 rth->fl.fl4_src = saddr;
2034 rth->rt_src = saddr;
2035 rth->rt_gateway = daddr;
2036 rth->rt_iif =
2037 rth->fl.iif = in_dev->dev->ifindex;
2038 rth->u.dst.dev = (out_dev)->dev;
2039 dev_hold(rth->u.dst.dev);
2040 rth->idev = in_dev_get(rth->u.dst.dev);
2041 rth->fl.oif = 0;
2042 rth->rt_spec_dst= spec_dst;
2043
2044 rth->u.dst.obsolete = -1;
2045 rth->u.dst.input = ip_forward;
2046 rth->u.dst.output = ip_output;
2047 rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
2048
2049 rt_set_nexthop(rth, res, itag);
2050
2051 rth->rt_flags = flags;
2052
2053 *result = rth;
2054 err = 0;
2055 cleanup:
2056 return err;
2057}
2058
2059static int ip_mkroute_input(struct sk_buff *skb,
2060 struct fib_result *res,
2061 const struct flowi *fl,
2062 struct in_device *in_dev,
2063 __be32 daddr, __be32 saddr, u32 tos)
2064{
2065 struct rtable* rth = NULL;
2066 int err;
2067 unsigned hash;
2068
2069#ifdef CONFIG_IP_ROUTE_MULTIPATH
2070 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2071 fib_select_multipath(fl, res);
2072#endif
2073
2074 /* create a routing cache entry */
2075 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2076 if (err)
2077 return err;
2078
2079 /* put it into the cache */
2080 hash = rt_hash(daddr, saddr, fl->iif,
2081 rt_genid(dev_net(rth->u.dst.dev)));
2082 return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2083}
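/*
 * Sketch of how the entry created above is consumed, assuming the
 * generic dst hooks in include/net/dst.h: once rt_intern_hash() has
 * attached the rtable to skb_dst(skb), the receive path merely does
 *
 *	static inline int dst_input(struct sk_buff *skb)
 *	{
 *		return skb_dst(skb)->input(skb);
 *	}
 *
 * so a forwarded packet flows ip_rcv_finish() -> dst_input() ->
 * ip_forward() -> dst_output() -> ip_output(), using the function
 * pointers filled in by __mkroute_input().
 */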
2084
2085/*
2086 *	NOTE. We drop all packets that have local source
2087 *	addresses, because every properly looped-back packet
2088 *	must already have the correct destination attached by the output routine.
2089 *
2090 *	This approach solves two big problems:
2091 *	1. Non-simplex devices are handled properly.
2092 *	2. IP spoofing attempts are filtered with a 100% guarantee.
2093 */
2094
2095static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2096 u8 tos, struct net_device *dev)
2097{
2098 struct fib_result res;
2099 struct in_device *in_dev = __in_dev_get_rcu(dev);
2100 struct flowi fl = { .nl_u = { .ip4_u =
2101 { .daddr = daddr,
2102 .saddr = saddr,
2103 .tos = tos,
2104 .scope = RT_SCOPE_UNIVERSE,
2105 } },
2106 .mark = skb->mark,
2107 .iif = dev->ifindex };
2108 unsigned flags = 0;
2109 u32 itag = 0;
2110 struct rtable * rth;
2111 unsigned hash;
2112 __be32 spec_dst;
2113 int err = -EINVAL;
2114 int free_res = 0;
2115 struct net * net = dev_net(dev);
2116
2117 /* IP on this device is disabled. */
2118
2119 if (!in_dev)
2120 goto out;
2121
2122	/* Check for the weirdest martians, which cannot be detected
2123 by fib_lookup.
2124 */
2125
2126 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2127 ipv4_is_loopback(saddr))
2128 goto martian_source;
2129
2130 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2131 goto brd_input;
2132
2133	/* Accept zero addresses only for limited broadcast;
2134	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2135 */
2136 if (ipv4_is_zeronet(saddr))
2137 goto martian_source;
2138
2139 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2140 ipv4_is_loopback(daddr))
2141 goto martian_destination;
2142
2143 /*
2144 * Now we are ready to route packet.
2145 */
2146 if ((err = fib_lookup(net, &fl, &res)) != 0) {
2147 if (!IN_DEV_FORWARD(in_dev))
2148 goto e_hostunreach;
2149 goto no_route;
2150 }
2151 free_res = 1;
2152
2153 RT_CACHE_STAT_INC(in_slow_tot);
2154
2155 if (res.type == RTN_BROADCAST)
2156 goto brd_input;
2157
2158 if (res.type == RTN_LOCAL) {
2159 err = fib_validate_source(saddr, daddr, tos,
2160 net->loopback_dev->ifindex,
2161 dev, &spec_dst, &itag, skb->mark);
2162 if (err < 0)
2163 goto martian_source_keep_err;
2164 if (err)
2165 flags |= RTCF_DIRECTSRC;
2166 spec_dst = daddr;
2167 goto local_input;
2168 }
2169
2170 if (!IN_DEV_FORWARD(in_dev))
2171 goto e_hostunreach;
2172 if (res.type != RTN_UNICAST)
2173 goto martian_destination;
2174
2175 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2176done:
2177 if (free_res)
2178 fib_res_put(&res);
2179out: return err;
2180
2181brd_input:
2182 if (skb->protocol != htons(ETH_P_IP))
2183 goto e_inval;
2184
2185 if (ipv4_is_zeronet(saddr))
2186 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2187 else {
2188 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2189 &itag, skb->mark);
2190 if (err < 0)
2191 goto martian_source_keep_err;
2192 if (err)
2193 flags |= RTCF_DIRECTSRC;
2194 }
2195 flags |= RTCF_BROADCAST;
2196 res.type = RTN_BROADCAST;
2197 RT_CACHE_STAT_INC(in_brd);
2198
2199local_input:
2200 rth = dst_alloc(&ipv4_dst_ops);
2201 if (!rth)
2202 goto e_nobufs;
2203
2204 rth->u.dst.output= ip_rt_bug;
2205 rth->u.dst.obsolete = -1;
2206 rth->rt_genid = rt_genid(net);
2207
2208 atomic_set(&rth->u.dst.__refcnt, 1);
2209 rth->u.dst.flags= DST_HOST;
2210 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2211 rth->u.dst.flags |= DST_NOPOLICY;
2212 rth->fl.fl4_dst = daddr;
2213 rth->rt_dst = daddr;
2214 rth->fl.fl4_tos = tos;
2215 rth->fl.mark = skb->mark;
2216 rth->fl.fl4_src = saddr;
2217 rth->rt_src = saddr;
2218#ifdef CONFIG_NET_CLS_ROUTE
2219 rth->u.dst.tclassid = itag;
2220#endif
2221 rth->rt_iif =
2222 rth->fl.iif = dev->ifindex;
2223 rth->u.dst.dev = net->loopback_dev;
2224 dev_hold(rth->u.dst.dev);
2225 rth->idev = in_dev_get(rth->u.dst.dev);
2226 rth->rt_gateway = daddr;
2227 rth->rt_spec_dst= spec_dst;
2228 rth->u.dst.input= ip_local_deliver;
2229 rth->rt_flags = flags|RTCF_LOCAL;
2230 if (res.type == RTN_UNREACHABLE) {
2231 rth->u.dst.input= ip_error;
2232 rth->u.dst.error= -err;
2233 rth->rt_flags &= ~RTCF_LOCAL;
2234 }
2235 rth->rt_type = res.type;
2236 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2237 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2238 goto done;
2239
2240no_route:
2241 RT_CACHE_STAT_INC(in_no_route);
2242 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2243 res.type = RTN_UNREACHABLE;
2244 if (err == -ESRCH)
2245 err = -ENETUNREACH;
2246 goto local_input;
2247
2248 /*
2249 * Do not cache martian addresses: they should be logged (RFC1812)
2250 */
2251martian_destination:
2252 RT_CACHE_STAT_INC(in_martian_dst);
2253#ifdef CONFIG_IP_ROUTE_VERBOSE
2254 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2255 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2256 &daddr, &saddr, dev->name);
2257#endif
2258
2259e_hostunreach:
2260 err = -EHOSTUNREACH;
2261 goto done;
2262
2263e_inval:
2264 err = -EINVAL;
2265 goto done;
2266
2267e_nobufs:
2268 err = -ENOBUFS;
2269 goto done;
2270
2271martian_source:
2272 err = -EINVAL;
2273martian_source_keep_err:
2274 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2275 goto done;
2276}
2277
2278int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2279 u8 tos, struct net_device *dev, bool noref)
2280{
2281 struct rtable * rth;
2282 unsigned hash;
2283 int iif = dev->ifindex;
2284 struct net *net;
2285 int res;
2286
2287 net = dev_net(dev);
2288
2289 rcu_read_lock();
2290
2291 if (!rt_caching(net))
2292 goto skip_cache;
2293
2294 tos &= IPTOS_RT_MASK;
2295 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2296
2297 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2298 rth = rcu_dereference(rth->u.dst.rt_next)) {
2299 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2300 ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2301 (rth->fl.iif ^ iif) |
2302 rth->fl.oif |
2303 (rth->fl.fl4_tos ^ tos)) == 0 &&
2304 rth->fl.mark == skb->mark &&
2305 net_eq(dev_net(rth->u.dst.dev), net) &&
2306 !rt_is_expired(rth)) {
2307 if (noref) {
2308 dst_use_noref(&rth->u.dst, jiffies);
2309 skb_dst_set_noref(skb, &rth->u.dst);
2310 } else {
2311 dst_use(&rth->u.dst, jiffies);
2312 skb_dst_set(skb, &rth->u.dst);
2313 }
2314 RT_CACHE_STAT_INC(in_hit);
2315 rcu_read_unlock();
2316 return 0;
2317 }
2318 RT_CACHE_STAT_INC(in_hlist_search);
2319 }
2320
2321skip_cache:
2322	/* Multicast recognition logic was moved from the route cache to here.
2323	   The problem was that too many Ethernet cards have broken/missing
2324	   hardware multicast filters :-( As a result, a host on a multicast
2325	   network acquires a lot of useless route cache entries, e.g. for
2326	   SDR announcements from all over the world. Now we try to get rid of them.
2327	   Really, provided the software IP multicast filter is organized
2328	   reasonably (at least, hashed), it does not result in a slowdown
2329	   compared with route cache reject entries.
2330	   Note that multicast routers are not affected, because a
2331	   route cache entry is created for them eventually.
2332 */
2333 if (ipv4_is_multicast(daddr)) {
2334 struct in_device *in_dev = __in_dev_get_rcu(dev);
2335
2336 if (in_dev) {
2337 int our = ip_check_mc(in_dev, daddr, saddr,
2338 ip_hdr(skb)->protocol);
2339 if (our
2340#ifdef CONFIG_IP_MROUTE
2341 ||
2342 (!ipv4_is_local_multicast(daddr) &&
2343 IN_DEV_MFORWARD(in_dev))
2344#endif
2345 ) {
2346 int res = ip_route_input_mc(skb, daddr, saddr,
2347 tos, dev, our);
2348 rcu_read_unlock();
2349 return res;
2350 }
2351 }
2352 rcu_read_unlock();
2353 return -EINVAL;
2354 }
2355 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2356 rcu_read_unlock();
2357 return res;
2358}
2359EXPORT_SYMBOL(ip_route_input_common);
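/*
 * Minimal usage sketch (not built): this mirrors what ip_rcv_finish()
 * does, assuming the ip_route_input()/ip_route_input_noref() wrappers
 * in include/net/route.h, which call ip_route_input_common() with
 * noref set to false/true respectively.
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				       iph->tos, skb->dev);
 *	if (err)
 *		goto drop;		(e.g. -EINVAL for a martian source)
 *	return dst_input(skb);		(ip_local_deliver or ip_forward)
 */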
2360
2361static int __mkroute_output(struct rtable **result,
2362 struct fib_result *res,
2363 const struct flowi *fl,
2364 const struct flowi *oldflp,
2365 struct net_device *dev_out,
2366 unsigned flags)
2367{
2368 struct rtable *rth;
2369 struct in_device *in_dev;
2370 u32 tos = RT_FL_TOS(oldflp);
2371 int err = 0;
2372
2373 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2374 return -EINVAL;
2375
2376 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2377 res->type = RTN_BROADCAST;
2378 else if (ipv4_is_multicast(fl->fl4_dst))
2379 res->type = RTN_MULTICAST;
2380 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2381 return -EINVAL;
2382
2383 if (dev_out->flags & IFF_LOOPBACK)
2384 flags |= RTCF_LOCAL;
2385
2386 /* get work reference to inet device */
2387 in_dev = in_dev_get(dev_out);
2388 if (!in_dev)
2389 return -EINVAL;
2390
2391 if (res->type == RTN_BROADCAST) {
2392 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2393 if (res->fi) {
2394 fib_info_put(res->fi);
2395 res->fi = NULL;
2396 }
2397 } else if (res->type == RTN_MULTICAST) {
2398 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2399 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2400 oldflp->proto))
2401 flags &= ~RTCF_LOCAL;
2402		/* If a multicast route does not exist, use the
2403		   default one, but do not use a gateway in this case.
2404		   Yes, it is a hack.
2405 */
2406 if (res->fi && res->prefixlen < 4) {
2407 fib_info_put(res->fi);
2408 res->fi = NULL;
2409 }
2410 }
2411
2412
2413 rth = dst_alloc(&ipv4_dst_ops);
2414 if (!rth) {
2415 err = -ENOBUFS;
2416 goto cleanup;
2417 }
2418
2419 atomic_set(&rth->u.dst.__refcnt, 1);
2420 rth->u.dst.flags= DST_HOST;
2421 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2422 rth->u.dst.flags |= DST_NOXFRM;
2423 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2424 rth->u.dst.flags |= DST_NOPOLICY;
2425
2426 rth->fl.fl4_dst = oldflp->fl4_dst;
2427 rth->fl.fl4_tos = tos;
2428 rth->fl.fl4_src = oldflp->fl4_src;
2429 rth->fl.oif = oldflp->oif;
2430 rth->fl.mark = oldflp->mark;
2431 rth->rt_dst = fl->fl4_dst;
2432 rth->rt_src = fl->fl4_src;
2433 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2434	/* get references to the devices that are to be held by the routing
2435 cache entry */
2436 rth->u.dst.dev = dev_out;
2437 dev_hold(dev_out);
2438 rth->idev = in_dev_get(dev_out);
2439 rth->rt_gateway = fl->fl4_dst;
2440 rth->rt_spec_dst= fl->fl4_src;
2441
2442 rth->u.dst.output=ip_output;
2443 rth->u.dst.obsolete = -1;
2444 rth->rt_genid = rt_genid(dev_net(dev_out));
2445
2446 RT_CACHE_STAT_INC(out_slow_tot);
2447
2448 if (flags & RTCF_LOCAL) {
2449 rth->u.dst.input = ip_local_deliver;
2450 rth->rt_spec_dst = fl->fl4_dst;
2451 }
2452 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2453 rth->rt_spec_dst = fl->fl4_src;
2454 if (flags & RTCF_LOCAL &&
2455 !(dev_out->flags & IFF_LOOPBACK)) {
2456 rth->u.dst.output = ip_mc_output;
2457 RT_CACHE_STAT_INC(out_slow_mc);
2458 }
2459#ifdef CONFIG_IP_MROUTE
2460 if (res->type == RTN_MULTICAST) {
2461 if (IN_DEV_MFORWARD(in_dev) &&
2462 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2463 rth->u.dst.input = ip_mr_input;
2464 rth->u.dst.output = ip_mc_output;
2465 }
2466 }
2467#endif
2468 }
2469
2470 rt_set_nexthop(rth, res, 0);
2471
2472 rth->rt_flags = flags;
2473
2474 *result = rth;
2475 cleanup:
2476 /* release work reference to inet device */
2477 in_dev_put(in_dev);
2478
2479 return err;
2480}
2481
2482static int ip_mkroute_output(struct rtable **rp,
2483 struct fib_result *res,
2484 const struct flowi *fl,
2485 const struct flowi *oldflp,
2486 struct net_device *dev_out,
2487 unsigned flags)
2488{
2489 struct rtable *rth = NULL;
2490 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2491 unsigned hash;
2492 if (err == 0) {
2493 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2494 rt_genid(dev_net(dev_out)));
2495 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2496 }
2497
2498 return err;
2499}
2500
2501/*
2502 * Major route resolver routine.
2503 */
2504
2505static int ip_route_output_slow(struct net *net, struct rtable **rp,
2506 const struct flowi *oldflp)
2507{
2508 u32 tos = RT_FL_TOS(oldflp);
2509 struct flowi fl = { .nl_u = { .ip4_u =
2510 { .daddr = oldflp->fl4_dst,
2511 .saddr = oldflp->fl4_src,
2512 .tos = tos & IPTOS_RT_MASK,
2513 .scope = ((tos & RTO_ONLINK) ?
2514 RT_SCOPE_LINK :
2515 RT_SCOPE_UNIVERSE),
2516 } },
2517 .mark = oldflp->mark,
2518 .iif = net->loopback_dev->ifindex,
2519 .oif = oldflp->oif };
2520 struct fib_result res;
2521 unsigned flags = 0;
2522 struct net_device *dev_out = NULL;
2523 int free_res = 0;
2524 int err;
2525
2526
2527 res.fi = NULL;
2528#ifdef CONFIG_IP_MULTIPLE_TABLES
2529 res.r = NULL;
2530#endif
2531
2532 if (oldflp->fl4_src) {
2533 err = -EINVAL;
2534 if (ipv4_is_multicast(oldflp->fl4_src) ||
2535 ipv4_is_lbcast(oldflp->fl4_src) ||
2536 ipv4_is_zeronet(oldflp->fl4_src))
2537 goto out;
2538
2539 /* I removed check for oif == dev_out->oif here.
2540 It was wrong for two reasons:
2541		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2542		      is assigned to multiple interfaces.
2543		   2. Moreover, we are allowed to send packets with the saddr
2544		      of another iface. --ANK
2545 */
2546
2547 if (oldflp->oif == 0 &&
2548 (ipv4_is_multicast(oldflp->fl4_dst) ||
2549 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2550 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2551 dev_out = ip_dev_find(net, oldflp->fl4_src);
2552 if (dev_out == NULL)
2553 goto out;
2554
2555			/* Special hack: the user can direct multicasts
2556			   and limited broadcasts via the desired interface
2557			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2558			   This hack is not just for fun, it allows
2559			   vic, vat and friends to work.
2560			   They bind the socket to loopback, set the ttl to zero
2561			   and expect that it will work.
2562			   From the viewpoint of the routing cache they are broken,
2563			   because we are not allowed to build a multicast path
2564			   with a loopback source addr (look, the routing cache
2565			   cannot know that the ttl is zero, so the packet
2566			   will not leave this host and the route is valid).
2567			   Luckily, this hack is a good workaround.
2568 */
2569
2570 fl.oif = dev_out->ifindex;
2571 goto make_route;
2572 }
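		/* Illustrative flow key (made-up addresses) that takes the
		   branch above: a socket bound to a local source address,
		   sending to a multicast group with no explicit oif, e.g.

			fl4_src = 127.0.0.1, fl4_dst = 224.2.127.254, oif = 0

		   In that case ip_dev_find() resolves the device owning the
		   source address and its ifindex becomes fl.oif before we
		   jump to make_route. */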
2573
2574 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2575 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2576 dev_out = ip_dev_find(net, oldflp->fl4_src);
2577 if (dev_out == NULL)
2578 goto out;
2579 dev_put(dev_out);
2580 dev_out = NULL;
2581 }
2582 }
2583
2584
2585 if (oldflp->oif) {
2586 dev_out = dev_get_by_index(net, oldflp->oif);
2587 err = -ENODEV;
2588 if (dev_out == NULL)
2589 goto out;
2590
2591 /* RACE: Check return value of inet_select_addr instead. */
2592 if (__in_dev_get_rtnl(dev_out) == NULL) {
2593 dev_put(dev_out);
2594 goto out; /* Wrong error code */
2595 }
2596
2597 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2598 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2599 if (!fl.fl4_src)
2600 fl.fl4_src = inet_select_addr(dev_out, 0,
2601 RT_SCOPE_LINK);
2602 goto make_route;
2603 }
2604 if (!fl.fl4_src) {
2605 if (ipv4_is_multicast(oldflp->fl4_dst))
2606 fl.fl4_src = inet_select_addr(dev_out, 0,
2607 fl.fl4_scope);
2608 else if (!oldflp->fl4_dst)
2609 fl.fl4_src = inet_select_addr(dev_out, 0,
2610 RT_SCOPE_HOST);
2611 }
2612 }
2613
2614 if (!fl.fl4_dst) {
2615 fl.fl4_dst = fl.fl4_src;
2616 if (!fl.fl4_dst)
2617 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2618 if (dev_out)
2619 dev_put(dev_out);
2620 dev_out = net->loopback_dev;
2621 dev_hold(dev_out);
2622 fl.oif = net->loopback_dev->ifindex;
2623 res.type = RTN_LOCAL;
2624 flags |= RTCF_LOCAL;
2625 goto make_route;
2626 }
2627
2628 if (fib_lookup(net, &fl, &res)) {
2629 res.fi = NULL;
2630 if (oldflp->oif) {
2631			/* Apparently, the routing tables are wrong. Assume
2632			   that the destination is on-link.
2633
2634			   WHY? DW.
2635			   Because we are allowed to send to an iface
2636			   even if it has NO routes and NO assigned
2637			   addresses. When oif is specified, the routing
2638			   tables are looked up with only one purpose:
2639			   to catch whether the destination is gatewayed,
2640			   rather than direct. Moreover, if MSG_DONTROUTE is set,
2641			   we send the packet, ignoring both the routing tables
2642			   and the ifaddr state. --ANK
2643
2644
2645			   We could do this even when oif is unknown,
2646			   as IPv6 likely does, but we do not.
2647 */
2648
2649 if (fl.fl4_src == 0)
2650 fl.fl4_src = inet_select_addr(dev_out, 0,
2651 RT_SCOPE_LINK);
2652 res.type = RTN_UNICAST;
2653 goto make_route;
2654 }
2655 if (dev_out)
2656 dev_put(dev_out);
2657 err = -ENETUNREACH;
2658 goto out;
2659 }
2660 free_res = 1;
2661
2662 if (res.type == RTN_LOCAL) {
2663 if (!fl.fl4_src)
2664 fl.fl4_src = fl.fl4_dst;
2665 if (dev_out)
2666 dev_put(dev_out);
2667 dev_out = net->loopback_dev;
2668 dev_hold(dev_out);
2669 fl.oif = dev_out->ifindex;
2670 if (res.fi)
2671 fib_info_put(res.fi);
2672 res.fi = NULL;
2673 flags |= RTCF_LOCAL;
2674 goto make_route;
2675 }
2676
2677#ifdef CONFIG_IP_ROUTE_MULTIPATH
2678 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2679 fib_select_multipath(&fl, &res);
2680 else
2681#endif
2682 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2683 fib_select_default(net, &fl, &res);
2684
2685 if (!fl.fl4_src)
2686 fl.fl4_src = FIB_RES_PREFSRC(res);
2687
2688 if (dev_out)
2689 dev_put(dev_out);
2690 dev_out = FIB_RES_DEV(res);
2691 dev_hold(dev_out);
2692 fl.oif = dev_out->ifindex;
2693
2694
2695make_route:
2696 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2697
2698
2699 if (free_res)
2700 fib_res_put(&res);
2701 if (dev_out)
2702 dev_put(dev_out);
2703out: return err;
2704}
2705
2706int __ip_route_output_key(struct net *net, struct rtable **rp,
2707 const struct flowi *flp)
2708{
2709 unsigned hash;
2710 struct rtable *rth;
2711
2712 if (!rt_caching(net))
2713 goto slow_output;
2714
2715 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2716
2717 rcu_read_lock_bh();
2718 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2719 rth = rcu_dereference_bh(rth->u.dst.rt_next)) {
2720 if (rth->fl.fl4_dst == flp->fl4_dst &&
2721 rth->fl.fl4_src == flp->fl4_src &&
2722 rth->fl.iif == 0 &&
2723 rth->fl.oif == flp->oif &&
2724 rth->fl.mark == flp->mark &&
2725 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2726 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2727 net_eq(dev_net(rth->u.dst.dev), net) &&
2728 !rt_is_expired(rth)) {
2729 dst_use(&rth->u.dst, jiffies);
2730 RT_CACHE_STAT_INC(out_hit);
2731 rcu_read_unlock_bh();
2732 *rp = rth;
2733 return 0;
2734 }
2735 RT_CACHE_STAT_INC(out_hlist_search);
2736 }
2737 rcu_read_unlock_bh();
2738
2739slow_output:
2740 return ip_route_output_slow(net, rp, flp);
2741}
2742
2743EXPORT_SYMBOL_GPL(__ip_route_output_key);
2744
2745static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2746{
2747}
2748
2749static struct dst_ops ipv4_dst_blackhole_ops = {
2750 .family = AF_INET,
2751 .protocol = cpu_to_be16(ETH_P_IP),
2752 .destroy = ipv4_dst_destroy,
2753 .check = ipv4_dst_check,
2754 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2755 .entries = ATOMIC_INIT(0),
2756};
2757
2758
2759static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2760{
2761 struct rtable *ort = *rp;
2762 struct rtable *rt = (struct rtable *)
2763 dst_alloc(&ipv4_dst_blackhole_ops);
2764
2765 if (rt) {
2766 struct dst_entry *new = &rt->u.dst;
2767
2768 atomic_set(&new->__refcnt, 1);
2769 new->__use = 1;
2770 new->input = dst_discard;
2771 new->output = dst_discard;
2772 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2773
2774 new->dev = ort->u.dst.dev;
2775 if (new->dev)
2776 dev_hold(new->dev);
2777
2778 rt->fl = ort->fl;
2779
2780 rt->idev = ort->idev;
2781 if (rt->idev)
2782 in_dev_hold(rt->idev);
2783 rt->rt_genid = rt_genid(net);
2784 rt->rt_flags = ort->rt_flags;
2785 rt->rt_type = ort->rt_type;
2786 rt->rt_dst = ort->rt_dst;
2787 rt->rt_src = ort->rt_src;
2788 rt->rt_iif = ort->rt_iif;
2789 rt->rt_gateway = ort->rt_gateway;
2790 rt->rt_spec_dst = ort->rt_spec_dst;
2791 rt->peer = ort->peer;
2792 if (rt->peer)
2793 atomic_inc(&rt->peer->refcnt);
2794
2795 dst_free(new);
2796 }
2797
2798 dst_release(&(*rp)->u.dst);
2799 *rp = rt;
2800 return (rt ? 0 : -ENOMEM);
2801}
2802
2803int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2804 struct sock *sk, int flags)
2805{
2806 int err;
2807
2808 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2809 return err;
2810
2811 if (flp->proto) {
2812 if (!flp->fl4_src)
2813 flp->fl4_src = (*rp)->rt_src;
2814 if (!flp->fl4_dst)
2815 flp->fl4_dst = (*rp)->rt_dst;
2816 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2817 flags ? XFRM_LOOKUP_WAIT : 0);
2818 if (err == -EREMOTE)
2819 err = ipv4_dst_blackhole(net, rp, flp);
2820
2821 return err;
2822 }
2823
2824 return 0;
2825}
2826
2827EXPORT_SYMBOL_GPL(ip_route_output_flow);
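/*
 * Sketch (not built) of a typical transport-level caller, roughly what
 * the ip_route_connect() helper in include/net/route.h boils down to.
 * The proto/port fields are filled in so that the __xfrm_lookup()
 * branch above can match IPsec policies on them; inet == inet_sk(sk).
 *
 *	struct flowi fl = { .oif = sk->sk_bound_dev_if,
 *			    .mark = sk->sk_mark,
 *			    .nl_u = { .ip4_u = { .daddr = daddr,
 *						 .saddr = inet->inet_saddr,
 *						 .tos = RT_CONN_FLAGS(sk) } },
 *			    .proto = sk->sk_protocol,
 *			    .uli_u = { .ports = { .sport = inet->inet_sport,
 *						  .dport = dport } } };
 *
 *	err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1);
 *
 * The non-zero final argument allows XFRM_LOOKUP_WAIT, i.e. the lookup
 * may sleep waiting for IPsec SA resolution.
 */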
2828
2829int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2830{
2831 return ip_route_output_flow(net, rp, flp, NULL, 0);
2832}
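/*
 * Minimal usage sketch (not built) for the key-based API above; the
 * flow fields shown are the common ones, error handling is reduced to
 * the essentials, and daddr/tos are assumed to come from the caller.
 *
 *	struct rtable *rt;
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr,
 *						 .saddr = 0,
 *						 .tos = RT_TOS(tos) } },
 *			    .oif = 0 };
 *
 *	if (ip_route_output_key(net, &rt, &fl))
 *		return -EHOSTUNREACH;
 *	...transmit via rt->u.dst...
 *	ip_rt_put(rt);
 */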
2833
2834static int rt_fill_info(struct net *net,
2835 struct sk_buff *skb, u32 pid, u32 seq, int event,
2836 int nowait, unsigned int flags)
2837{
2838 struct rtable *rt = skb_rtable(skb);
2839 struct rtmsg *r;
2840 struct nlmsghdr *nlh;
2841 long expires;
2842 u32 id = 0, ts = 0, tsage = 0, error;
2843
2844 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2845 if (nlh == NULL)
2846 return -EMSGSIZE;
2847
2848 r = nlmsg_data(nlh);
2849 r->rtm_family = AF_INET;
2850 r->rtm_dst_len = 32;
2851 r->rtm_src_len = 0;
2852 r->rtm_tos = rt->fl.fl4_tos;
2853 r->rtm_table = RT_TABLE_MAIN;
2854 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2855 r->rtm_type = rt->rt_type;
2856 r->rtm_scope = RT_SCOPE_UNIVERSE;
2857 r->rtm_protocol = RTPROT_UNSPEC;
2858 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2859 if (rt->rt_flags & RTCF_NOTIFY)
2860 r->rtm_flags |= RTM_F_NOTIFY;
2861
2862 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2863
2864 if (rt->fl.fl4_src) {
2865 r->rtm_src_len = 32;
2866 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2867 }
2868 if (rt->u.dst.dev)
2869 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2870#ifdef CONFIG_NET_CLS_ROUTE
2871 if (rt->u.dst.tclassid)
2872 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2873#endif
2874 if (rt->fl.iif)
2875 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2876 else if (rt->rt_src != rt->fl.fl4_src)
2877 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2878
2879 if (rt->rt_dst != rt->rt_gateway)
2880 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2881
2882 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2883 goto nla_put_failure;
2884
2885 error = rt->u.dst.error;
2886 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2887 if (rt->peer) {
2888 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2889 if (rt->peer->tcp_ts_stamp) {
2890 ts = rt->peer->tcp_ts;
2891 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2892 }
2893 }
2894
2895 if (rt->fl.iif) {
2896#ifdef CONFIG_IP_MROUTE
2897 __be32 dst = rt->rt_dst;
2898
2899 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2900 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2901 int err = ipmr_get_route(net, skb, r, nowait);
2902 if (err <= 0) {
2903 if (!nowait) {
2904 if (err == 0)
2905 return 0;
2906 goto nla_put_failure;
2907 } else {
2908 if (err == -EMSGSIZE)
2909 goto nla_put_failure;
2910 error = err;
2911 }
2912 }
2913 } else
2914#endif
2915 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2916 }
2917
2918 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2919 expires, error) < 0)
2920 goto nla_put_failure;
2921
2922 return nlmsg_end(skb, nlh);
2923
2924nla_put_failure:
2925 nlmsg_cancel(skb, nlh);
2926 return -EMSGSIZE;
2927}
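/*
 * Rough layout of the RTM_NEWROUTE message built above, in the order
 * the attributes are appended (several are conditional):
 *
 *	struct rtmsg	dst_len=32, tos, table=RT_TABLE_MAIN, type, flags
 *	RTA_TABLE	RT_TABLE_MAIN
 *	RTA_DST		rt->rt_dst
 *	RTA_SRC		rt->fl.fl4_src			(if a source was keyed)
 *	RTA_OIF		rt->u.dst.dev->ifindex		(if a device is set)
 *	RTA_FLOW	rt->u.dst.tclassid		(CONFIG_NET_CLS_ROUTE)
 *	RTA_PREFSRC	rt_spec_dst or rt_src
 *	RTA_GATEWAY	rt->rt_gateway			(if gatewayed)
 *	RTAX_* metrics	via rtnetlink_put_metrics()
 *	RTA_IIF		rt->fl.iif			(input routes, non-mroute)
 *	RTA_CACHEINFO	id/ts/tsage/expires/error	via rtnl_put_cacheinfo()
 */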
2928
2929static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2930{
2931 struct net *net = sock_net(in_skb->sk);
2932 struct rtmsg *rtm;
2933 struct nlattr *tb[RTA_MAX+1];
2934 struct rtable *rt = NULL;
2935 __be32 dst = 0;
2936 __be32 src = 0;
2937 u32 iif;
2938 int err;
2939 struct sk_buff *skb;
2940
2941 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2942 if (err < 0)
2943 goto errout;
2944
2945 rtm = nlmsg_data(nlh);
2946
2947 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2948 if (skb == NULL) {
2949 err = -ENOBUFS;
2950 goto errout;
2951 }
2952
2953	/* Reserve room for dummy headers; this skb can pass
2954	   through a good chunk of the routing engine.
2955 */
2956 skb_reset_mac_header(skb);
2957 skb_reset_network_header(skb);
2958
2959 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2960 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2961 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2962
2963 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2964 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2965 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2966
2967 if (iif) {
2968 struct net_device *dev;
2969
2970 dev = __dev_get_by_index(net, iif);
2971 if (dev == NULL) {
2972 err = -ENODEV;
2973 goto errout_free;
2974 }
2975
2976 skb->protocol = htons(ETH_P_IP);
2977 skb->dev = dev;
2978 local_bh_disable();
2979 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2980 local_bh_enable();
2981
2982 rt = skb_rtable(skb);
2983 if (err == 0 && rt->u.dst.error)
2984 err = -rt->u.dst.error;
2985 } else {
2986 struct flowi fl = {
2987 .nl_u = {
2988 .ip4_u = {
2989 .daddr = dst,
2990 .saddr = src,
2991 .tos = rtm->rtm_tos,
2992 },
2993 },
2994 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2995 };
2996 err = ip_route_output_key(net, &rt, &fl);
2997 }
2998
2999 if (err)
3000 goto errout_free;
3001
3002 skb_dst_set(skb, &rt->u.dst);
3003 if (rtm->rtm_flags & RTM_F_NOTIFY)
3004 rt->rt_flags |= RTCF_NOTIFY;
3005
3006 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3007 RTM_NEWROUTE, 0, 0);
3008 if (err <= 0)
3009 goto errout_free;
3010
3011 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3012errout:
3013 return err;
3014
3015errout_free:
3016 kfree_skb(skb);
3017 goto errout;
3018}
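/*
 * For illustration (made-up addresses): "ip route get 192.0.2.1" from
 * iproute2 sends an RTM_GETROUTE request whose RTA_DST carries
 * 192.0.2.1 and no RTA_IIF, so the else-branch above does an output
 * lookup via ip_route_output_key() and the reply is rendered by
 * rt_fill_info(). Adding "iif eth0" supplies RTA_IIF and exercises the
 * input path via ip_route_input() on the dummy skb instead.
 */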
3019
3020int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3021{
3022 struct rtable *rt;
3023 int h, s_h;
3024 int idx, s_idx;
3025 struct net *net;
3026
3027 net = sock_net(skb->sk);
3028
3029 s_h = cb->args[0];
3030 if (s_h < 0)
3031 s_h = 0;
3032 s_idx = idx = cb->args[1];
3033 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3034 if (!rt_hash_table[h].chain)
3035 continue;
3036 rcu_read_lock_bh();
3037 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3038 rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) {
3039 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
3040 continue;
3041 if (rt_is_expired(rt))
3042 continue;
3043 skb_dst_set_noref(skb, &rt->u.dst);
3044 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3045 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3046 1, NLM_F_MULTI) <= 0) {
3047 skb_dst_drop(skb);
3048 rcu_read_unlock_bh();
3049 goto done;
3050 }
3051 skb_dst_drop(skb);
3052 }
3053 rcu_read_unlock_bh();
3054 }
3055
3056done:
3057 cb->args[0] = h;
3058 cb->args[1] = idx;
3059 return skb->len;
3060}
3061
3062void ip_rt_multicast_event(struct in_device *in_dev)
3063{
3064 rt_cache_flush(dev_net(in_dev->dev), 0);
3065}
3066
3067#ifdef CONFIG_SYSCTL
3068static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3069 void __user *buffer,
3070 size_t *lenp, loff_t *ppos)
3071{
3072 if (write) {
3073 int flush_delay;
3074 ctl_table ctl;
3075 struct net *net;
3076
3077 memcpy(&ctl, __ctl, sizeof(ctl));
3078 ctl.data = &flush_delay;
3079 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3080
3081 net = (struct net *)__ctl->extra1;
3082 rt_cache_flush(net, flush_delay);
3083 return 0;
3084 }
3085
3086 return -EINVAL;
3087}
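/*
 * Illustrative usage, assuming the ipv4_route_path/ipv4_route_flush_table
 * registration below: writing a flush delay (in seconds) to the per-net
 * file, e.g.
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * lands here with write != 0, proc_dointvec() parses the value into
 * flush_delay, and rt_cache_flush(net, flush_delay) invalidates the cache.
 */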
3088
3089static ctl_table ipv4_route_table[] = {
3090 {
3091 .procname = "gc_thresh",
3092 .data = &ipv4_dst_ops.gc_thresh,
3093 .maxlen = sizeof(int),
3094 .mode = 0644,
3095 .proc_handler = proc_dointvec,
3096 },
3097 {
3098 .procname = "max_size",
3099 .data = &ip_rt_max_size,
3100 .maxlen = sizeof(int),
3101 .mode = 0644,
3102 .proc_handler = proc_dointvec,
3103 },
3104 {
3105 /* Deprecated. Use gc_min_interval_ms */
3106
3107 .procname = "gc_min_interval",
3108 .data = &ip_rt_gc_min_interval,
3109 .maxlen = sizeof(int),
3110 .mode = 0644,
3111 .proc_handler = proc_dointvec_jiffies,
3112 },
3113 {
3114 .procname = "gc_min_interval_ms",
3115 .data = &ip_rt_gc_min_interval,
3116 .maxlen = sizeof(int),
3117 .mode = 0644,
3118 .proc_handler = proc_dointvec_ms_jiffies,
3119 },
3120 {
3121 .procname = "gc_timeout",
3122 .data = &ip_rt_gc_timeout,
3123 .maxlen = sizeof(int),
3124 .mode = 0644,
3125 .proc_handler = proc_dointvec_jiffies,
3126 },
3127 {
3128 .procname = "gc_interval",
3129 .data = &ip_rt_gc_interval,
3130 .maxlen = sizeof(int),
3131 .mode = 0644,
3132 .proc_handler = proc_dointvec_jiffies,
3133 },
3134 {
3135 .procname = "redirect_load",
3136 .data = &ip_rt_redirect_load,
3137 .maxlen = sizeof(int),
3138 .mode = 0644,
3139 .proc_handler = proc_dointvec,
3140 },
3141 {
3142 .procname = "redirect_number",
3143 .data = &ip_rt_redirect_number,
3144 .maxlen = sizeof(int),
3145 .mode = 0644,
3146 .proc_handler = proc_dointvec,
3147 },
3148 {
3149 .procname = "redirect_silence",
3150 .data = &ip_rt_redirect_silence,
3151 .maxlen = sizeof(int),
3152 .mode = 0644,
3153 .proc_handler = proc_dointvec,
3154 },
3155 {
3156 .procname = "error_cost",
3157 .data = &ip_rt_error_cost,
3158 .maxlen = sizeof(int),
3159 .mode = 0644,
3160 .proc_handler = proc_dointvec,
3161 },
3162 {
3163 .procname = "error_burst",
3164 .data = &ip_rt_error_burst,
3165 .maxlen = sizeof(int),
3166 .mode = 0644,
3167 .proc_handler = proc_dointvec,
3168 },
3169 {
3170 .procname = "gc_elasticity",
3171 .data = &ip_rt_gc_elasticity,
3172 .maxlen = sizeof(int),
3173 .mode = 0644,
3174 .proc_handler = proc_dointvec,
3175 },
3176 {
3177 .procname = "mtu_expires",
3178 .data = &ip_rt_mtu_expires,
3179 .maxlen = sizeof(int),
3180 .mode = 0644,
3181 .proc_handler = proc_dointvec_jiffies,
3182 },
3183 {
3184 .procname = "min_pmtu",
3185 .data = &ip_rt_min_pmtu,
3186 .maxlen = sizeof(int),
3187 .mode = 0644,
3188 .proc_handler = proc_dointvec,
3189 },
3190 {
3191 .procname = "min_adv_mss",
3192 .data = &ip_rt_min_advmss,
3193 .maxlen = sizeof(int),
3194 .mode = 0644,
3195 .proc_handler = proc_dointvec,
3196 },
3197 { }
3198};
3199
3200static struct ctl_table empty[1];
3201
3202static struct ctl_table ipv4_skeleton[] =
3203{
3204 { .procname = "route",
3205 .mode = 0555, .child = ipv4_route_table},
3206 { .procname = "neigh",
3207 .mode = 0555, .child = empty},
3208 { }
3209};
3210
3211static __net_initdata struct ctl_path ipv4_path[] = {
3212 { .procname = "net", },
3213 { .procname = "ipv4", },
3214 { },
3215};
3216
3217static struct ctl_table ipv4_route_flush_table[] = {
3218 {
3219 .procname = "flush",
3220 .maxlen = sizeof(int),
3221 .mode = 0200,
3222 .proc_handler = ipv4_sysctl_rtcache_flush,
3223 },
3224 { },
3225};
3226
3227static __net_initdata struct ctl_path ipv4_route_path[] = {
3228 { .procname = "net", },
3229 { .procname = "ipv4", },
3230 { .procname = "route", },
3231 { },
3232};
3233
3234static __net_init int sysctl_route_net_init(struct net *net)
3235{
3236 struct ctl_table *tbl;
3237
3238 tbl = ipv4_route_flush_table;
3239 if (!net_eq(net, &init_net)) {
3240 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3241 if (tbl == NULL)
3242 goto err_dup;
3243 }
3244 tbl[0].extra1 = net;
3245
3246 net->ipv4.route_hdr =
3247 register_net_sysctl_table(net, ipv4_route_path, tbl);
3248 if (net->ipv4.route_hdr == NULL)
3249 goto err_reg;
3250 return 0;
3251
3252err_reg:
3253 if (tbl != ipv4_route_flush_table)
3254 kfree(tbl);
3255err_dup:
3256 return -ENOMEM;
3257}
3258
3259static __net_exit void sysctl_route_net_exit(struct net *net)
3260{
3261 struct ctl_table *tbl;
3262
3263 tbl = net->ipv4.route_hdr->ctl_table_arg;
3264 unregister_net_sysctl_table(net->ipv4.route_hdr);
3265 BUG_ON(tbl == ipv4_route_flush_table);
3266 kfree(tbl);
3267}
3268
3269static __net_initdata struct pernet_operations sysctl_route_ops = {
3270 .init = sysctl_route_net_init,
3271 .exit = sysctl_route_net_exit,
3272};
3273#endif
3274
3275static __net_init int rt_genid_init(struct net *net)
3276{
3277 get_random_bytes(&net->ipv4.rt_genid,
3278 sizeof(net->ipv4.rt_genid));
3279 return 0;
3280}
3281
3282static __net_initdata struct pernet_operations rt_genid_ops = {
3283 .init = rt_genid_init,
3284};
3285
3286
3287#ifdef CONFIG_NET_CLS_ROUTE
3288struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3289#endif /* CONFIG_NET_CLS_ROUTE */
3290
3291static __initdata unsigned long rhash_entries;
3292static int __init set_rhash_entries(char *str)
3293{
3294 if (!str)
3295 return 0;
3296 rhash_entries = simple_strtoul(str, &str, 0);
3297 return 1;
3298}
3299__setup("rhash_entries=", set_rhash_entries);
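/*
 * Illustrative boot-time usage: passing "rhash_entries=65536" on the
 * kernel command line is parsed by set_rhash_entries() above, and
 * ip_rt_init() below then asks alloc_large_system_hash() for roughly
 * that many buckets (rounded to a power of two) instead of auto-sizing
 * the route cache hash from available memory.
 */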
3300
3301int __init ip_rt_init(void)
3302{
3303 int rc = 0;
3304
3305#ifdef CONFIG_NET_CLS_ROUTE
3306 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3307 if (!ip_rt_acct)
3308 panic("IP: failed to allocate ip_rt_acct\n");
3309#endif
3310
3311 ipv4_dst_ops.kmem_cachep =
3312 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3313 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3314
3315 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3316
3317 rt_hash_table = (struct rt_hash_bucket *)
3318 alloc_large_system_hash("IP route cache",
3319 sizeof(struct rt_hash_bucket),
3320 rhash_entries,
3321 (totalram_pages >= 128 * 1024) ?
3322 15 : 17,
3323 0,
3324 &rt_hash_log,
3325 &rt_hash_mask,
3326 rhash_entries ? 0 : 512 * 1024);
3327 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3328 rt_hash_lock_init();
3329
3330 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3331 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3332
3333 devinet_init();
3334 ip_fib_init();
3335
3336	/* All the timers started at system startup tend
3337	   to synchronize. Perturb them a bit.
3338 */
3339 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3340 expires_ljiffies = jiffies;
3341 schedule_delayed_work(&expires_work,
3342 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3343
3344 if (ip_rt_proc_init())
3345 printk(KERN_ERR "Unable to create route proc files\n");
3346#ifdef CONFIG_XFRM
3347 xfrm_init();
3348 xfrm4_init(ip_rt_max_size);
3349#endif
3350 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3351
3352#ifdef CONFIG_SYSCTL
3353 register_pernet_subsys(&sysctl_route_ops);
3354#endif
3355 register_pernet_subsys(&rt_genid_ops);
3356 return rc;
3357}
3358
3359#ifdef CONFIG_SYSCTL
3360/*
3361 * We really need to sanitize the damn ipv4 init order, then all
3362 * this nonsense will go away.
3363 */
3364void __init ip_static_sysctl_init(void)
3365{
3366 register_sysctl_paths(ipv4_path, ipv4_skeleton);
3367}
3368#endif
3369
3370EXPORT_SYMBOL(__ip_select_ident);
3371EXPORT_SYMBOL(ip_route_output_key);