]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/ipv6/route.c
[IPV6]: Remove ndiscs rt6_lock dependency
[net-next-2.6.git] / net / ipv6 / route.c
CommitLineData
1da177e4
LT
1/*
2 * Linux INET6 implementation
3 * FIB front-end.
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 */
15
16/* Changes:
17 *
18 * YOSHIFUJI Hideaki @USAGI
19 * reworked default router selection.
20 * - respect outgoing interface
21 * - select from (probably) reachable routers (i.e.
22 * routers in REACHABLE, STALE, DELAY or PROBE states).
23 * - always select the same router if it is (probably)
24 * reachable. otherwise, round-robin the list.
25 */
26
4fc268d2 27#include <linux/capability.h>
1da177e4
LT
28#include <linux/errno.h>
29#include <linux/types.h>
30#include <linux/times.h>
31#include <linux/socket.h>
32#include <linux/sockios.h>
33#include <linux/net.h>
34#include <linux/route.h>
35#include <linux/netdevice.h>
36#include <linux/in6.h>
37#include <linux/init.h>
38#include <linux/netlink.h>
39#include <linux/if_arp.h>
40
41#ifdef CONFIG_PROC_FS
42#include <linux/proc_fs.h>
43#include <linux/seq_file.h>
44#endif
45
46#include <net/snmp.h>
47#include <net/ipv6.h>
48#include <net/ip6_fib.h>
49#include <net/ip6_route.h>
50#include <net/ndisc.h>
51#include <net/addrconf.h>
52#include <net/tcp.h>
53#include <linux/rtnetlink.h>
54#include <net/dst.h>
55#include <net/xfrm.h>
8d71740c 56#include <net/netevent.h>
1da177e4
LT
57
58#include <asm/uaccess.h>
59
60#ifdef CONFIG_SYSCTL
61#include <linux/sysctl.h>
62#endif
63
64/* Set to 3 to get tracing. */
65#define RT6_DEBUG 2
66
/*
 * RDBG(x) takes a parenthesised printk argument list; RT6_TRACE() takes
 * printk-style arguments and logs them at KERN_DEBUG.  Unless RT6_DEBUG
 * is at least 3, RDBG() expands to nothing and RT6_TRACE() to an empty
 * statement.
 */
67#if RT6_DEBUG >= 3
68#define RDBG(x) printk x
69#define RT6_TRACE(x...) printk(KERN_DEBUG x)
70#else
71#define RDBG(x)
72#define RT6_TRACE(x...) do { ; } while (0)
73#endif
74
/*
 * When nonzero, ip6_route_input()/ip6_route_output() clone a route that
 * already has a nexthop with rt6_alloc_clone(); with 0 such a route is
 * used as it was found.
 */
519fbd87 75#define CLONE_OFFLINK_ROUTE 0
1da177e4 76
554cfb7e
YH
/*
 * Bits of the "strict" argument of rt6_score_route()/rt6_select():
 *   RT6_SELECT_F_IFACE     - reject routes for which rt6_check_dev() is 0
 *   RT6_SELECT_F_REACHABLE - reject routes for which rt6_check_neigh() is 0
 */
77#define RT6_SELECT_F_IFACE 0x1
78#define RT6_SELECT_F_REACHABLE 0x2
79
1da177e4
LT
/* ip6_dst_gc() reports pressure while the dst entry count exceeds this. */
80static int ip6_rt_max_size = 4096;
/*
 * ip6_dst_gc() skips a run that comes sooner than this many jiffies after
 * the previous one, unless the entry count exceeds ip6_rt_max_size.
 */
81static int ip6_rt_gc_min_interval = HZ / 2;
/* Half of this value is what ip6_dst_gc() resets its expiry to once below gc_thresh. */
82static int ip6_rt_gc_timeout = 60*HZ;
83int ip6_rt_gc_interval = 30*HZ;
/* Shift count by which ip6_dst_gc() decays its expiry on every call. */
84static int ip6_rt_gc_elasticity = 9;
85static int ip6_rt_mtu_expires = 10*60*HZ;
/* Lower bound applied by ipv6_advmss(). */
86static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87
88static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
90static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91static void ip6_dst_destroy(struct dst_entry *);
92static void ip6_dst_ifdown(struct dst_entry *,
93 struct net_device *dev, int how);
94static int ip6_dst_gc(void);
95
96static int ip6_pkt_discard(struct sk_buff *skb);
97static int ip6_pkt_discard_out(struct sk_buff *skb);
98static void ip6_link_failure(struct sk_buff *skb);
99static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100
70ceb4f5
YH
101#ifdef CONFIG_IPV6_ROUTE_INFO
102static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103 struct in6_addr *gwaddr, int ifindex,
104 unsigned pref);
105static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106 struct in6_addr *gwaddr, int ifindex);
107#endif
108
1da177e4
LT
/*
 * dst operations table for IPv6 routes.  ip6_dst_alloc() hands it to
 * dst_alloc(); the callbacks are the ip6_* functions declared above and
 * each entry is sized as a struct rt6_info.
 */
109static struct dst_ops ip6_dst_ops = {
110 .family = AF_INET6,
111 .protocol = __constant_htons(ETH_P_IPV6),
112 .gc = ip6_dst_gc,
113 .gc_thresh = 1024,
114 .check = ip6_dst_check,
115 .destroy = ip6_dst_destroy,
116 .ifdown = ip6_dst_ifdown,
117 .negative_advice = ip6_negative_advice,
118 .link_failure = ip6_link_failure,
119 .update_pmtu = ip6_rt_update_pmtu,
120 .entry_size = sizeof(struct rt6_info),
121};
122
/*
 * Sentinel "no route" entry: error is -ENETUNREACH, input/output are the
 * discard handlers and the flags are RTF_REJECT | RTF_NONEXTHOP.
 * rt6_select() and rt6_device_match() return its address when nothing
 * matches, and callers test for it by pointer comparison.
 */
123struct rt6_info ip6_null_entry = {
124 .u = {
125 .dst = {
126 .__refcnt = ATOMIC_INIT(1),
127 .__use = 1,
128 .dev = &loopback_dev,
129 .obsolete = -1,
130 .error = -ENETUNREACH,
131 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
132 .input = ip6_pkt_discard,
133 .output = ip6_pkt_discard_out,
134 .ops = &ip6_dst_ops,
135 .path = (struct dst_entry*)&ip6_null_entry,
136 }
137 },
138 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
139 .rt6i_metric = ~(u32) 0,
140 .rt6i_ref = ATOMIC_INIT(1),
141};
142
/*
 * Root node of the routing tree passed to fib6_lookup(), fib6_locate()
 * and fib6_add() in this file; its leaf starts out as ip6_null_entry.
 */
143struct fib6_node ip6_routing_table = {
144 .leaf = &ip6_null_entry,
145 .fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
146};
147
148/* Protects all the ip6 fib */
149
/* Taken for reading around lookups and for writing around fib6_add()/fib6_del(). */
150DEFINE_RWLOCK(rt6_lock);
151
152
153/* allocate dst with ip6_dst_ops */
154static __inline__ struct rt6_info *ip6_dst_alloc(void)
155{
156 return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
157}
158
159static void ip6_dst_destroy(struct dst_entry *dst)
160{
161 struct rt6_info *rt = (struct rt6_info *)dst;
162 struct inet6_dev *idev = rt->rt6i_idev;
163
164 if (idev != NULL) {
165 rt->rt6i_idev = NULL;
166 in6_dev_put(idev);
167 }
168}
169
170static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
171 int how)
172{
173 struct rt6_info *rt = (struct rt6_info *)dst;
174 struct inet6_dev *idev = rt->rt6i_idev;
175
176 if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
177 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
178 if (loopback_idev != NULL) {
179 rt->rt6i_idev = loopback_idev;
180 in6_dev_put(idev);
181 }
182 }
183}
184
185static __inline__ int rt6_check_expired(const struct rt6_info *rt)
186{
187 return (rt->rt6i_flags & RTF_EXPIRES &&
188 time_after(jiffies, rt->rt6i_expires));
189}
190
191/*
192 * Route lookup. Any rt6_lock is implied.
193 */
194
195static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
196 int oif,
197 int strict)
198{
199 struct rt6_info *local = NULL;
200 struct rt6_info *sprt;
201
202 if (oif) {
203 for (sprt = rt; sprt; sprt = sprt->u.next) {
204 struct net_device *dev = sprt->rt6i_dev;
205 if (dev->ifindex == oif)
206 return sprt;
207 if (dev->flags & IFF_LOOPBACK) {
208 if (sprt->rt6i_idev == NULL ||
209 sprt->rt6i_idev->dev->ifindex != oif) {
210 if (strict && oif)
211 continue;
212 if (local && (!oif ||
213 local->rt6i_idev->dev->ifindex == oif))
214 continue;
215 }
216 local = sprt;
217 }
218 }
219
220 if (local)
221 return local;
222
223 if (strict)
224 return &ip6_null_entry;
225 }
226 return rt;
227}
228
27097255
YH
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a Neighbour Solicitation towards the
 * route's nexthop when that neighbour is not in a NUD_VALID state.
 *
 * Probes are rate-limited: one is sent only when more than the interface's
 * rtr_probe_interval has passed since neigh->updated, and neigh->updated
 * is refreshed each time a probe goes out.  A NULL @rt, or a route with no
 * nexthop, is ignored.
 *
 * NOTE(review): neigh->updated is stored while only the read side of
 * neigh->lock is held.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;
	int due;

	/* Cheap unlocked test first. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	read_lock_bh(&neigh->lock);
	due = !(neigh->nud_state & NUD_VALID) &&
	      time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
	if (due)
		neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	if (!due)
		return;

	/* Solicit the neighbour on its solicited-node multicast address. */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
264
1da177e4 265/*
554cfb7e 266 * Default Router Selection (RFC 2461 6.3.6)
1da177e4 267 */
554cfb7e
YH
268static int inline rt6_check_dev(struct rt6_info *rt, int oif)
269{
270 struct net_device *dev = rt->rt6i_dev;
271 if (!oif || dev->ifindex == oif)
272 return 2;
273 if ((dev->flags & IFF_LOOPBACK) &&
274 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
275 return 1;
276 return 0;
277}
1da177e4 278
554cfb7e 279static int inline rt6_check_neigh(struct rt6_info *rt)
1da177e4 280{
554cfb7e
YH
281 struct neighbour *neigh = rt->rt6i_nexthop;
282 int m = 0;
4d0c5911
YH
283 if (rt->rt6i_flags & RTF_NONEXTHOP ||
284 !(rt->rt6i_flags & RTF_GATEWAY))
285 m = 1;
286 else if (neigh) {
554cfb7e
YH
287 read_lock_bh(&neigh->lock);
288 if (neigh->nud_state & NUD_VALID)
4d0c5911 289 m = 2;
554cfb7e 290 read_unlock_bh(&neigh->lock);
1da177e4 291 }
554cfb7e 292 return m;
1da177e4
LT
293}
294
554cfb7e
YH
295static int rt6_score_route(struct rt6_info *rt, int oif,
296 int strict)
1da177e4 297{
4d0c5911
YH
298 int m, n;
299
300 m = rt6_check_dev(rt, oif);
554cfb7e
YH
301 if (!m && (strict & RT6_SELECT_F_IFACE))
302 return -1;
ebacaaa0
YH
303#ifdef CONFIG_IPV6_ROUTER_PREF
304 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
305#endif
4d0c5911
YH
306 n = rt6_check_neigh(rt);
307 if (n > 1)
ebacaaa0 308 m |= 16;
4d0c5911 309 else if (!n && strict & RT6_SELECT_F_REACHABLE)
554cfb7e
YH
310 return -1;
311 return m;
312}
313
/*
 * Choose the best route among the entries at the front of the list *head
 * that share the first entry's metric.
 *
 * Expired routes are skipped.  Every other candidate is scored with
 * rt6_score_route(); the highest-scoring one becomes the match, and each
 * candidate that loses a comparison is handed to rt6_probe().
 *
 * If nothing matched, RT6_SELECT_F_REACHABLE was requested and the last
 * non-expired candidate is not the head itself, the head entry is moved
 * behind that last candidate so the next call starts from a different
 * route.
 *
 * Returns the match, or &ip6_null_entry when there is none.
 */
314static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
315 int strict)
316{
317 struct rt6_info *match = NULL, *last = NULL;
318 struct rt6_info *rt, *rt0 = *head;
319 u32 metric;
320 int mpri = -1;
1da177e4 321
554cfb7e
YH
322 RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
323 __FUNCTION__, head, head ? *head : NULL, oif);
1da177e4 324
/*
 * Stop at the end of the list, at the first entry with a different
 * metric, or on coming back to rt0 after at least one candidate was seen.
 */
554cfb7e 325 for (rt = rt0, metric = rt0->rt6i_metric;
c302e6d5 326 rt && rt->rt6i_metric == metric && (!last || rt != rt0);
554cfb7e
YH
327 rt = rt->u.next) {
328 int m;
1da177e4 329
554cfb7e 330 if (rt6_check_expired(rt))
1da177e4
LT
331 continue;
332
554cfb7e
YH
333 last = rt;
334
335 m = rt6_score_route(rt, oif, strict);
336 if (m < 0)
1da177e4 337 continue;
1da177e4 338
/* The displaced previous best, or the losing candidate, gets probed. */
554cfb7e 339 if (m > mpri) {
27097255 340 rt6_probe(match);
554cfb7e 341 match = rt;
1da177e4 342 mpri = m;
27097255
YH
343 } else {
344 rt6_probe(rt);
1da177e4
LT
345 }
346 }
347
554cfb7e
YH
348 if (!match &&
349 (strict & RT6_SELECT_F_REACHABLE) &&
350 last && last != rt0) {
351 /* no entries matched; do round-robin */
34af946a 352 static DEFINE_SPINLOCK(lock);
c302e6d5 353 spin_lock(&lock);
/* Unlink rt0 from the front and re-insert it right after last. */
554cfb7e
YH
354 *head = rt0->u.next;
355 rt0->u.next = last->u.next;
356 last->u.next = rt0;
c302e6d5 357 spin_unlock(&lock);
1da177e4 358 }
1da177e4 359
554cfb7e
YH
360 RT6_TRACE("%s() => %p, score=%d\n",
361 __FUNCTION__, match, mpri);
1da177e4 362
554cfb7e 363 return (match ? match : &ip6_null_entry);
1da177e4
LT
364}
365
70ceb4f5
YH
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Handle a Route Information option received from router @gwaddr on @dev.
 *
 * @opt points at the option and @len is its length in bytes.  The option
 * is validated (overall size, length field versus prefix length), then
 * the matching route-info route is looked up and
 *   - deleted when the advertised lifetime is 0,
 *   - created when it does not exist and the lifetime is nonzero,
 *   - otherwise refreshed with the advertised preference.
 * A lifetime of 0xffffffff means "never expires"; any other value sets
 * RTF_EXPIRES with an expiry of jiffies + HZ * lifetime, the lifetime
 * being capped first so that the multiplication cannot overflow.
 *
 * Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	u32 lifetime;
	struct rt6_info *rt;

	/*
	 * Test the sign explicitly: comparing a negative int against
	 * sizeof() would convert it to a huge unsigned value and pass.
	 */
	if (len < 0 || len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		pref = ICMPV6_ROUTER_PREF_MEDIUM;

	/* The lifetime arrives in network byte order. */
	lifetime = ntohl(rinfo->lifetime);
	if (lifetime == 0xffffffff) {
		/* infinity */
	} else if (lifetime > 0x7fffffff/HZ) {
		/* Avoid arithmetic overflow */
		lifetime = 0x7fffffff/HZ - 1;
	}

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

	/* A zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt, NULL, NULL, NULL);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (lifetime == 0xffffffff) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		dst_release(&rt->u.dst);
	}
	return 0;
}
#endif
443
1da177e4
LT
444struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
445 int oif, int strict)
446{
447 struct fib6_node *fn;
448 struct rt6_info *rt;
449
450 read_lock_bh(&rt6_lock);
451 fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
452 rt = rt6_device_match(fn->leaf, oif, strict);
453 dst_hold(&rt->u.dst);
454 rt->u.dst.__use++;
455 read_unlock_bh(&rt6_lock);
456
457 rt->u.dst.lastuse = jiffies;
458 if (rt->u.dst.error == 0)
459 return rt;
460 dst_release(&rt->u.dst);
461 return NULL;
462}
463
464/* ip6_ins_rt is called with FREE rt6_lock.
465 It takes new route entry, the addition fails by any reason the
466 route is freed. In any case, if caller does not hold it, it may
467 be destroyed.
468 */
469
0d51aa80
JHS
470int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
471 void *_rtattr, struct netlink_skb_parms *req)
1da177e4
LT
472{
473 int err;
474
475 write_lock_bh(&rt6_lock);
0d51aa80 476 err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
1da177e4
LT
477 write_unlock_bh(&rt6_lock);
478
479 return err;
480}
481
95a9a5ba
YH
482static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
483 struct in6_addr *saddr)
1da177e4 484{
1da177e4
LT
485 struct rt6_info *rt;
486
487 /*
488 * Clone the route.
489 */
490
491 rt = ip6_rt_copy(ort);
492
493 if (rt) {
58c4fb86
YH
494 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
495 if (rt->rt6i_dst.plen != 128 &&
496 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
497 rt->rt6i_flags |= RTF_ANYCAST;
1da177e4 498 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
58c4fb86 499 }
1da177e4 500
58c4fb86 501 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
1da177e4
LT
502 rt->rt6i_dst.plen = 128;
503 rt->rt6i_flags |= RTF_CACHE;
504 rt->u.dst.flags |= DST_HOST;
505
506#ifdef CONFIG_IPV6_SUBTREES
507 if (rt->rt6i_src.plen && saddr) {
508 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
509 rt->rt6i_src.plen = 128;
510 }
511#endif
512
513 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
514
95a9a5ba 515 }
1da177e4 516
95a9a5ba
YH
517 return rt;
518}
1da177e4 519
299d9939
YH
520static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
521{
522 struct rt6_info *rt = ip6_rt_copy(ort);
523 if (rt) {
524 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
525 rt->rt6i_dst.plen = 128;
526 rt->rt6i_flags |= RTF_CACHE;
527 if (rt->rt6i_flags & RTF_REJECT)
528 rt->u.dst.error = ort->u.dst.error;
529 rt->u.dst.flags |= DST_HOST;
530 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
531 }
532 return rt;
533}
534
/*
 * Backtracking step shared by ip6_route_input() and ip6_route_output().
 *
 * Uses the caller's local variables "rt" and "fn" and its labels "out"
 * and "restart".  When the selection yielded ip6_null_entry, climb
 * towards the root: jump to "restart" at the first ancestor carrying
 * route information, or to "out" on reaching a root node.  If the parent
 * chain ends first, execution simply continues after the macro.
 *
 * Wrapped in do { } while (0) so that "BACKTRACK();" is a single
 * statement and cannot capture a following else.
 */
#define BACKTRACK() \
do { \
	if (rt == &ip6_null_entry) { \
		while ((fn = fn->parent) != NULL) { \
			if (fn->fn_flags & RTN_ROOT) \
				goto out; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
545
546
/*
 * Attach a route to the received packet @skb (stored in skb->dst).
 *
 * The destination/source addresses of the IPv6 header are looked up under
 * the read side of rt6_lock and rt6_select() picks among the candidates
 * for the receiving interface.  Multicast and link-local destinations
 * require the interface to match (RT6_SELECT_F_IFACE).
 *
 * The first pass also demands reachability (RT6_SELECT_F_REACHABLE); the
 * first arrival at "out" drops that requirement and repeats the lookup.
 *
 * A selected route that is neither the null entry nor a cache entry is
 * turned into a per-destination cache entry with rt6_alloc_cow() when it
 * has no nexthop and is not RTF_NONEXTHOP; otherwise, with
 * CLONE_OFFLINK_ROUTE == 0, it is used directly.  Inserting the new entry
 * happens without rt6_lock held, so on failure the lookup is retried, up
 * to "attempts" times in total.
 *
 * On every exit path rt holds a reference that is handed to skb->dst.
 */
547void ip6_route_input(struct sk_buff *skb)
548{
549 struct fib6_node *fn;
519fbd87 550 struct rt6_info *rt, *nrt;
1da177e4
LT
551 int strict;
552 int attempts = 3;
519fbd87 553 int err;
8238dd06 554 int reachable = RT6_SELECT_F_REACHABLE;
1da177e4 555
118f8c16 556 strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
1da177e4
LT
557
558relookup:
559 read_lock_bh(&rt6_lock);
560
8238dd06 561restart_2:
1da177e4
LT
562 fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
563 &skb->nh.ipv6h->saddr);
564
565restart:
8238dd06 566 rt = rt6_select(&fn->leaf, skb->dev->ifindex, strict | reachable);
/* May jump to "restart" (ancestor with routes) or "out" (root reached). */
1da177e4 567 BACKTRACK();
8238dd06
YH
568 if (rt == &ip6_null_entry ||
569 rt->rt6i_flags & RTF_CACHE)
1ddef044 570 goto out;
1da177e4 571
/* Pin the route before dropping the lock; the copy is made unlocked. */
fb9de91e
YH
572 dst_hold(&rt->u.dst);
573 read_unlock_bh(&rt6_lock);
574
519fbd87
YH
575 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
576 nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
577 else {
578#if CLONE_OFFLINK_ROUTE
579 nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
580#else
581 goto out2;
582#endif
583 }
e40cf353 584
/* Switch from the original route to the copy (or the null entry if the copy failed). */
519fbd87
YH
585 dst_release(&rt->u.dst);
586 rt = nrt ? : &ip6_null_entry;
1da177e4 587
519fbd87
YH
588 dst_hold(&rt->u.dst);
589 if (nrt) {
590 err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
591 if (!err)
1da177e4 592 goto out2;
1da177e4 593 }
1da177e4 594
519fbd87
YH
595 if (--attempts <= 0)
596 goto out2;
597
598 /*
599 * Race condition! In the gap, when rt6_lock was
600 * released someone could insert this route. Relookup.
601 */
602 dst_release(&rt->u.dst);
603 goto relookup;
604
605out:
/* Reached with rt6_lock still held for reading. */
8238dd06
YH
606 if (reachable) {
607 reachable = 0;
608 goto restart_2;
609 }
519fbd87
YH
610 dst_hold(&rt->u.dst);
611 read_unlock_bh(&rt6_lock);
1da177e4
LT
612out2:
613 rt->u.dst.lastuse = jiffies;
614 rt->u.dst.__use++;
615 skb->dst = (struct dst_entry *) rt;
fb9de91e 616 return;
1da177e4
LT
617}
618
/*
 * Find the route for an outgoing flow @fl and return its dst entry with
 * a reference held.  @sk is not used in this function.
 *
 * The flow's destination/source addresses are looked up under the read
 * side of rt6_lock and rt6_select() picks among the candidates for
 * fl->oif.  Multicast and link-local destinations require the interface
 * to match (RT6_SELECT_F_IFACE).
 *
 * The first pass also demands reachability (RT6_SELECT_F_REACHABLE); the
 * first arrival at "out" drops that requirement and repeats the lookup.
 *
 * A selected route that is neither the null entry nor a cache entry is
 * turned into a per-destination cache entry with rt6_alloc_cow() when it
 * has no nexthop and is not RTF_NONEXTHOP; otherwise, with
 * CLONE_OFFLINK_ROUTE == 0, it is used directly.  Inserting the new entry
 * happens without rt6_lock held, so on failure the lookup is retried, up
 * to "attempts" times in total.
 */
619struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
620{
621 struct fib6_node *fn;
519fbd87 622 struct rt6_info *rt, *nrt;
1da177e4
LT
623 int strict;
624 int attempts = 3;
519fbd87 625 int err;
8238dd06 626 int reachable = RT6_SELECT_F_REACHABLE;
1da177e4 627
554cfb7e 628 strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
1da177e4
LT
629
630relookup:
631 read_lock_bh(&rt6_lock);
632
8238dd06 633restart_2:
1da177e4
LT
634 fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
635
636restart:
8238dd06 637 rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
/* May jump to "restart" (ancestor with routes) or "out" (root reached). */
1ddef044 638 BACKTRACK();
8238dd06
YH
639 if (rt == &ip6_null_entry ||
640 rt->rt6i_flags & RTF_CACHE)
1da177e4 641 goto out;
1da177e4 642
/* Pin the route before dropping the lock; the copy is made unlocked. */
fb9de91e
YH
643 dst_hold(&rt->u.dst);
644 read_unlock_bh(&rt6_lock);
645
519fbd87 646 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
e40cf353 647 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
519fbd87
YH
648 else {
649#if CLONE_OFFLINK_ROUTE
650 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
651#else
652 goto out2;
653#endif
654 }
1da177e4 655
/* Switch from the original route to the copy (or the null entry if the copy failed). */
519fbd87
YH
656 dst_release(&rt->u.dst);
657 rt = nrt ? : &ip6_null_entry;
1da177e4 658
519fbd87
YH
659 dst_hold(&rt->u.dst);
660 if (nrt) {
661 err = ip6_ins_rt(nrt, NULL, NULL, NULL);
662 if (!err)
1da177e4 663 goto out2;
1da177e4 664 }
e40cf353 665
519fbd87
YH
666 if (--attempts <= 0)
667 goto out2;
668
669 /*
670 * Race condition! In the gap, when rt6_lock was
671 * released someone could insert this route. Relookup.
672 */
673 dst_release(&rt->u.dst);
674 goto relookup;
675
676out:
/* Reached with rt6_lock still held for reading. */
8238dd06
YH
677 if (reachable) {
678 reachable = 0;
679 goto restart_2;
680 }
519fbd87
YH
681 dst_hold(&rt->u.dst);
682 read_unlock_bh(&rt6_lock);
1da177e4
LT
683out2:
684 rt->u.dst.lastuse = jiffies;
685 rt->u.dst.__use++;
686 return &rt->u.dst;
687}
688
689
690/*
691 * Destination cache support functions
692 */
693
694static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
695{
696 struct rt6_info *rt;
697
698 rt = (struct rt6_info *) dst;
699
700 if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
701 return dst;
702
703 return NULL;
704}
705
706static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
707{
708 struct rt6_info *rt = (struct rt6_info *) dst;
709
710 if (rt) {
711 if (rt->rt6i_flags & RTF_CACHE)
0d51aa80 712 ip6_del_rt(rt, NULL, NULL, NULL);
1da177e4
LT
713 else
714 dst_release(dst);
715 }
716 return NULL;
717}
718
719static void ip6_link_failure(struct sk_buff *skb)
720{
721 struct rt6_info *rt;
722
723 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
724
725 rt = (struct rt6_info *) skb->dst;
726 if (rt) {
727 if (rt->rt6i_flags&RTF_CACHE) {
728 dst_set_expires(&rt->u.dst, 0);
729 rt->rt6i_flags |= RTF_EXPIRES;
730 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
731 rt->rt6i_node->fn_sernum = -1;
732 }
733}
734
735static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
736{
737 struct rt6_info *rt6 = (struct rt6_info*)dst;
738
739 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
740 rt6->rt6i_flags |= RTF_MODIFIED;
741 if (mtu < IPV6_MIN_MTU) {
742 mtu = IPV6_MIN_MTU;
743 dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
744 }
745 dst->metrics[RTAX_MTU-1] = mtu;
8d71740c 746 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1da177e4
LT
747 }
748}
749
1da177e4
LT
750static int ipv6_get_mtu(struct net_device *dev);
751
752static inline unsigned int ipv6_advmss(unsigned int mtu)
753{
754 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
755
756 if (mtu < ip6_rt_min_advmss)
757 mtu = ip6_rt_min_advmss;
758
759 /*
760 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
761 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
762 * IPV6_MAXPLEN is also valid and means: "any MSS,
763 * rely only on pmtu discovery"
764 */
765 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
766 mtu = IPV6_MAXPLEN;
767 return mtu;
768}
769
5d0bbeeb
TG
/*
 * Singly linked list (through dst->next) of the entries created by
 * ndisc_dst_alloc(); ndisc_dst_gc() frees those whose refcount is zero.
 */
770static struct dst_entry *ndisc_dst_gc_list;
/* Serialises every access to ndisc_dst_gc_list. */
771DEFINE_SPINLOCK(ndisc_lock);
772
1da177e4
LT
773struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
774 struct neighbour *neigh,
775 struct in6_addr *addr,
776 int (*output)(struct sk_buff *))
777{
778 struct rt6_info *rt;
779 struct inet6_dev *idev = in6_dev_get(dev);
780
781 if (unlikely(idev == NULL))
782 return NULL;
783
784 rt = ip6_dst_alloc();
785 if (unlikely(rt == NULL)) {
786 in6_dev_put(idev);
787 goto out;
788 }
789
790 dev_hold(dev);
791 if (neigh)
792 neigh_hold(neigh);
793 else
794 neigh = ndisc_get_neigh(dev, addr);
795
796 rt->rt6i_dev = dev;
797 rt->rt6i_idev = idev;
798 rt->rt6i_nexthop = neigh;
799 atomic_set(&rt->u.dst.__refcnt, 1);
800 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
801 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
802 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
803 rt->u.dst.output = output;
804
805#if 0 /* there's no chance to use these for ndisc */
806 rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
807 ? DST_HOST
808 : 0;
809 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
810 rt->rt6i_dst.plen = 128;
811#endif
812
5d0bbeeb 813 spin_lock_bh(&ndisc_lock);
1da177e4
LT
814 rt->u.dst.next = ndisc_dst_gc_list;
815 ndisc_dst_gc_list = &rt->u.dst;
5d0bbeeb 816 spin_unlock_bh(&ndisc_lock);
1da177e4
LT
817
818 fib6_force_start_gc();
819
820out:
821 return (struct dst_entry *)rt;
822}
823
824int ndisc_dst_gc(int *more)
825{
826 struct dst_entry *dst, *next, **pprev;
827 int freed;
828
829 next = NULL;
5d0bbeeb
TG
830 freed = 0;
831
832 spin_lock_bh(&ndisc_lock);
1da177e4 833 pprev = &ndisc_dst_gc_list;
5d0bbeeb 834
1da177e4
LT
835 while ((dst = *pprev) != NULL) {
836 if (!atomic_read(&dst->__refcnt)) {
837 *pprev = dst->next;
838 dst_free(dst);
839 freed++;
840 } else {
841 pprev = &dst->next;
842 (*more)++;
843 }
844 }
845
5d0bbeeb
TG
846 spin_unlock_bh(&ndisc_lock);
847
1da177e4
LT
848 return freed;
849}
850
851static int ip6_dst_gc(void)
852{
853 static unsigned expire = 30*HZ;
854 static unsigned long last_gc;
855 unsigned long now = jiffies;
856
857 if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
858 atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
859 goto out;
860
861 expire++;
862 fib6_run_gc(expire);
863 last_gc = now;
864 if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
865 expire = ip6_rt_gc_timeout>>1;
866
867out:
868 expire -= expire>>ip6_rt_gc_elasticity;
869 return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
870}
871
872/* Clean host part of a prefix. Not necessary in radix tree,
873 but results in cleaner routing tables.
874
875 Remove it only when all the things will work!
876 */
877
878static int ipv6_get_mtu(struct net_device *dev)
879{
880 int mtu = IPV6_MIN_MTU;
881 struct inet6_dev *idev;
882
883 idev = in6_dev_get(dev);
884 if (idev) {
885 mtu = idev->cnf.mtu6;
886 in6_dev_put(idev);
887 }
888 return mtu;
889}
890
891int ipv6_get_hoplimit(struct net_device *dev)
892{
893 int hoplimit = ipv6_devconf.hop_limit;
894 struct inet6_dev *idev;
895
896 idev = in6_dev_get(dev);
897 if (idev) {
898 hoplimit = idev->cnf.hop_limit;
899 in6_dev_put(idev);
900 }
901 return hoplimit;
902}
903
904/*
905 *
906 */
907
0d51aa80
JHS
/*
 * Create a route from the request in @rtmsg and insert it with
 * ip6_ins_rt().
 *
 * @nlh, @_rtattr and @req are passed on to ip6_ins_rt(); @nlh also
 * supplies the route protocol, and @_rtattr (an array of rtattr pointers)
 * may carry RTA_METRICS.
 *
 * Prefix lengths above 128 are rejected, as is any source prefix when
 * CONFIG_IPV6_SUBTREES is off.  A metric of 0 is replaced by
 * IP6_RT_PRIO_USER (written back into @rtmsg).
 *
 * RTF_REJECT requests, and routes through a loopback device to a
 * non-loopback address, become reject routes bound to loopback_dev.
 * A gateway that is not a link-local unicast address must be unicast and
 * must itself resolve, via rt6_lookup(), to a non-gateway route on the
 * same device.
 *
 * On success the device and inet6_dev references are stored in the route.
 * On failure they are dropped and the route is freed at "out".
 *
 * Returns the ip6_ins_rt() result, or a negative errno.
 */
908int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
909 void *_rtattr, struct netlink_skb_parms *req)
1da177e4
LT
910{
911 int err;
912 struct rtmsg *r;
913 struct rtattr **rta;
914 struct rt6_info *rt = NULL;
915 struct net_device *dev = NULL;
916 struct inet6_dev *idev = NULL;
917 int addr_type;
918
919 rta = (struct rtattr **) _rtattr;
920
921 if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
922 return -EINVAL;
923#ifndef CONFIG_IPV6_SUBTREES
924 if (rtmsg->rtmsg_src_len)
925 return -EINVAL;
926#endif
/* Takes a reference on both the device and its inet6_dev. */
927 if (rtmsg->rtmsg_ifindex) {
928 err = -ENODEV;
929 dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
930 if (!dev)
931 goto out;
932 idev = in6_dev_get(dev);
933 if (!idev)
934 goto out;
935 }
936
937 if (rtmsg->rtmsg_metric == 0)
938 rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
939
940 rt = ip6_dst_alloc();
941
942 if (rt == NULL) {
943 err = -ENOMEM;
944 goto out;
945 }
946
947 rt->u.dst.obsolete = -1;
3dd4bc68 948 rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
1da177e4
LT
949 if (nlh && (r = NLMSG_DATA(nlh))) {
950 rt->rt6i_protocol = r->rtm_protocol;
951 } else {
952 rt->rt6i_protocol = RTPROT_BOOT;
953 }
954
955 addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
956
957 if (addr_type & IPV6_ADDR_MULTICAST)
958 rt->u.dst.input = ip6_mc_input;
959 else
960 rt->u.dst.input = ip6_forward;
961
962 rt->u.dst.output = ip6_output;
963
964 ipv6_addr_prefix(&rt->rt6i_dst.addr,
965 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
966 rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
967 if (rt->rt6i_dst.plen == 128)
968 rt->u.dst.flags = DST_HOST;
969
970#ifdef CONFIG_IPV6_SUBTREES
971 ipv6_addr_prefix(&rt->rt6i_src.addr,
972 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
973 rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
974#endif
975
976 rt->rt6i_metric = rtmsg->rtmsg_metric;
977
978 /* We cannot add true routes via loopback here,
979 they would result in kernel looping; promote them to reject routes
980 */
981 if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
982 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
983 /* hold loopback dev/idev if we haven't done so. */
984 if (dev != &loopback_dev) {
985 if (dev) {
986 dev_put(dev);
987 in6_dev_put(idev);
988 }
989 dev = &loopback_dev;
990 dev_hold(dev);
991 idev = in6_dev_get(dev);
992 if (!idev) {
993 err = -ENODEV;
994 goto out;
995 }
996 }
997 rt->u.dst.output = ip6_pkt_discard_out;
998 rt->u.dst.input = ip6_pkt_discard;
999 rt->u.dst.error = -ENETUNREACH;
1000 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1001 goto install_route;
1002 }
1003
1004 if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
1005 struct in6_addr *gw_addr;
1006 int gwa_type;
1007
1008 gw_addr = &rtmsg->rtmsg_gateway;
1009 ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
1010 gwa_type = ipv6_addr_type(gw_addr);
1011
1012 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1013 struct rt6_info *grt;
1014
1015 /* IPv6 strictly inhibits using not link-local
1016 addresses as nexthop address.
1017 Otherwise, router will not able to send redirects.
1018 It is very good, but in some (rare!) circumstances
1019 (SIT, PtP, NBMA NOARP links) it is handy to allow
1020 some exceptions. --ANK
1021 */
1022 err = -EINVAL;
1023 if (!(gwa_type&IPV6_ADDR_UNICAST))
1024 goto out;
1025
1026 grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
1027
/* err stays -EHOSTUNREACH unless the gateway's own route is not a gateway route. */
1028 err = -EHOSTUNREACH;
1029 if (grt == NULL)
1030 goto out;
1031 if (dev) {
1032 if (dev != grt->rt6i_dev) {
1033 dst_release(&grt->u.dst);
1034 goto out;
1035 }
1036 } else {
/* No interface was given: adopt the one the gateway is reached through. */
1037 dev = grt->rt6i_dev;
1038 idev = grt->rt6i_idev;
1039 dev_hold(dev);
1040 in6_dev_hold(grt->rt6i_idev);
1041 }
1042 if (!(grt->rt6i_flags&RTF_GATEWAY))
1043 err = 0;
1044 dst_release(&grt->u.dst);
1045
1046 if (err)
1047 goto out;
1048 }
1049 err = -EINVAL;
1050 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1051 goto out;
1052 }
1053
1054 err = -ENODEV;
1055 if (dev == NULL)
1056 goto out;
1057
1058 if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
1059 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1060 if (IS_ERR(rt->rt6i_nexthop)) {
1061 err = PTR_ERR(rt->rt6i_nexthop);
1062 rt->rt6i_nexthop = NULL;
1063 goto out;
1064 }
1065 }
1066
1067 rt->rt6i_flags = rtmsg->rtmsg_flags;
1068
1069install_route:
/* Copy user-supplied metrics; attribute type N is stored in metrics[N-1]. */
1070 if (rta && rta[RTA_METRICS-1]) {
1071 int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
1072 struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
1073
1074 while (RTA_OK(attr, attrlen)) {
1075 unsigned flavor = attr->rta_type;
1076 if (flavor) {
1077 if (flavor > RTAX_MAX) {
1078 err = -EINVAL;
1079 goto out;
1080 }
1081 rt->u.dst.metrics[flavor-1] =
1082 *(u32 *)RTA_DATA(attr);
1083 }
1084 attr = RTA_NEXT(attr, attrlen);
1085 }
1086 }
1087
/* Fill in defaults for metrics the request left at zero. */
1088 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1089 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1090 if (!rt->u.dst.metrics[RTAX_MTU-1])
1091 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1092 if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1093 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1094 rt->u.dst.dev = dev;
1095 rt->rt6i_idev = idev;
0d51aa80 1096 return ip6_ins_rt(rt, nlh, _rtattr, req);
1da177e4
LT
1097
1098out:
1099 if (dev)
1100 dev_put(dev);
1101 if (idev)
1102 in6_dev_put(idev);
1103 if (rt)
1104 dst_free((struct dst_entry *) rt);
1105 return err;
1106}
1107
0d51aa80 1108int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1da177e4
LT
1109{
1110 int err;
1111
1112 write_lock_bh(&rt6_lock);
1113
0d51aa80 1114 err = fib6_del(rt, nlh, _rtattr, req);
1da177e4
LT
1115 dst_release(&rt->u.dst);
1116
1117 write_unlock_bh(&rt6_lock);
1118
1119 return err;
1120}
1121
0d51aa80 1122static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1da177e4
LT
1123{
1124 struct fib6_node *fn;
1125 struct rt6_info *rt;
1126 int err = -ESRCH;
1127
1128 read_lock_bh(&rt6_lock);
1129
1130 fn = fib6_locate(&ip6_routing_table,
1131 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1132 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1133
1134 if (fn) {
1135 for (rt = fn->leaf; rt; rt = rt->u.next) {
1136 if (rtmsg->rtmsg_ifindex &&
1137 (rt->rt6i_dev == NULL ||
1138 rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1139 continue;
1140 if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1141 !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1142 continue;
1143 if (rtmsg->rtmsg_metric &&
1144 rtmsg->rtmsg_metric != rt->rt6i_metric)
1145 continue;
1146 dst_hold(&rt->u.dst);
1147 read_unlock_bh(&rt6_lock);
1148
0d51aa80 1149 return ip6_del_rt(rt, nlh, _rtattr, req);
1da177e4
LT
1150 }
1151 }
1152 read_unlock_bh(&rt6_lock);
1153
1154 return err;
1155}
1156
1157/*
1158 * Handle redirects
1159 */
1160void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1161 struct neighbour *neigh, u8 *lladdr, int on_link)
1162{
e843b9e1
YH
1163 struct rt6_info *rt, *nrt = NULL;
1164 int strict;
1165 struct fib6_node *fn;
8d71740c 1166 struct netevent_redirect netevent;
1da177e4
LT
1167
1168 /*
e843b9e1
YH
1169 * Get the "current" route for this destination and
1170 * check if the redirect has come from approriate router.
1171 *
1172 * RFC 2461 specifies that redirects should only be
1173 * accepted if they come from the nexthop to the target.
1174 * Due to the way the routes are chosen, this notion
1175 * is a bit fuzzy and one might need to check all possible
1176 * routes.
1da177e4 1177 */
e843b9e1 1178 strict = ipv6_addr_type(dest) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL);
1da177e4 1179
e843b9e1
YH
1180 read_lock_bh(&rt6_lock);
1181 fn = fib6_lookup(&ip6_routing_table, dest, NULL);
1182restart:
1183 for (rt = fn->leaf; rt; rt = rt->u.next) {
1184 /*
1185 * Current route is on-link; redirect is always invalid.
1186 *
1187 * Seems, previous statement is not true. It could
1188 * be node, which looks for us as on-link (f.e. proxy ndisc)
1189 * But then router serving it might decide, that we should
1190 * know truth 8)8) --ANK (980726).
1191 */
1192 if (rt6_check_expired(rt))
1193 continue;
1194 if (!(rt->rt6i_flags & RTF_GATEWAY))
1195 continue;
1196 if (neigh->dev != rt->rt6i_dev)
1197 continue;
1198 if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway))
1199 continue;
1200 break;
1201 }
1202 if (rt)
1203 dst_hold(&rt->u.dst);
1204 else if (strict) {
1205 while ((fn = fn->parent) != NULL) {
1206 if (fn->fn_flags & RTN_ROOT)
1207 break;
1208 if (fn->fn_flags & RTN_RTINFO)
1209 goto restart;
1da177e4 1210 }
e843b9e1
YH
1211 }
1212 read_unlock_bh(&rt6_lock);
1213
1214 if (!rt) {
1da177e4
LT
1215 if (net_ratelimit())
1216 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1217 "for redirect target\n");
e843b9e1 1218 return;
1da177e4
LT
1219 }
1220
1da177e4
LT
1221 /*
1222 * We have finally decided to accept it.
1223 */
1224
1225 neigh_update(neigh, lladdr, NUD_STALE,
1226 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1227 NEIGH_UPDATE_F_OVERRIDE|
1228 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1229 NEIGH_UPDATE_F_ISROUTER))
1230 );
1231
1232 /*
1233 * Redirect received -> path was valid.
1234 * Look, redirects are sent only in response to data packets,
1235 * so that this nexthop apparently is reachable. --ANK
1236 */
1237 dst_confirm(&rt->u.dst);
1238
1239 /* Duplicate redirect: silently ignore. */
1240 if (neigh == rt->u.dst.neighbour)
1241 goto out;
1242
1243 nrt = ip6_rt_copy(rt);
1244 if (nrt == NULL)
1245 goto out;
1246
1247 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1248 if (on_link)
1249 nrt->rt6i_flags &= ~RTF_GATEWAY;
1250
1251 ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1252 nrt->rt6i_dst.plen = 128;
1253 nrt->u.dst.flags |= DST_HOST;
1254
1255 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1256 nrt->rt6i_nexthop = neigh_clone(neigh);
1257 /* Reset pmtu, it may be better */
1258 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1259 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1260
0d51aa80 1261 if (ip6_ins_rt(nrt, NULL, NULL, NULL))
1da177e4
LT
1262 goto out;
1263
8d71740c
TT
1264 netevent.old = &rt->u.dst;
1265 netevent.new = &nrt->u.dst;
1266 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1267
1da177e4 1268 if (rt->rt6i_flags&RTF_CACHE) {
0d51aa80 1269 ip6_del_rt(rt, NULL, NULL, NULL);
1da177e4
LT
1270 return;
1271 }
1272
1273out:
1274 dst_release(&rt->u.dst);
1275 return;
1276}
1277
1278/*
1279 * Handle ICMP "packet too big" messages
1280 * i.e. Path MTU discovery
1281 */
1282
1283void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1284 struct net_device *dev, u32 pmtu)
1285{
1286 struct rt6_info *rt, *nrt;
1287 int allfrag = 0;
1288
1289 rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1290 if (rt == NULL)
1291 return;
1292
1293 if (pmtu >= dst_mtu(&rt->u.dst))
1294 goto out;
1295
1296 if (pmtu < IPV6_MIN_MTU) {
1297 /*
1298 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1299 * MTU (1280) and a fragment header should always be included
1300 * after a node receiving Too Big message reporting PMTU is
1301 * less than the IPv6 Minimum Link MTU.
1302 */
1303 pmtu = IPV6_MIN_MTU;
1304 allfrag = 1;
1305 }
1306
1307 /* New mtu received -> path was valid.
1308 They are sent only in response to data packets,
1309 so that this nexthop apparently is reachable. --ANK
1310 */
1311 dst_confirm(&rt->u.dst);
1312
1313 /* Host route. If it is static, it would be better
1314 not to override it, but add new one, so that
1315 when cache entry will expire old pmtu
1316 would return automatically.
1317 */
1318 if (rt->rt6i_flags & RTF_CACHE) {
1319 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1320 if (allfrag)
1321 rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1322 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1323 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1324 goto out;
1325 }
1326
1327 /* Network route.
1328 Two cases are possible:
1329 1. It is connected route. Action: COW
1330 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1331 */
d5315b50 1332 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
a1e78363 1333 nrt = rt6_alloc_cow(rt, daddr, saddr);
d5315b50
YH
1334 else
1335 nrt = rt6_alloc_clone(rt, daddr);
a1e78363 1336
d5315b50 1337 if (nrt) {
a1e78363
YH
1338 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1339 if (allfrag)
1340 nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1341
1342 /* According to RFC 1981, detecting PMTU increase shouldn't be
1343 * happened within 5 mins, the recommended timer is 10 mins.
1344 * Here this route expiration time is set to ip6_rt_mtu_expires
1345 * which is 10 mins. After 10 mins the decreased pmtu is expired
1346 * and detecting PMTU increase will be automatically happened.
1347 */
1348 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1349 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1350
1351 ip6_ins_rt(nrt, NULL, NULL, NULL);
1da177e4 1352 }
1da177e4
LT
1353out:
1354 dst_release(&rt->u.dst);
1355}
1356
1357/*
1358 * Misc support functions
1359 */
1360
1361static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1362{
1363 struct rt6_info *rt = ip6_dst_alloc();
1364
1365 if (rt) {
1366 rt->u.dst.input = ort->u.dst.input;
1367 rt->u.dst.output = ort->u.dst.output;
1368
1369 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1370 rt->u.dst.dev = ort->u.dst.dev;
1371 if (rt->u.dst.dev)
1372 dev_hold(rt->u.dst.dev);
1373 rt->rt6i_idev = ort->rt6i_idev;
1374 if (rt->rt6i_idev)
1375 in6_dev_hold(rt->rt6i_idev);
1376 rt->u.dst.lastuse = jiffies;
1377 rt->rt6i_expires = 0;
1378
1379 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1380 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1381 rt->rt6i_metric = 0;
1382
1383 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1384#ifdef CONFIG_IPV6_SUBTREES
1385 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1386#endif
1387 }
1388 return rt;
1389}
1390
70ceb4f5
YH
1391#ifdef CONFIG_IPV6_ROUTE_INFO
1392static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1393 struct in6_addr *gwaddr, int ifindex)
1394{
1395 struct fib6_node *fn;
1396 struct rt6_info *rt = NULL;
1397
1398 write_lock_bh(&rt6_lock);
1399 fn = fib6_locate(&ip6_routing_table, prefix ,prefixlen, NULL, 0);
1400 if (!fn)
1401 goto out;
1402
1403 for (rt = fn->leaf; rt; rt = rt->u.next) {
1404 if (rt->rt6i_dev->ifindex != ifindex)
1405 continue;
1406 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1407 continue;
1408 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1409 continue;
1410 dst_hold(&rt->u.dst);
1411 break;
1412 }
1413out:
1414 write_unlock_bh(&rt6_lock);
1415 return rt;
1416}
1417
1418static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1419 struct in6_addr *gwaddr, int ifindex,
1420 unsigned pref)
1421{
1422 struct in6_rtmsg rtmsg;
1423
1424 memset(&rtmsg, 0, sizeof(rtmsg));
1425 rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1426 ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1427 rtmsg.rtmsg_dst_len = prefixlen;
1428 ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1429 rtmsg.rtmsg_metric = 1024;
1430 rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
e317da96
YH
1431 /* We should treat it as a default route if prefix length is 0. */
1432 if (!prefixlen)
1433 rtmsg.rtmsg_flags |= RTF_DEFAULT;
70ceb4f5
YH
1434 rtmsg.rtmsg_ifindex = ifindex;
1435
1436 ip6_route_add(&rtmsg, NULL, NULL, NULL);
1437
1438 return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1439}
1440#endif
1441
1da177e4
LT
1442struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1443{
1444 struct rt6_info *rt;
1445 struct fib6_node *fn;
1446
1447 fn = &ip6_routing_table;
1448
1449 write_lock_bh(&rt6_lock);
1450 for (rt = fn->leaf; rt; rt=rt->u.next) {
1451 if (dev == rt->rt6i_dev &&
045927ff 1452 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1da177e4
LT
1453 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1454 break;
1455 }
1456 if (rt)
1457 dst_hold(&rt->u.dst);
1458 write_unlock_bh(&rt6_lock);
1459 return rt;
1460}
1461
1462struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
ebacaaa0
YH
1463 struct net_device *dev,
1464 unsigned int pref)
1da177e4
LT
1465{
1466 struct in6_rtmsg rtmsg;
1467
1468 memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1469 rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1470 ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1471 rtmsg.rtmsg_metric = 1024;
ebacaaa0
YH
1472 rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1473 RTF_PREF(pref);
1da177e4
LT
1474
1475 rtmsg.rtmsg_ifindex = dev->ifindex;
1476
0d51aa80 1477 ip6_route_add(&rtmsg, NULL, NULL, NULL);
1da177e4
LT
1478 return rt6_get_dflt_router(gwaddr, dev);
1479}
1480
1481void rt6_purge_dflt_routers(void)
1482{
1483 struct rt6_info *rt;
1484
1485restart:
1486 read_lock_bh(&rt6_lock);
1487 for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1488 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1489 dst_hold(&rt->u.dst);
1490
1da177e4
LT
1491 read_unlock_bh(&rt6_lock);
1492
0d51aa80 1493 ip6_del_rt(rt, NULL, NULL, NULL);
1da177e4
LT
1494
1495 goto restart;
1496 }
1497 }
1498 read_unlock_bh(&rt6_lock);
1499}
1500
1501int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1502{
1503 struct in6_rtmsg rtmsg;
1504 int err;
1505
1506 switch(cmd) {
1507 case SIOCADDRT: /* Add a route */
1508 case SIOCDELRT: /* Delete a route */
1509 if (!capable(CAP_NET_ADMIN))
1510 return -EPERM;
1511 err = copy_from_user(&rtmsg, arg,
1512 sizeof(struct in6_rtmsg));
1513 if (err)
1514 return -EFAULT;
1515
1516 rtnl_lock();
1517 switch (cmd) {
1518 case SIOCADDRT:
0d51aa80 1519 err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
1da177e4
LT
1520 break;
1521 case SIOCDELRT:
0d51aa80 1522 err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
1da177e4
LT
1523 break;
1524 default:
1525 err = -EINVAL;
1526 }
1527 rtnl_unlock();
1528
1529 return err;
1530 };
1531
1532 return -EINVAL;
1533}
1534
1535/*
1536 * Drop the packet on the floor
1537 */
1538
20380731 1539static int ip6_pkt_discard(struct sk_buff *skb)
1da177e4 1540{
76d0cc1b
LL
1541 int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1542 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1543 IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1544
1da177e4
LT
1545 IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1546 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1547 kfree_skb(skb);
1548 return 0;
1549}
1550
20380731 1551static int ip6_pkt_discard_out(struct sk_buff *skb)
1da177e4
LT
1552{
1553 skb->dev = skb->dst->dev;
1554 return ip6_pkt_discard(skb);
1555}
1556
1557/*
1558 * Allocate a dst for local (unicast / anycast) address.
1559 */
1560
1561struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1562 const struct in6_addr *addr,
1563 int anycast)
1564{
1565 struct rt6_info *rt = ip6_dst_alloc();
1566
1567 if (rt == NULL)
1568 return ERR_PTR(-ENOMEM);
1569
1570 dev_hold(&loopback_dev);
1571 in6_dev_hold(idev);
1572
1573 rt->u.dst.flags = DST_HOST;
1574 rt->u.dst.input = ip6_input;
1575 rt->u.dst.output = ip6_output;
1576 rt->rt6i_dev = &loopback_dev;
1577 rt->rt6i_idev = idev;
1578 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1579 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1580 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1581 rt->u.dst.obsolete = -1;
1582
1583 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
58c4fb86
YH
1584 if (anycast)
1585 rt->rt6i_flags |= RTF_ANYCAST;
1586 else
1da177e4
LT
1587 rt->rt6i_flags |= RTF_LOCAL;
1588 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1589 if (rt->rt6i_nexthop == NULL) {
1590 dst_free((struct dst_entry *) rt);
1591 return ERR_PTR(-ENOMEM);
1592 }
1593
1594 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1595 rt->rt6i_dst.plen = 128;
1596
1597 atomic_set(&rt->u.dst.__refcnt, 1);
1598
1599 return rt;
1600}
1601
1602static int fib6_ifdown(struct rt6_info *rt, void *arg)
1603{
1604 if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1605 rt != &ip6_null_entry) {
1606 RT6_TRACE("deleted by ifdown %p\n", rt);
1607 return -1;
1608 }
1609 return 0;
1610}
1611
1612void rt6_ifdown(struct net_device *dev)
1613{
1614 write_lock_bh(&rt6_lock);
1615 fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1616 write_unlock_bh(&rt6_lock);
1617}
1618
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg
{
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* the new MTU of that device */
};
1624
1625static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1626{
1627 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1628 struct inet6_dev *idev;
1629
1630 /* In IPv6 pmtu discovery is not optional,
1631 so that RTAX_MTU lock cannot disable it.
1632 We still use this lock to block changes
1633 caused by addrconf/ndisc.
1634 */
1635
1636 idev = __in6_dev_get(arg->dev);
1637 if (idev == NULL)
1638 return 0;
1639
1640 /* For administrative MTU increase, there is no way to discover
1641 IPv6 PMTU increase, so PMTU increase should be updated here.
1642 Since RFC 1981 doesn't include administrative MTU increase
1643 update PMTU increase is a MUST. (i.e. jumbo frame)
1644 */
1645 /*
1646 If new MTU is less than route PMTU, this new MTU will be the
1647 lowest MTU in the path, update the route PMTU to reflect PMTU
1648 decreases; if new MTU is greater than route PMTU, and the
1649 old MTU is the lowest MTU in the path, update the route PMTU
1650 to reflect the increase. In this case if the other nodes' MTU
1651 also have the lowest MTU, TOO BIG MESSAGE will be lead to
1652 PMTU discouvery.
1653 */
1654 if (rt->rt6i_dev == arg->dev &&
1655 !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1656 (dst_mtu(&rt->u.dst) > arg->mtu ||
1657 (dst_mtu(&rt->u.dst) < arg->mtu &&
1658 dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1659 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1660 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1661 return 0;
1662}
1663
1664void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1665{
1666 struct rt6_mtu_change_arg arg;
1667
1668 arg.dev = dev;
1669 arg.mtu = mtu;
1670 read_lock_bh(&rt6_lock);
1671 fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1672 read_unlock_bh(&rt6_lock);
1673}
1674
1675static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1676 struct in6_rtmsg *rtmsg)
1677{
1678 memset(rtmsg, 0, sizeof(*rtmsg));
1679
1680 rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1681 rtmsg->rtmsg_src_len = r->rtm_src_len;
1682 rtmsg->rtmsg_flags = RTF_UP;
1683 if (r->rtm_type == RTN_UNREACHABLE)
1684 rtmsg->rtmsg_flags |= RTF_REJECT;
1685
1686 if (rta[RTA_GATEWAY-1]) {
1687 if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1688 return -EINVAL;
1689 memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1690 rtmsg->rtmsg_flags |= RTF_GATEWAY;
1691 }
1692 if (rta[RTA_DST-1]) {
1693 if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1694 return -EINVAL;
1695 memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1696 }
1697 if (rta[RTA_SRC-1]) {
1698 if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1699 return -EINVAL;
1700 memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1701 }
1702 if (rta[RTA_OIF-1]) {
1703 if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1704 return -EINVAL;
1705 memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1706 }
1707 if (rta[RTA_PRIORITY-1]) {
1708 if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1709 return -EINVAL;
1710 memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1711 }
1712 return 0;
1713}
1714
1715int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1716{
1717 struct rtmsg *r = NLMSG_DATA(nlh);
1718 struct in6_rtmsg rtmsg;
1719
1720 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1721 return -EINVAL;
0d51aa80 1722 return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1da177e4
LT
1723}
1724
1725int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1726{
1727 struct rtmsg *r = NLMSG_DATA(nlh);
1728 struct in6_rtmsg rtmsg;
1729
1730 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1731 return -EINVAL;
0d51aa80 1732 return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1da177e4
LT
1733}
1734
/* Per-call context of a route dump; see inet6_dump_fib() / rt6_dump_route(). */
struct rt6_rtnl_dump_arg
{
	struct sk_buff *skb;		/* buffer the route messages are written to */
	struct netlink_callback *cb;	/* netlink dump callback state */
};
1740
1741static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
0d51aa80
JHS
1742 struct in6_addr *dst, struct in6_addr *src,
1743 int iif, int type, u32 pid, u32 seq,
1744 int prefix, unsigned int flags)
1da177e4
LT
1745{
1746 struct rtmsg *rtm;
1747 struct nlmsghdr *nlh;
1748 unsigned char *b = skb->tail;
1749 struct rta_cacheinfo ci;
1750
1751 if (prefix) { /* user wants prefix routes only */
1752 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1753 /* success since this is not a prefix route */
1754 return 1;
1755 }
1756 }
1757
b6544c0b 1758 nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1da177e4
LT
1759 rtm = NLMSG_DATA(nlh);
1760 rtm->rtm_family = AF_INET6;
1761 rtm->rtm_dst_len = rt->rt6i_dst.plen;
1762 rtm->rtm_src_len = rt->rt6i_src.plen;
1763 rtm->rtm_tos = 0;
1764 rtm->rtm_table = RT_TABLE_MAIN;
1765 if (rt->rt6i_flags&RTF_REJECT)
1766 rtm->rtm_type = RTN_UNREACHABLE;
1767 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1768 rtm->rtm_type = RTN_LOCAL;
1769 else
1770 rtm->rtm_type = RTN_UNICAST;
1771 rtm->rtm_flags = 0;
1772 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1773 rtm->rtm_protocol = rt->rt6i_protocol;
1774 if (rt->rt6i_flags&RTF_DYNAMIC)
1775 rtm->rtm_protocol = RTPROT_REDIRECT;
1776 else if (rt->rt6i_flags & RTF_ADDRCONF)
1777 rtm->rtm_protocol = RTPROT_KERNEL;
1778 else if (rt->rt6i_flags&RTF_DEFAULT)
1779 rtm->rtm_protocol = RTPROT_RA;
1780
1781 if (rt->rt6i_flags&RTF_CACHE)
1782 rtm->rtm_flags |= RTM_F_CLONED;
1783
1784 if (dst) {
1785 RTA_PUT(skb, RTA_DST, 16, dst);
1786 rtm->rtm_dst_len = 128;
1787 } else if (rtm->rtm_dst_len)
1788 RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1789#ifdef CONFIG_IPV6_SUBTREES
1790 if (src) {
1791 RTA_PUT(skb, RTA_SRC, 16, src);
1792 rtm->rtm_src_len = 128;
1793 } else if (rtm->rtm_src_len)
1794 RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1795#endif
1796 if (iif)
1797 RTA_PUT(skb, RTA_IIF, 4, &iif);
1798 else if (dst) {
1799 struct in6_addr saddr_buf;
1800 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1801 RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1802 }
1803 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1804 goto rtattr_failure;
1805 if (rt->u.dst.neighbour)
1806 RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1807 if (rt->u.dst.dev)
1808 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1809 RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1810 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1811 if (rt->rt6i_expires)
1812 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1813 else
1814 ci.rta_expires = 0;
1815 ci.rta_used = rt->u.dst.__use;
1816 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1817 ci.rta_error = rt->u.dst.error;
1818 ci.rta_id = 0;
1819 ci.rta_ts = 0;
1820 ci.rta_tsage = 0;
1821 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1822 nlh->nlmsg_len = skb->tail - b;
1823 return skb->len;
1824
1825nlmsg_failure:
1826rtattr_failure:
1827 skb_trim(skb, b - skb->data);
1828 return -1;
1829}
1830
1831static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1832{
1833 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1834 int prefix;
1835
1836 if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1837 struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1838 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1839 } else
1840 prefix = 0;
1841
1842 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1843 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
0d51aa80 1844 prefix, NLM_F_MULTI);
1da177e4
LT
1845}
1846
1847static int fib6_dump_node(struct fib6_walker_t *w)
1848{
1849 int res;
1850 struct rt6_info *rt;
1851
1852 for (rt = w->leaf; rt; rt = rt->u.next) {
1853 res = rt6_dump_route(rt, w->args);
1854 if (res < 0) {
1855 /* Frame is full, suspend walking */
1856 w->leaf = rt;
1857 return 1;
1858 }
1859 BUG_TRAP(res!=0);
1860 }
1861 w->leaf = NULL;
1862 return 0;
1863}
1864
1865static void fib6_dump_end(struct netlink_callback *cb)
1866{
1867 struct fib6_walker_t *w = (void*)cb->args[0];
1868
1869 if (w) {
1870 cb->args[0] = 0;
1871 fib6_walker_unlink(w);
1872 kfree(w);
1873 }
efacfbcb
HX
1874 cb->done = (void*)cb->args[1];
1875 cb->args[1] = 0;
1da177e4
LT
1876}
1877
1878static int fib6_dump_done(struct netlink_callback *cb)
1879{
1880 fib6_dump_end(cb);
a8f74b22 1881 return cb->done ? cb->done(cb) : 0;
1da177e4
LT
1882}
1883
1884int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1885{
1886 struct rt6_rtnl_dump_arg arg;
1887 struct fib6_walker_t *w;
1888 int res;
1889
1890 arg.skb = skb;
1891 arg.cb = cb;
1892
1893 w = (void*)cb->args[0];
1894 if (w == NULL) {
1895 /* New dump:
1896 *
1897 * 1. hook callback destructor.
1898 */
1899 cb->args[1] = (long)cb->done;
1900 cb->done = fib6_dump_done;
1901
1902 /*
1903 * 2. allocate and initialize walker.
1904 */
0c600eda 1905 w = kzalloc(sizeof(*w), GFP_ATOMIC);
1da177e4
LT
1906 if (w == NULL)
1907 return -ENOMEM;
1908 RT6_TRACE("dump<%p", w);
1da177e4
LT
1909 w->root = &ip6_routing_table;
1910 w->func = fib6_dump_node;
1911 w->args = &arg;
1912 cb->args[0] = (long)w;
1913 read_lock_bh(&rt6_lock);
1914 res = fib6_walk(w);
1915 read_unlock_bh(&rt6_lock);
1916 } else {
1917 w->args = &arg;
1918 read_lock_bh(&rt6_lock);
1919 res = fib6_walk_continue(w);
1920 read_unlock_bh(&rt6_lock);
1921 }
1922#if RT6_DEBUG >= 3
1923 if (res <= 0 && skb->len == 0)
1924 RT6_TRACE("%p>dump end\n", w);
1925#endif
1926 res = res < 0 ? res : skb->len;
1927 /* res < 0 is an error. (really, impossible)
1928 res == 0 means that dump is complete, but skb still can contain data.
1929 res > 0 dump is not complete, but frame is full.
1930 */
1931 /* Destroy walker, if dump of this table is complete. */
1932 if (res <= 0)
1933 fib6_dump_end(cb);
1934 return res;
1935}
1936
1937int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1938{
1939 struct rtattr **rta = arg;
1940 int iif = 0;
1941 int err = -ENOBUFS;
1942 struct sk_buff *skb;
1943 struct flowi fl;
1944 struct rt6_info *rt;
1945
1946 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1947 if (skb == NULL)
1948 goto out;
1949
1950 /* Reserve room for dummy headers, this skb can pass
1951 through good chunk of routing engine.
1952 */
1953 skb->mac.raw = skb->data;
1954 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1955
1956 memset(&fl, 0, sizeof(fl));
1957 if (rta[RTA_SRC-1])
1958 ipv6_addr_copy(&fl.fl6_src,
1959 (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1960 if (rta[RTA_DST-1])
1961 ipv6_addr_copy(&fl.fl6_dst,
1962 (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1963
1964 if (rta[RTA_IIF-1])
1965 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1966
1967 if (iif) {
1968 struct net_device *dev;
1969 dev = __dev_get_by_index(iif);
1970 if (!dev) {
1971 err = -ENODEV;
1972 goto out_free;
1973 }
1974 }
1975
1976 fl.oif = 0;
1977 if (rta[RTA_OIF-1])
1978 memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1979
1980 rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1981
1982 skb->dst = &rt->u.dst;
1983
1984 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1985 err = rt6_fill_node(skb, rt,
1986 &fl.fl6_dst, &fl.fl6_src,
1987 iif,
1988 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
0d51aa80 1989 nlh->nlmsg_seq, 0, 0);
1da177e4
LT
1990 if (err < 0) {
1991 err = -EMSGSIZE;
1992 goto out_free;
1993 }
1994
1995 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1996 if (err > 0)
1997 err = 0;
1998out:
1999 return err;
2000out_free:
2001 kfree_skb(skb);
2002 goto out;
2003}
2004
0d51aa80
JHS
2005void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh,
2006 struct netlink_skb_parms *req)
1da177e4
LT
2007{
2008 struct sk_buff *skb;
2009 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
0d51aa80
JHS
2010 u32 pid = current->pid;
2011 u32 seq = 0;
1da177e4 2012
0d51aa80
JHS
2013 if (req)
2014 pid = req->pid;
2015 if (nlh)
2016 seq = nlh->nlmsg_seq;
2017
1da177e4
LT
2018 skb = alloc_skb(size, gfp_any());
2019 if (!skb) {
ac6d439d 2020 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
1da177e4
LT
2021 return;
2022 }
0d51aa80 2023 if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
1da177e4 2024 kfree_skb(skb);
ac6d439d 2025 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
1da177e4
LT
2026 return;
2027 }
ac6d439d
PM
2028 NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
2029 netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
1da177e4
LT
2030}
2031
2032/*
2033 * /proc
2034 */
2035
2036#ifdef CONFIG_PROC_FS
2037
2038#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2039
/* State shared between rt6_proc_info() and its per-route callback. */
struct rt6_proc_arg
{
	char *buffer;	/* output buffer */
	int offset;	/* requested read offset, in bytes */
	int length;	/* requested read length, in bytes */
	int skip;	/* routes skipped so far to reach the offset */
	int len;	/* bytes written to buffer so far */
};
2048
2049static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2050{
2051 struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2052 int i;
2053
2054 if (arg->skip < arg->offset / RT6_INFO_LEN) {
2055 arg->skip++;
2056 return 0;
2057 }
2058
2059 if (arg->len >= arg->length)
2060 return 0;
2061
2062 for (i=0; i<16; i++) {
2063 sprintf(arg->buffer + arg->len, "%02x",
2064 rt->rt6i_dst.addr.s6_addr[i]);
2065 arg->len += 2;
2066 }
2067 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2068 rt->rt6i_dst.plen);
2069
2070#ifdef CONFIG_IPV6_SUBTREES
2071 for (i=0; i<16; i++) {
2072 sprintf(arg->buffer + arg->len, "%02x",
2073 rt->rt6i_src.addr.s6_addr[i]);
2074 arg->len += 2;
2075 }
2076 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2077 rt->rt6i_src.plen);
2078#else
2079 sprintf(arg->buffer + arg->len,
2080 "00000000000000000000000000000000 00 ");
2081 arg->len += 36;
2082#endif
2083
2084 if (rt->rt6i_nexthop) {
2085 for (i=0; i<16; i++) {
2086 sprintf(arg->buffer + arg->len, "%02x",
2087 rt->rt6i_nexthop->primary_key[i]);
2088 arg->len += 2;
2089 }
2090 } else {
2091 sprintf(arg->buffer + arg->len,
2092 "00000000000000000000000000000000");
2093 arg->len += 32;
2094 }
2095 arg->len += sprintf(arg->buffer + arg->len,
2096 " %08x %08x %08x %08x %8s\n",
2097 rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2098 rt->u.dst.__use, rt->rt6i_flags,
2099 rt->rt6i_dev ? rt->rt6i_dev->name : "");
2100 return 0;
2101}
2102
2103static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2104{
2105 struct rt6_proc_arg arg;
2106 arg.buffer = buffer;
2107 arg.offset = offset;
2108 arg.length = length;
2109 arg.skip = 0;
2110 arg.len = 0;
2111
2112 read_lock_bh(&rt6_lock);
2113 fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
2114 read_unlock_bh(&rt6_lock);
2115
2116 *start = buffer;
2117 if (offset)
2118 *start += offset % RT6_INFO_LEN;
2119
2120 arg.len -= offset % RT6_INFO_LEN;
2121
2122 if (arg.len > length)
2123 arg.len = length;
2124 if (arg.len < 0)
2125 arg.len = 0;
2126
2127 return arg.len;
2128}
2129
1da177e4
LT
2130static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2131{
2132 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2133 rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2134 rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2135 rt6_stats.fib_rt_cache,
2136 atomic_read(&ip6_dst_ops.entries),
2137 rt6_stats.fib_discarded_routes);
2138
2139 return 0;
2140}
2141
2142static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2143{
2144 return single_open(file, rt6_stats_seq_show, NULL);
2145}
2146
2147static struct file_operations rt6_stats_seq_fops = {
2148 .owner = THIS_MODULE,
2149 .open = rt6_stats_seq_open,
2150 .read = seq_read,
2151 .llseek = seq_lseek,
2152 .release = single_release,
2153};
2154#endif /* CONFIG_PROC_FS */
2155
2156#ifdef CONFIG_SYSCTL
2157
2158static int flush_delay;
2159
2160static
2161int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2162 void __user *buffer, size_t *lenp, loff_t *ppos)
2163{
2164 if (write) {
2165 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2166 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2167 return 0;
2168 } else
2169 return -EINVAL;
2170}
2171
2172ctl_table ipv6_route_table[] = {
2173 {
2174 .ctl_name = NET_IPV6_ROUTE_FLUSH,
2175 .procname = "flush",
2176 .data = &flush_delay,
2177 .maxlen = sizeof(int),
89c8b3a1 2178 .mode = 0200,
1da177e4
LT
2179 .proc_handler = &ipv6_sysctl_rtcache_flush
2180 },
2181 {
2182 .ctl_name = NET_IPV6_ROUTE_GC_THRESH,
2183 .procname = "gc_thresh",
2184 .data = &ip6_dst_ops.gc_thresh,
2185 .maxlen = sizeof(int),
2186 .mode = 0644,
2187 .proc_handler = &proc_dointvec,
2188 },
2189 {
2190 .ctl_name = NET_IPV6_ROUTE_MAX_SIZE,
2191 .procname = "max_size",
2192 .data = &ip6_rt_max_size,
2193 .maxlen = sizeof(int),
2194 .mode = 0644,
2195 .proc_handler = &proc_dointvec,
2196 },
2197 {
2198 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2199 .procname = "gc_min_interval",
2200 .data = &ip6_rt_gc_min_interval,
2201 .maxlen = sizeof(int),
2202 .mode = 0644,
2203 .proc_handler = &proc_dointvec_jiffies,
2204 .strategy = &sysctl_jiffies,
2205 },
2206 {
2207 .ctl_name = NET_IPV6_ROUTE_GC_TIMEOUT,
2208 .procname = "gc_timeout",
2209 .data = &ip6_rt_gc_timeout,
2210 .maxlen = sizeof(int),
2211 .mode = 0644,
2212 .proc_handler = &proc_dointvec_jiffies,
2213 .strategy = &sysctl_jiffies,
2214 },
2215 {
2216 .ctl_name = NET_IPV6_ROUTE_GC_INTERVAL,
2217 .procname = "gc_interval",
2218 .data = &ip6_rt_gc_interval,
2219 .maxlen = sizeof(int),
2220 .mode = 0644,
2221 .proc_handler = &proc_dointvec_jiffies,
2222 .strategy = &sysctl_jiffies,
2223 },
2224 {
2225 .ctl_name = NET_IPV6_ROUTE_GC_ELASTICITY,
2226 .procname = "gc_elasticity",
2227 .data = &ip6_rt_gc_elasticity,
2228 .maxlen = sizeof(int),
2229 .mode = 0644,
2230 .proc_handler = &proc_dointvec_jiffies,
2231 .strategy = &sysctl_jiffies,
2232 },
2233 {
2234 .ctl_name = NET_IPV6_ROUTE_MTU_EXPIRES,
2235 .procname = "mtu_expires",
2236 .data = &ip6_rt_mtu_expires,
2237 .maxlen = sizeof(int),
2238 .mode = 0644,
2239 .proc_handler = &proc_dointvec_jiffies,
2240 .strategy = &sysctl_jiffies,
2241 },
2242 {
2243 .ctl_name = NET_IPV6_ROUTE_MIN_ADVMSS,
2244 .procname = "min_adv_mss",
2245 .data = &ip6_rt_min_advmss,
2246 .maxlen = sizeof(int),
2247 .mode = 0644,
2248 .proc_handler = &proc_dointvec_jiffies,
2249 .strategy = &sysctl_jiffies,
2250 },
2251 {
2252 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2253 .procname = "gc_min_interval_ms",
2254 .data = &ip6_rt_gc_min_interval,
2255 .maxlen = sizeof(int),
2256 .mode = 0644,
2257 .proc_handler = &proc_dointvec_ms_jiffies,
2258 .strategy = &sysctl_ms_jiffies,
2259 },
2260 { .ctl_name = 0 }
2261};
2262
2263#endif
2264
2265void __init ip6_route_init(void)
2266{
2267 struct proc_dir_entry *p;
2268
2269 ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2270 sizeof(struct rt6_info),
2271 0, SLAB_HWCACHE_ALIGN,
2272 NULL, NULL);
2273 if (!ip6_dst_ops.kmem_cachep)
2274 panic("cannot create ip6_dst_cache");
2275
2276 fib6_init();
2277#ifdef CONFIG_PROC_FS
2278 p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2279 if (p)
2280 p->owner = THIS_MODULE;
2281
2282 proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2283#endif
2284#ifdef CONFIG_XFRM
2285 xfrm6_init();
2286#endif
2287}
2288
2289void ip6_route_cleanup(void)
2290{
2291#ifdef CONFIG_PROC_FS
2292 proc_net_remove("ipv6_route");
2293 proc_net_remove("rt6_stats");
2294#endif
2295#ifdef CONFIG_XFRM
2296 xfrm6_fini();
2297#endif
2298 rt6_ifdown(NULL);
2299 fib6_gc_cleanup();
2300 kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2301}