2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
18 * YOSHIFUJI Hideaki @USAGI
19 * reworked default router selection.
20 * - respect outgoing interface
21 * - select from (probably) reachable routers (i.e.
22 * routers in REACHABLE, STALE, DELAY or PROBE states).
23 * - always select the same router if it is (probably)
24 * reachable. otherwise, round-robin the list.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/init.h>
38 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
52 #include <linux/rtnetlink.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
58 #include <asm/uaccess.h>
61 #include <linux/sysctl.h>
64 /* Set to 3 to get tracing. */
/* Debug helpers: the two RT6_TRACE definitions live in different
 * preprocessor branches of the original file (trace-enabled vs. no-op);
 * the #if lines are elided in this listing. */
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
72 #define RT6_TRACE(x...) do { ; } while (0)
/* When 0, off-link (gatewayed) routes are not cloned per-destination
 * in the route-resolution slow path — see ip6_pol_route_input/output. */
75 #define CLONE_OFFLINK_ROUTE 0
/* Strictness flags passed to rt6_select()/rt6_score_route(). */
77 #define RT6_SELECT_F_IFACE 0x1
78 #define RT6_SELECT_F_REACHABLE 0x2
/* Garbage-collection and PMTU tunables for the IPv6 routing cache.
 * ip6_rt_gc_interval is non-static: referenced from elsewhere (fib timer). */
80 static int ip6_rt_max_size = 4096;
81 static int ip6_rt_gc_min_interval = HZ / 2;
82 static int ip6_rt_gc_timeout = 60*HZ;
83 int ip6_rt_gc_interval = 30*HZ;
84 static int ip6_rt_gc_elasticity = 9;
85 static int ip6_rt_mtu_expires = 10*60*HZ;
/* Minimum advertised MSS: IPv6 min MTU less TCP (20) and IPv6 (40) headers. */
86 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
/* Forward declarations for the dst_ops callbacks and internal helpers
 * defined later in this file. */
88 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void ip6_dst_destroy(struct dst_entry *);
92 static void ip6_dst_ifdown(struct dst_entry *,
93 struct net_device *dev, int how);
94 static int ip6_dst_gc(void);
/* Packet sinks and failure notifications used by reject/null routes. */
96 static int ip6_pkt_discard(struct sk_buff *skb);
97 static int ip6_pkt_discard_out(struct sk_buff *skb);
98 static void ip6_link_failure(struct sk_buff *skb);
99 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
/* RFC 4191 Route Information option support (add/lookup of learnt routes). */
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103 struct in6_addr *gwaddr, int ifindex,
105 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106 struct in6_addr *gwaddr, int ifindex);
/* dst_ops vtable for IPv6 routes: wires the IPv6-specific callbacks into
 * the generic destination-cache machinery. */
109 static struct dst_ops ip6_dst_ops = {
111 .protocol = __constant_htons(ETH_P_IPV6),
114 .check = ip6_dst_check,
115 .destroy = ip6_dst_destroy,
116 .ifdown = ip6_dst_ifdown,
117 .negative_advice = ip6_negative_advice,
118 .link_failure = ip6_link_failure,
119 .update_pmtu = ip6_rt_update_pmtu,
120 .entry_size = sizeof(struct rt6_info),
/* Statically allocated "no route" entry: REJECT route returned when lookup
 * fails.  Input/output handlers discard packets; error is -ENETUNREACH.
 * Permanently referenced (refcnt/ref start at 1), bound to loopback. */
123 struct rt6_info ip6_null_entry = {
126 .__refcnt = ATOMIC_INIT(1),
128 .dev = &loopback_dev,
130 .error = -ENETUNREACH,
131 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
132 .input = ip6_pkt_discard,
133 .output = ip6_pkt_discard_out,
135 .path = (struct dst_entry*)&ip6_null_entry,
138 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
139 .rt6i_metric = ~(u32) 0,
140 .rt6i_ref = ATOMIC_INIT(1),
/* With policy routing, two more terminal entries exist: "prohibit"
 * (administratively denied) and "blackhole" (silent drop).  Their .error
 * initializers are elided in this listing — only the visible fields are
 * guaranteed here. */
143 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
145 struct rt6_info ip6_prohibit_entry = {
148 .__refcnt = ATOMIC_INIT(1),
150 .dev = &loopback_dev,
153 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
154 .input = ip6_pkt_discard,
155 .output = ip6_pkt_discard_out,
157 .path = (struct dst_entry*)&ip6_prohibit_entry,
160 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
161 .rt6i_metric = ~(u32) 0,
162 .rt6i_ref = ATOMIC_INIT(1),
165 struct rt6_info ip6_blk_hole_entry = {
168 .__refcnt = ATOMIC_INIT(1),
170 .dev = &loopback_dev,
173 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
174 .input = ip6_pkt_discard,
175 .output = ip6_pkt_discard_out,
177 .path = (struct dst_entry*)&ip6_blk_hole_entry,
180 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
181 .rt6i_metric = ~(u32) 0,
182 .rt6i_ref = ATOMIC_INIT(1),
187 /* allocate dst with ip6_dst_ops */
/* Thin wrapper: allocate a routing-cache entry tied to ip6_dst_ops. */
188 static __inline__ struct rt6_info *ip6_dst_alloc(void)
190 return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
/* dst_ops.destroy callback: drop the route's inet6_dev reference
 * (the in6_dev_put on idev is on an elided line — TODO confirm). */
193 static void ip6_dst_destroy(struct dst_entry *dst)
195 struct rt6_info *rt = (struct rt6_info *)dst;
196 struct inet6_dev *idev = rt->rt6i_idev;
199 rt->rt6i_idev = NULL;
/* dst_ops.ifdown callback: when a device goes away, re-point the route's
 * idev at the loopback device so the dst stays valid until released. */
204 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
207 struct rt6_info *rt = (struct rt6_info *)dst;
208 struct inet6_dev *idev = rt->rt6i_idev;
210 if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
211 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
212 if (loopback_idev != NULL) {
213 rt->rt6i_idev = loopback_idev;
/* True when the route carries RTF_EXPIRES and its deadline has passed. */
219 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
221 return (rt->rt6i_flags & RTF_EXPIRES &&
222 time_after(jiffies, rt->rt6i_expires));
/* Multicast and link-local destinations require a strict (interface-bound)
 * lookup, since their scope is per-link. */
225 static inline int rt6_need_strict(struct in6_addr *daddr)
227 return (ipv6_addr_type(daddr) &
228 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
232 * Route lookup. Any table->tb6_lock is implied.
/* Walk the sibling list starting at rt and pick the entry whose device
 * matches oif (exact ifindex match wins; loopback routes match via their
 * idev's ifindex and are remembered in 'local' as a fallback).  Returns
 * &ip6_null_entry when nothing matches under strict semantics.
 * NOTE(review): several branch bodies are elided in this listing. */
235 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
239 struct rt6_info *local = NULL;
240 struct rt6_info *sprt;
243 for (sprt = rt; sprt; sprt = sprt->u.next) {
244 struct net_device *dev = sprt->rt6i_dev;
245 if (dev->ifindex == oif)
247 if (dev->flags & IFF_LOOPBACK) {
248 if (sprt->rt6i_idev == NULL ||
249 sprt->rt6i_idev->dev->ifindex != oif) {
252 if (local && (!oif ||
253 local->rt6i_idev->dev->ifindex == oif))
264 return &ip6_null_entry;
269 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Router Reachability Probing (RFC 4191 §3.5): if the nexthop neighbour is
 * not in a VALID NUD state and the per-device probe interval has elapsed,
 * send a unicast-solicit NS to (re)confirm reachability.  Rate-limited via
 * neigh->updated.  Compiled to a no-op without CONFIG_IPV6_ROUTER_PREF. */
270 static void rt6_probe(struct rt6_info *rt)
272 struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
274 * Okay, this does not seem to be appropriate
275 * for now, however, we need to check if it
276 * is really so; aka Router Reachability Probing.
278 * Router Reachability Probe MUST be rate-limited
279 * to no more than one per minute.
281 if (!neigh || (neigh->nud_state & NUD_VALID))
283 read_lock_bh(&neigh->lock);
284 if (!(neigh->nud_state & NUD_VALID) &&
285 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
286 struct in6_addr mcaddr;
287 struct in6_addr *target;
/* Stamp before dropping the lock so concurrent callers are rate-limited. */
289 neigh->updated = jiffies;
290 read_unlock_bh(&neigh->lock);
292 target = (struct in6_addr *)&neigh->primary_key;
293 addrconf_addr_solict_mult(target, &mcaddr);
294 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
296 read_unlock_bh(&neigh->lock);
/* Stub used when router-preference support is compiled out. */
299 static inline void rt6_probe(struct rt6_info *rt)
306 * Default Router Selection (RFC 2461 6.3.6)
/* Device match score: nonzero when the route's device matches oif (or no
 * oif constraint), including the loopback-via-idev case. */
308 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
310 struct net_device *dev = rt->rt6i_dev;
311 if (!oif || dev->ifindex == oif)
313 if ((dev->flags & IFF_LOOPBACK) &&
314 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Neighbour reachability score: routes without a gateway nexthop are
 * trivially "reachable"; otherwise consult the neighbour's NUD state. */
319 static int inline rt6_check_neigh(struct rt6_info *rt)
321 struct neighbour *neigh = rt->rt6i_nexthop;
323 if (rt->rt6i_flags & RTF_NONEXTHOP ||
324 !(rt->rt6i_flags & RTF_GATEWAY))
327 read_lock_bh(&neigh->lock);
328 if (neigh->nud_state & NUD_VALID)
330 read_unlock_bh(&neigh->lock);
/* Combine device match, RFC 4191 route preference, and neighbour
 * reachability into a single comparable score; negative under strict
 * flags presumably means "do not use" — elided lines obscure the exact
 * return values. */
335 static int rt6_score_route(struct rt6_info *rt, int oif,
340 m = rt6_check_dev(rt, oif);
341 if (!m && (strict & RT6_SELECT_F_IFACE))
343 #ifdef CONFIG_IPV6_ROUTER_PREF
344 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
346 n = rt6_check_neigh(rt);
349 else if (!n && strict & RT6_SELECT_F_REACHABLE)
/* Default router selection (RFC 2461 6.3.6 rework, see file header):
 * scan siblings with the same metric, score each via rt6_score_route(),
 * and keep the best.  When strict REACHABLE matching finds nothing,
 * round-robin the list head (under a local spinlock) so traffic rotates
 * among the candidate routers.  Returns ip6_null_entry on no match. */
354 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
357 struct rt6_info *match = NULL, *last = NULL;
358 struct rt6_info *rt, *rt0 = *head;
362 RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
363 __FUNCTION__, head, head ? *head : NULL, oif);
365 for (rt = rt0, metric = rt0->rt6i_metric;
366 rt && rt->rt6i_metric == metric && (!last || rt != rt0);
370 if (rt6_check_expired(rt))
375 m = rt6_score_route(rt, oif, strict);
389 (strict & RT6_SELECT_F_REACHABLE) &&
390 last && last != rt0) {
391 /* no entries matched; do round-robin */
392 static DEFINE_SPINLOCK(lock);
395 rt0->u.next = last->u.next;
400 RT6_TRACE("%s() => %p, score=%d\n",
401 __FUNCTION__, match, mpri);
403 return (match ? match : &ip6_null_entry);
406 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process an RFC 4191 Route Information option received in a Router
 * Advertisement: validate length/prefix_len consistency, decode the
 * preference and lifetime, then add, refresh, or (on zero lifetime)
 * remove the corresponding RTF_ROUTEINFO route.
 * NOTE(review): 'lifetime = htonl(...)' reads oddly for an on-wire field
 * (ntohl would be expected) — the macros are equivalent on both endians,
 * but verify against the upstream source. */
407 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
408 struct in6_addr *gwaddr)
410 struct route_info *rinfo = (struct route_info *) opt;
411 struct in6_addr prefix_buf, *prefix;
416 if (len < sizeof(struct route_info)) {
420 /* Sanity check for prefix_len and length */
421 if (rinfo->length > 3) {
423 } else if (rinfo->prefix_len > 128) {
425 } else if (rinfo->prefix_len > 64) {
426 if (rinfo->length < 2) {
429 } else if (rinfo->prefix_len > 0) {
430 if (rinfo->length < 1) {
435 pref = rinfo->route_pref;
436 if (pref == ICMPV6_ROUTER_PREF_INVALID)
437 pref = ICMPV6_ROUTER_PREF_MEDIUM;
439 lifetime = htonl(rinfo->lifetime);
440 if (lifetime == 0xffffffff) {
442 } else if (lifetime > 0x7fffffff/HZ) {
443 /* Avoid arithmetic overflow */
444 lifetime = 0x7fffffff/HZ - 1;
/* length == 3 means a full 16-byte prefix is present in the option. */
447 if (rinfo->length == 3)
448 prefix = (struct in6_addr *)rinfo->prefix;
450 /* this function is safe */
451 ipv6_addr_prefix(&prefix_buf,
452 (struct in6_addr *)rinfo->prefix,
454 prefix = &prefix_buf;
457 rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);
/* Zero lifetime: an existing learnt route must be withdrawn. */
459 if (rt && !lifetime) {
465 rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
468 rt->rt6i_flags = RTF_ROUTEINFO |
469 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
/* Infinite lifetime -> never expires; otherwise arm RTF_EXPIRES. */
472 if (lifetime == 0xffffffff) {
473 rt->rt6i_flags &= ~RTF_EXPIRES;
475 rt->rt6i_expires = jiffies + HZ * lifetime;
476 rt->rt6i_flags |= RTF_EXPIRES;
478 dst_release(&rt->u.dst);
/* BACKTRACK: on a strict-mode miss (null entry), climb fib6 parents until
 * a node with route info is found, or give up at the tree root. */
484 #define BACKTRACK() \
485 if (rt == &ip6_null_entry && flags & RT6_F_STRICT) { \
486 while ((fn = fn->parent) != NULL) { \
487 if (fn->fn_flags & RTN_TL_ROOT) { \
488 dst_hold(&rt->u.dst); \
491 if (fn->fn_flags & RTN_RTINFO) \
/* Plain (non-resolving) table lookup used by rt6_lookup(): find the fib6
 * node, filter by output device, take a reference, stamp lastuse. */
496 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
497 struct flowi *fl, int flags)
499 struct fib6_node *fn;
502 read_lock_bh(&table->tb6_lock);
503 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
506 rt = rt6_device_match(rt, fl->oif, flags & RT6_F_STRICT);
508 dst_hold(&rt->u.dst);
510 read_unlock_bh(&table->tb6_lock);
512 rt->u.dst.lastuse = jiffies;
/* Public lookup entry point: builds a flowi (construction lines elided)
 * and dispatches through the policy-routing rule engine.  Caller owns the
 * returned reference. */
519 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
531 struct dst_entry *dst;
532 int flags = strict ? RT6_F_STRICT : 0;
534 dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
536 return (struct rt6_info *) dst;
543 /* ip6_ins_rt is called with FREE table->tb6_lock.
544 It takes new route entry, the addition fails by any reason the
545 route is freed. In any case, if caller does not hold it, it may
/* Insert rt into its fib6 table under the table write lock.  On failure
 * fib6_add() consumes/frees the route (per the comment above). */
549 static int __ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
550 void *_rtattr, struct netlink_skb_parms *req)
553 struct fib6_table *table;
555 table = rt->rt6i_table;
556 write_lock_bh(&table->tb6_lock);
557 err = fib6_add(&table->tb6_root, rt, nlh, _rtattr, req);
558 write_unlock_bh(&table->tb6_lock);
/* Convenience wrapper for kernel-internal insertions (no netlink ctx). */
563 int ip6_ins_rt(struct rt6_info *rt)
565 return __ip6_ins_rt(rt, NULL, NULL, NULL);
/* Clone 'ort' into a /128 host cache route for daddr, resolving the
 * nexthop: for non-gateway routes the destination itself becomes the
 * gateway (marking anycast when daddr matches a non-/128 prefix), and a
 * neighbour entry is attached via ndisc_get_neigh(). */
568 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
569 struct in6_addr *saddr)
577 rt = ip6_rt_copy(ort);
580 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
581 if (rt->rt6i_dst.plen != 128 &&
582 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
583 rt->rt6i_flags |= RTF_ANYCAST;
584 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
587 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
588 rt->rt6i_dst.plen = 128;
589 rt->rt6i_flags |= RTF_CACHE;
590 rt->u.dst.flags |= DST_HOST;
592 #ifdef CONFIG_IPV6_SUBTREES
593 if (rt->rt6i_src.plen && saddr) {
594 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
595 rt->rt6i_src.plen = 128;
599 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
/* Cheaper clone for gatewayed/NONEXTHOP routes: same gateway, so the
 * existing neighbour is reference-cloned rather than re-resolved. */
606 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
608 struct rt6_info *rt = ip6_rt_copy(ort);
610 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
611 rt->rt6i_dst.plen = 128;
612 rt->rt6i_flags |= RTF_CACHE;
613 if (rt->rt6i_flags & RTF_REJECT)
614 rt->u.dst.error = ort->u.dst.error;
615 rt->u.dst.flags |= DST_HOST;
616 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
/* Resolving lookup for the input path: select the best route (preferring
 * reachable routers), and if it is not already an RTF_CACHE entry, COW or
 * clone it into a host route and insert it.  The retry labels, goto
 * targets and the relookup loop are on elided lines. */
621 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
622 struct flowi *fl, int flags)
624 struct fib6_node *fn;
625 struct rt6_info *rt, *nrt;
629 int reachable = RT6_SELECT_F_REACHABLE;
631 if (flags & RT6_F_STRICT)
632 strict = RT6_SELECT_F_IFACE;
635 read_lock_bh(&table->tb6_lock);
638 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
641 rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
643 if (rt == &ip6_null_entry ||
644 rt->rt6i_flags & RTF_CACHE)
647 dst_hold(&rt->u.dst);
648 read_unlock_bh(&table->tb6_lock);
/* Connected route -> COW; off-link route -> clone only if enabled. */
650 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
651 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
653 #if CLONE_OFFLINK_ROUTE
654 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
660 dst_release(&rt->u.dst);
661 rt = nrt ? : &ip6_null_entry;
663 dst_hold(&rt->u.dst);
665 err = ip6_ins_rt(nrt);
674 * Race condition! In the gap, when table->tb6_lock was
675 * released someone could insert this route. Relookup.
677 dst_release(&rt->u.dst);
685 dst_hold(&rt->u.dst);
686 read_unlock_bh(&table->tb6_lock);
688 rt->u.dst.lastuse = jiffies;
/* Attach a route to an incoming packet: build the flow key from the IPv6
 * header and resolve via the policy rules; result lands in skb->dst. */
694 void ip6_route_input(struct sk_buff *skb)
696 struct ipv6hdr *iph = skb->nh.ipv6h;
698 .iif = skb->dev->ifindex,
703 .flowlabel = (* (u32 *) iph)&IPV6_FLOWINFO_MASK,
706 .proto = iph->nexthdr,
710 if (rt6_need_strict(&iph->daddr))
711 flags |= RT6_F_STRICT;
713 skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
/* Resolving lookup for the output path; mirrors ip6_pol_route_input()
 * except that route selection keys on fl->oif instead of fl->iif.
 * Retry/relookup labels are on elided lines, as above. */
716 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
717 struct flowi *fl, int flags)
719 struct fib6_node *fn;
720 struct rt6_info *rt, *nrt;
724 int reachable = RT6_SELECT_F_REACHABLE;
726 if (flags & RT6_F_STRICT)
727 strict = RT6_SELECT_F_IFACE;
730 read_lock_bh(&table->tb6_lock);
733 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
736 rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
738 if (rt == &ip6_null_entry ||
739 rt->rt6i_flags & RTF_CACHE)
742 dst_hold(&rt->u.dst);
743 read_unlock_bh(&table->tb6_lock);
745 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
746 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
748 #if CLONE_OFFLINK_ROUTE
749 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
755 dst_release(&rt->u.dst);
756 rt = nrt ? : &ip6_null_entry;
758 dst_hold(&rt->u.dst);
760 err = ip6_ins_rt(nrt);
769 * Race condition! In the gap, when table->tb6_lock was
770 * released someone could insert this route. Relookup.
772 dst_release(&rt->u.dst);
780 dst_hold(&rt->u.dst);
781 read_unlock_bh(&table->tb6_lock);
783 rt->u.dst.lastuse = jiffies;
/* Public output-route entry point used by sockets/transports. */
788 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
792 if (rt6_need_strict(&fl->fl6_dst))
793 flags |= RT6_F_STRICT;
795 return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
800 * Destination cache support functions
/* dst_ops.check: a cached dst is still valid only while its fib6 node's
 * serial number matches the cookie recorded at lookup time. */
803 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
807 rt = (struct rt6_info *) dst;
809 if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
/* dst_ops.negative_advice: a cached (RTF_CACHE) clone that is causing
 * trouble is dropped; action on the elided lines — TODO confirm. */
815 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
817 struct rt6_info *rt = (struct rt6_info *) dst;
820 if (rt->rt6i_flags & RTF_CACHE)
/* dst_ops.link_failure: tell the sender (ICMPv6 addr-unreach), then
 * expire the cache clone or invalidate the default route's fib node. */
828 static void ip6_link_failure(struct sk_buff *skb)
832 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
834 rt = (struct rt6_info *) skb->dst;
836 if (rt->rt6i_flags&RTF_CACHE) {
837 dst_set_expires(&rt->u.dst, 0);
838 rt->rt6i_flags |= RTF_EXPIRES;
839 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
840 rt->rt6i_node->fn_sernum = -1;
/* dst_ops.update_pmtu: lower the MTU on host routes only; below the IPv6
 * minimum, keep 1280 semantics by setting ALLFRAG instead (RFC 2460). */
844 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
846 struct rt6_info *rt6 = (struct rt6_info*)dst;
848 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
849 rt6->rt6i_flags |= RTF_MODIFIED;
850 if (mtu < IPV6_MIN_MTU) {
852 dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
854 dst->metrics[RTAX_MTU-1] = mtu;
855 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
859 static int ipv6_get_mtu(struct net_device *dev);
/* Derive the advertised MSS from a path MTU: subtract IPv6+TCP headers,
 * clamp below at ip6_rt_min_advmss and above per the comment below. */
861 static inline unsigned int ipv6_advmss(unsigned int mtu)
863 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
865 if (mtu < ip6_rt_min_advmss)
866 mtu = ip6_rt_min_advmss;
869 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
870 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
871 * IPV6_MAXPLEN is also valid and means: "any MSS,
872 * rely only on pmtu discovery"
874 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* Per-entry list of dst's handed out to neighbour discovery, reaped by
 * ndisc_dst_gc(); ndisc_lock guards the list. */
879 static struct dst_entry *ndisc_dst_gc_list;
880 static DEFINE_SPINLOCK(ndisc_lock);
/* Build a standalone host dst for an NDISC destination (not inserted in
 * any fib table): resolves/attaches the neighbour, initializes hoplimit,
 * MTU and advmss metrics, installs the caller's output handler, and
 * chains the dst onto ndisc_dst_gc_list for later collection. */
882 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
883 struct neighbour *neigh,
884 struct in6_addr *addr,
885 int (*output)(struct sk_buff *))
888 struct inet6_dev *idev = in6_dev_get(dev);
890 if (unlikely(idev == NULL))
893 rt = ip6_dst_alloc();
894 if (unlikely(rt == NULL)) {
903 neigh = ndisc_get_neigh(dev, addr);
906 rt->rt6i_idev = idev;
907 rt->rt6i_nexthop = neigh;
908 atomic_set(&rt->u.dst.__refcnt, 1);
909 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
910 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
911 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
912 rt->u.dst.output = output;
914 #if 0 /* there's no chance to use these for ndisc */
915 rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
918 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
919 rt->rt6i_dst.plen = 128;
922 spin_lock_bh(&ndisc_lock);
923 rt->u.dst.next = ndisc_dst_gc_list;
924 ndisc_dst_gc_list = &rt->u.dst;
925 spin_unlock_bh(&ndisc_lock);
/* Ensure the fib GC timer runs so these entries eventually get reaped. */
927 fib6_force_start_gc();
930 return (struct dst_entry *)rt;
/* Walk ndisc_dst_gc_list and free entries whose refcount dropped to zero;
 * *more reports whether entries remain (accounting lines elided). */
933 int ndisc_dst_gc(int *more)
935 struct dst_entry *dst, *next, **pprev;
941 spin_lock_bh(&ndisc_lock);
942 pprev = &ndisc_dst_gc_list;
944 while ((dst = *pprev) != NULL) {
945 if (!atomic_read(&dst->__refcnt)) {
955 spin_unlock_bh(&ndisc_lock);
/* dst_ops.gc callback: rate-limited by ip6_rt_gc_min_interval unless the
 * cache exceeds ip6_rt_max_size.  'expire' adapts: reset toward the base
 * timeout when under gc_thresh, otherwise shrunk geometrically by
 * gc_elasticity so collection becomes progressively more aggressive.
 * Returns nonzero (allocation should fail) while still over max_size. */
960 static int ip6_dst_gc(void)
962 static unsigned expire = 30*HZ;
963 static unsigned long last_gc;
964 unsigned long now = jiffies;
966 if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
967 atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
973 if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
974 expire = ip6_rt_gc_timeout>>1;
977 expire -= expire>>ip6_rt_gc_elasticity;
978 return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
981 /* Clean host part of a prefix. Not necessary in radix tree,
982 but results in cleaner routing tables.
984 Remove it only when all the things will work!
/* Device MTU for IPv6: the per-device cnf.mtu6 when an inet6_dev exists,
 * else the IPv6 minimum MTU (1280).  in6_dev_put is on an elided line. */
987 static int ipv6_get_mtu(struct net_device *dev)
989 int mtu = IPV6_MIN_MTU;
990 struct inet6_dev *idev;
992 idev = in6_dev_get(dev);
994 mtu = idev->cnf.mtu6;
/* Hop limit for the device: per-device cnf.hop_limit when available,
 * else the global ipv6_devconf default. */
1000 int ipv6_get_hoplimit(struct net_device *dev)
1002 int hoplimit = ipv6_devconf.hop_limit;
1003 struct inet6_dev *idev;
1005 idev = in6_dev_get(dev);
1007 hoplimit = idev->cnf.hop_limit;
/* Add a route described by an in6_rtmsg to table 'table_id':
 * validates prefix lengths, resolves the output device (explicitly by
 * ifindex, or via a recursive lookup of the gateway), promotes loopback/
 * REJECT routes to discard routes, resolves the nexthop neighbour, applies
 * RTA_METRICS attributes and metric defaults, then inserts via
 * __ip6_ins_rt().  Error labels and several cleanup paths are on elided
 * lines; on failure the route is dst_free'd at the bottom. */
1017 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
1018 void *_rtattr, struct netlink_skb_parms *req,
1023 struct rtattr **rta;
1024 struct rt6_info *rt = NULL;
1025 struct net_device *dev = NULL;
1026 struct inet6_dev *idev = NULL;
1027 struct fib6_table *table;
1030 rta = (struct rtattr **) _rtattr;
/* Prefix lengths beyond /128 are invalid; src routes need SUBTREES. */
1032 if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
1034 #ifndef CONFIG_IPV6_SUBTREES
1035 if (rtmsg->rtmsg_src_len)
1038 if (rtmsg->rtmsg_ifindex) {
1040 dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
1043 idev = in6_dev_get(dev);
1048 if (rtmsg->rtmsg_metric == 0)
1049 rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
1051 table = fib6_new_table(table_id);
1052 if (table == NULL) {
1057 rt = ip6_dst_alloc();
1064 rt->u.dst.obsolete = -1;
1065 rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
/* Protocol comes from the netlink header when present, else BOOT. */
1066 if (nlh && (r = NLMSG_DATA(nlh))) {
1067 rt->rt6i_protocol = r->rtm_protocol;
1069 rt->rt6i_protocol = RTPROT_BOOT;
1072 addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
1074 if (addr_type & IPV6_ADDR_MULTICAST)
1075 rt->u.dst.input = ip6_mc_input;
1077 rt->u.dst.input = ip6_forward;
1079 rt->u.dst.output = ip6_output;
1081 ipv6_addr_prefix(&rt->rt6i_dst.addr,
1082 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
1083 rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
1084 if (rt->rt6i_dst.plen == 128)
1085 rt->u.dst.flags = DST_HOST;
1087 #ifdef CONFIG_IPV6_SUBTREES
1088 ipv6_addr_prefix(&rt->rt6i_src.addr,
1089 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1090 rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
1093 rt->rt6i_metric = rtmsg->rtmsg_metric;
1095 /* We cannot add true routes via loopback here,
1096 they would result in kernel looping; promote them to reject routes
1098 if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
1099 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1100 /* hold loopback dev/idev if we haven't done so. */
1101 if (dev != &loopback_dev) {
1106 dev = &loopback_dev;
1108 idev = in6_dev_get(dev);
1114 rt->u.dst.output = ip6_pkt_discard_out;
1115 rt->u.dst.input = ip6_pkt_discard;
1116 rt->u.dst.error = -ENETUNREACH;
1117 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1121 if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
1122 struct in6_addr *gw_addr;
1125 gw_addr = &rtmsg->rtmsg_gateway;
1126 ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
1127 gwa_type = ipv6_addr_type(gw_addr);
/* Non-link-local gateway: resolve it with a strict lookup and inherit
 * the device/idev from the route that reaches it. */
1129 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1130 struct rt6_info *grt;
1132 /* IPv6 strictly inhibits using not link-local
1133 addresses as nexthop address.
1134 Otherwise, router will not able to send redirects.
1135 It is very good, but in some (rare!) circumstances
1136 (SIT, PtP, NBMA NOARP links) it is handy to allow
1137 some exceptions. --ANK
1140 if (!(gwa_type&IPV6_ADDR_UNICAST))
1143 grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
1145 err = -EHOSTUNREACH;
1149 if (dev != grt->rt6i_dev) {
1150 dst_release(&grt->u.dst);
1154 dev = grt->rt6i_dev;
1155 idev = grt->rt6i_idev;
1157 in6_dev_hold(grt->rt6i_idev);
/* A recursive gateway (route to the gateway is itself gatewayed)
 * is rejected — error assignment on an elided line. */
1159 if (!(grt->rt6i_flags&RTF_GATEWAY))
1161 dst_release(&grt->u.dst);
1167 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1175 if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
1176 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1177 if (IS_ERR(rt->rt6i_nexthop)) {
1178 err = PTR_ERR(rt->rt6i_nexthop);
1179 rt->rt6i_nexthop = NULL;
1184 rt->rt6i_flags = rtmsg->rtmsg_flags;
/* Copy user-supplied metrics; out-of-range types are rejected. */
1187 if (rta && rta[RTA_METRICS-1]) {
1188 int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
1189 struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
1191 while (RTA_OK(attr, attrlen)) {
1192 unsigned flavor = attr->rta_type;
1194 if (flavor > RTAX_MAX) {
1198 rt->u.dst.metrics[flavor-1] =
1199 *(u32 *)RTA_DATA(attr);
1201 attr = RTA_NEXT(attr, attrlen);
/* Fill metric defaults the user did not provide. */
1205 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1206 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1207 if (!rt->u.dst.metrics[RTAX_MTU-1])
1208 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1209 if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1210 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1211 rt->u.dst.dev = dev;
1212 rt->rt6i_idev = idev;
1213 rt->rt6i_table = table;
1214 return __ip6_ins_rt(rt, nlh, _rtattr, req);
/* Error path (label elided): free the half-built route. */
1222 dst_free((struct dst_entry *) rt);
/* Delete rt from its table under the write lock; the statically allocated
 * ip6_null_entry must never be deleted (guard below, error elided). */
1226 static int __ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
1227 void *_rtattr, struct netlink_skb_parms *req)
1230 struct fib6_table *table;
1232 if (rt == &ip6_null_entry)
1235 table = rt->rt6i_table;
1236 write_lock_bh(&table->tb6_lock);
1238 err = fib6_del(rt, nlh, _rtattr, req);
1239 dst_release(&rt->u.dst);
1241 write_unlock_bh(&table->tb6_lock);
/* Kernel-internal wrapper (no netlink context). */
1246 int ip6_del_rt(struct rt6_info *rt)
1248 return __ip6_del_rt(rt, NULL, NULL, NULL);
/* Delete the route in table_id matching the in6_rtmsg selectors: locate
 * the exact fib6 node, then match optional ifindex, gateway and metric
 * constraints; hold the winner across the lock drop and delete it. */
1251 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
1252 void *_rtattr, struct netlink_skb_parms *req,
1255 struct fib6_table *table;
1256 struct fib6_node *fn;
1257 struct rt6_info *rt;
1260 table = fib6_get_table(table_id);
1264 read_lock_bh(&table->tb6_lock);
1266 fn = fib6_locate(&table->tb6_root,
1267 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1268 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1271 for (rt = fn->leaf; rt; rt = rt->u.next) {
1272 if (rtmsg->rtmsg_ifindex &&
1273 (rt->rt6i_dev == NULL ||
1274 rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1276 if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1277 !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1279 if (rtmsg->rtmsg_metric &&
1280 rtmsg->rtmsg_metric != rt->rt6i_metric)
1282 dst_hold(&rt->u.dst);
1283 read_unlock_bh(&table->tb6_lock);
1285 return __ip6_del_rt(rt, nlh, _rtattr, req);
1288 read_unlock_bh(&table->tb6_lock);
/* Handle an ICMPv6 Redirect (RFC 2461 §8): verify the redirect came from
 * the current nexthop for 'dest', update the neighbour cache, clone the
 * route toward the new gateway as an RTF_CACHE host route, notify
 * netevent listeners, and retire the old cache entry.  Several labels and
 * error paths are on elided lines. */
1296 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1297 struct neighbour *neigh, u8 *lladdr, int on_link)
1299 struct rt6_info *rt, *nrt = NULL;
1300 struct fib6_node *fn;
1301 struct fib6_table *table;
1302 struct netevent_redirect netevent;
1304 /* TODO: Very lazy, might need to check all tables */
1305 table = fib6_get_table(RT6_TABLE_MAIN);
1310 * Get the "current" route for this destination and
1311 * check if the redirect has come from approriate router.
1313 * RFC 2461 specifies that redirects should only be
1314 * accepted if they come from the nexthop to the target.
1315 * Due to the way the routes are chosen, this notion
1316 * is a bit fuzzy and one might need to check all possible
1320 read_lock_bh(&table->tb6_lock);
1321 fn = fib6_lookup(&table->tb6_root, dest, NULL);
1323 for (rt = fn->leaf; rt; rt = rt->u.next) {
1325 * Current route is on-link; redirect is always invalid.
1327 * Seems, previous statement is not true. It could
1328 * be node, which looks for us as on-link (f.e. proxy ndisc)
1329 * But then router serving it might decide, that we should
1330 * know truth 8)8) --ANK (980726).
1332 if (rt6_check_expired(rt))
1334 if (!(rt->rt6i_flags & RTF_GATEWAY))
1336 if (neigh->dev != rt->rt6i_dev)
1338 if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway))
1343 dst_hold(&rt->u.dst);
/* Strict destinations: walk up the tree for a usable covering node. */
1344 else if (rt6_need_strict(dest)) {
1345 while ((fn = fn->parent) != NULL) {
1346 if (fn->fn_flags & RTN_ROOT)
1348 if (fn->fn_flags & RTN_RTINFO)
1352 read_unlock_bh(&table->tb6_lock);
1355 if (net_ratelimit())
1356 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1357 "for redirect target\n");
1362 * We have finally decided to accept it.
/* Record the target's link-layer address; ISROUTER only when the
 * redirect points at another router, not an on-link destination. */
1365 neigh_update(neigh, lladdr, NUD_STALE,
1366 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1367 NEIGH_UPDATE_F_OVERRIDE|
1368 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1369 NEIGH_UPDATE_F_ISROUTER))
1373 * Redirect received -> path was valid.
1374 * Look, redirects are sent only in response to data packets,
1375 * so that this nexthop apparently is reachable. --ANK
1377 dst_confirm(&rt->u.dst);
1379 /* Duplicate redirect: silently ignore. */
1380 if (neigh == rt->u.dst.neighbour)
1383 nrt = ip6_rt_copy(rt);
1387 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1389 nrt->rt6i_flags &= ~RTF_GATEWAY;
1391 ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1392 nrt->rt6i_dst.plen = 128;
1393 nrt->u.dst.flags |= DST_HOST;
1395 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1396 nrt->rt6i_nexthop = neigh_clone(neigh);
1397 /* Reset pmtu, it may be better */
1398 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1399 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1401 if (ip6_ins_rt(nrt))
1404 netevent.old = &rt->u.dst;
1405 netevent.new = &nrt->u.dst;
1406 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
/* The superseded cache clone is removed (body elided). */
1408 if (rt->rt6i_flags&RTF_CACHE) {
1414 dst_release(&rt->u.dst);
1419 * Handle ICMP "packet too big" messages
1420 * i.e. Path MTU discovery
/* Apply a received PMTU to the route toward daddr: clamp below 1280 per
 * RFC 2460 (ALLFRAG), update an existing RTF_CACHE entry in place, or
 * COW/clone a new host route carrying the reduced MTU with a 10-minute
 * expiry so PMTU increases can be rediscovered (RFC 1981). */
1423 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1424 struct net_device *dev, u32 pmtu)
1426 struct rt6_info *rt, *nrt;
1429 rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
/* Not actually a decrease: nothing to do. */
1433 if (pmtu >= dst_mtu(&rt->u.dst))
1436 if (pmtu < IPV6_MIN_MTU) {
1438 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1439 * MTU (1280) and a fragment header should always be included
1440 * after a node receiving Too Big message reporting PMTU is
1441 * less than the IPv6 Minimum Link MTU.
1443 pmtu = IPV6_MIN_MTU;
1447 /* New mtu received -> path was valid.
1448 They are sent only in response to data packets,
1449 so that this nexthop apparently is reachable. --ANK
1451 dst_confirm(&rt->u.dst);
1453 /* Host route. If it is static, it would be better
1454 not to override it, but add new one, so that
1455 when cache entry will expire old pmtu
1456 would return automatically.
1458 if (rt->rt6i_flags & RTF_CACHE) {
1459 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1461 rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1462 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1463 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1468 Two cases are possible:
1469 1. It is connected route. Action: COW
1470 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1472 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1473 nrt = rt6_alloc_cow(rt, daddr, saddr);
1475 nrt = rt6_alloc_clone(rt, daddr);
1478 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1480 nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1482 /* According to RFC 1981, detecting PMTU increase shouldn't be
1483 * happened within 5 mins, the recommended timer is 10 mins.
1484 * Here this route expiration time is set to ip6_rt_mtu_expires
1485 * which is 10 mins. After 10 mins the decreased pmtu is expired
1486 * and detecting PMTU increase will be automatically happened.
1488 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1489 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1494 dst_release(&rt->u.dst);
1498 * Misc support functions
1501 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1503 struct rt6_info *rt = ip6_dst_alloc();
1506 rt->u.dst.input = ort->u.dst.input;
1507 rt->u.dst.output = ort->u.dst.output;
1509 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1510 rt->u.dst.dev = ort->u.dst.dev;
1512 dev_hold(rt->u.dst.dev);
1513 rt->rt6i_idev = ort->rt6i_idev;
1515 in6_dev_hold(rt->rt6i_idev);
1516 rt->u.dst.lastuse = jiffies;
1517 rt->rt6i_expires = 0;
1519 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1520 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1521 rt->rt6i_metric = 0;
1523 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1524 #ifdef CONFIG_IPV6_SUBTREES
1525 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1527 rt->rt6i_table = ort->rt6i_table;
1532 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find an existing RFC 4191 learnt route (RTF_ROUTEINFO|RTF_GATEWAY) for
 * prefix/prefixlen via gwaddr on ifindex, in the INFO table.  Returns a
 * held reference or NULL.  Uses the write lock — presumably to serialize
 * with concurrent add/purge; TODO confirm against upstream. */
1533 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1534 struct in6_addr *gwaddr, int ifindex)
1536 struct fib6_node *fn;
1537 struct rt6_info *rt = NULL;
1538 struct fib6_table *table;
1540 table = fib6_get_table(RT6_TABLE_INFO);
1544 write_lock_bh(&table->tb6_lock);
1545 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1549 for (rt = fn->leaf; rt; rt = rt->u.next) {
1550 if (rt->rt6i_dev->ifindex != ifindex)
1552 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1554 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1556 dst_hold(&rt->u.dst);
1560 write_unlock_bh(&table->tb6_lock);
/* Install a learnt route-information route, then re-fetch it (held) via
 * rt6_get_route_info() so the caller gets a referenced entry. */
1564 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1565 struct in6_addr *gwaddr, int ifindex,
1568 struct in6_rtmsg rtmsg;
1570 memset(&rtmsg, 0, sizeof(rtmsg));
1571 rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1572 ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1573 rtmsg.rtmsg_dst_len = prefixlen;
1574 ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1575 rtmsg.rtmsg_metric = 1024;
1576 rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
1577 /* We should treat it as a default route if prefix length is 0. */
1579 rtmsg.rtmsg_flags |= RTF_DEFAULT;
1580 rtmsg.rtmsg_ifindex = ifindex;
1582 ip6_route_add(&rtmsg, NULL, NULL, NULL, RT6_TABLE_INFO);
1584 return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
/*
 * Find the RA-installed default router entry for gateway @addr on @dev
 * in the default-route table. Scans the root leaf chain for an entry
 * carrying both RTF_ADDRCONF and RTF_DEFAULT with a matching gateway;
 * on a hit takes a dst reference. Returns the route or NULL.
 * NOTE(review): missing lines in this extraction (the `break` on match,
 * the final return) -- confirm against the full source.
 */
1588 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1590 struct rt6_info *rt;
1591 struct fib6_table *table;
1593 table = fib6_get_table(RT6_TABLE_DFLT);
1597 write_lock_bh(&table->tb6_lock);
1598 for (rt = table->tb6_root.leaf; rt; rt=rt->u.next) {
1599 if (dev == rt->rt6i_dev &&
1600 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1601 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1605 dst_hold(&rt->u.dst);
1606 write_unlock_bh(&table->tb6_lock);
/*
 * Install a default router learned from a Router Advertisement.
 * Builds an in6_rtmsg flagged RTF_GATEWAY|RTF_ADDRCONF|RTF_DEFAULT|
 * RTF_EXPIRES (plus the router-preference bits, partially missing from
 * this extraction), adds it to RT6_TABLE_DFLT, then returns the entry
 * via rt6_get_dflt_router() so the caller holds a reference.
 * NOTE(review): the `pref` parameter line and the RTF_PREF(pref)
 * continuation after RTF_EXPIRES are missing here -- confirm.
 */
1610 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1611 struct net_device *dev,
1614 struct in6_rtmsg rtmsg;
1616 memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1617 rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1618 ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1619 rtmsg.rtmsg_metric = 1024;
1620 rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1623 rtmsg.rtmsg_ifindex = dev->ifindex;
1625 ip6_route_add(&rtmsg, NULL, NULL, NULL, RT6_TABLE_DFLT);
1626 return rt6_get_dflt_router(gwaddr, dev);
/*
 * Remove every addrconf/default route from the default-route table.
 * For each matching entry: take a dst reference, drop the read lock
 * (ip6_del_rt re-takes the write lock -- the delete call itself is
 * missing from this extraction), then restart the scan, since the
 * leaf chain may have changed while the lock was released.
 */
1629 void rt6_purge_dflt_routers(void)
1631 struct rt6_info *rt;
1632 struct fib6_table *table;
1634 /* NOTE: Keep consistent with rt6_get_dflt_router */
1635 table = fib6_get_table(RT6_TABLE_DFLT);
1640 read_lock_bh(&table->tb6_lock);
1641 for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
1642 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1643 dst_hold(&rt->u.dst);
/* Lock must be dropped before deleting; scan restarts afterwards. */
1644 read_unlock_bh(&table->tb6_lock);
1649 read_unlock_bh(&table->tb6_lock);
/*
 * ioctl entry point for SIOCADDRT/SIOCDELRT on IPv6.
 * Requires CAP_NET_ADMIN, copies the in6_rtmsg from userspace, and
 * dispatches to ip6_route_add() / ip6_route_del() respectively.
 * NOTE(review): the switch statement, rtnl locking, the copy_from_user
 * error check and the final return are missing from this extraction.
 */
1652 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1654 struct in6_rtmsg rtmsg;
1658 case SIOCADDRT: /* Add a route */
1659 case SIOCDELRT: /* Delete a route */
1660 if (!capable(CAP_NET_ADMIN))
1662 err = copy_from_user(&rtmsg, arg,
1663 sizeof(struct in6_rtmsg));
1670 err = ip6_route_add(&rtmsg, NULL, NULL, NULL,
1674 err = ip6_route_del(&rtmsg, NULL, NULL, NULL,
1689 * Drop the packet on the floor
/*
 * dst input handler for the null/blackhole route: count the drop
 * (INADDRERRORS for unroutable-by-definition addresses, OUTNOROUTES
 * otherwise -- the if/else structure is partly missing from this
 * extraction), send an ICMPv6 "no route" unreachable back, free the
 * skb and return 0 (kfree_skb/return lines missing here).
 */
1692 static int ip6_pkt_discard(struct sk_buff *skb)
1694 int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1695 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1696 IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1698 IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1699 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
/*
 * dst output handler for the null route: set skb->dev from the dst
 * (icmpv6_send in the discard path needs a device) and reuse the
 * input-side discard logic.
 */
1704 static int ip6_pkt_discard_out(struct sk_buff *skb)
1706 skb->dev = skb->dst->dev;
1707 return ip6_pkt_discard(skb);
1711 * Allocate a dst for local (unicast / anycast) address.
/*
 * Build a host route (plen 128) through the loopback device for a local
 * address owned by @idev. The route is marked RTF_UP|RTF_NONEXTHOP and
 * additionally RTF_ANYCAST or RTF_LOCAL (the `anycast` parameter and the
 * if/else around those flag lines are missing from this extraction).
 * Resolves a neighbour entry up front; on ndisc_get_neigh() failure the
 * dst is freed and -ENOMEM returned. The table is RT6_TABLE_LOCAL and
 * the dst starts with refcount 1. Returns the route or ERR_PTR(-ENOMEM).
 */
1714 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1715 const struct in6_addr *addr,
1718 struct rt6_info *rt = ip6_dst_alloc();
1721 return ERR_PTR(-ENOMEM);
/* Route pins loopback_dev; released when the dst is destroyed. */
1723 dev_hold(&loopback_dev);
1726 rt->u.dst.flags = DST_HOST;
1727 rt->u.dst.input = ip6_input;
1728 rt->u.dst.output = ip6_output;
1729 rt->rt6i_dev = &loopback_dev;
1730 rt->rt6i_idev = idev;
1731 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1732 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
/* -1 hop limit means "use the per-device/ namespace default". */
1733 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1734 rt->u.dst.obsolete = -1;
1736 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1738 rt->rt6i_flags |= RTF_ANYCAST;
1740 rt->rt6i_flags |= RTF_LOCAL;
1741 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1742 if (rt->rt6i_nexthop == NULL) {
1743 dst_free((struct dst_entry *) rt);
1744 return ERR_PTR(-ENOMEM);
1747 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1748 rt->rt6i_dst.plen = 128;
1749 rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1751 atomic_set(&rt->u.dst.__refcnt, 1);
/*
 * fib6_clean_all() callback for interface shutdown: select every route
 * on the given device (or every route when arg == NULL), except the
 * permanent null entry. Returning -1 asks the walker to delete the
 * route (the return lines are missing from this extraction).
 */
1756 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1758 if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1759 rt != &ip6_null_entry) {
1760 RT6_TRACE("deleted by ifdown %p\n", rt);
/*
 * Purge all routes referencing @dev from every fib6 table, called when
 * the device goes down/unregisters. Delegates to the fib6_ifdown
 * callback above via the table walker.
 */
1766 void rt6_ifdown(struct net_device *dev)
1768 fib6_clean_all(fib6_ifdown, 0, dev);
/*
 * Walker argument for rt6_mtu_change(): the device whose MTU changed
 * (an `unsigned mtu` member follows in the full source but is missing
 * from this extraction).
 */
1771 struct rt6_mtu_change_arg
1773 struct net_device *dev;
/*
 * fib6_clean_all() callback applying a device MTU change to one route.
 * Updates the route's RTAX_MTU/RTAX_ADVMSS when: the route is on the
 * changed device, the MTU metric is not administratively locked, and
 * either the new MTU is smaller than the route PMTU (new floor) or the
 * old device MTU was itself the route PMTU (so an increase propagates).
 * Always returns 0 (the return line is missing from this extraction).
 */
1777 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1779 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1780 struct inet6_dev *idev;
1782 /* In IPv6 pmtu discovery is not optional,
1783 so that RTAX_MTU lock cannot disable it.
1784 We still use this lock to block changes
1785 caused by addrconf/ndisc.
1788 idev = __in6_dev_get(arg->dev);
1792 /* For administrative MTU increase, there is no way to discover
1793 IPv6 PMTU increase, so PMTU increase should be updated here.
1794 Since RFC 1981 doesn't include administrative MTU increase
1795 update PMTU increase is a MUST. (i.e. jumbo frame)
1798 If new MTU is less than route PMTU, this new MTU will be the
1799 lowest MTU in the path, update the route PMTU to reflect PMTU
1800 decreases; if new MTU is greater than route PMTU, and the
1801 old MTU is the lowest MTU in the path, update the route PMTU
1802 to reflect the increase. In this case if the other nodes' MTU
1803 also have the lowest MTU, TOO BIG MESSAGE will be lead to
1806 if (rt->rt6i_dev == arg->dev &&
1807 !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1808 (dst_mtu(&rt->u.dst) > arg->mtu ||
1809 (dst_mtu(&rt->u.dst) < arg->mtu &&
1810 dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1811 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1812 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
/*
 * Propagate a device MTU change to all routes via the table walker
 * (initializer members of `arg` beyond the opening brace are missing
 * from this extraction; the full source sets .dev and .mtu).
 */
1816 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1818 struct rt6_mtu_change_arg arg = {
1823 fib6_clean_all(rt6_mtu_change_route, 0, &arg);
/*
 * Translate a netlink rtmsg + attribute array into the legacy
 * in6_rtmsg used by ip6_route_add()/ip6_route_del().
 * Validates attribute lengths (gateway = 16 bytes, dst/src at least
 * ceil(plen/8) bytes, oif = sizeof(int), priority = 4 bytes); the
 * `return -EINVAL` lines after each failed check, and the final
 * `return 0`, are missing from this extraction.
 */
1826 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1827 struct in6_rtmsg *rtmsg)
1829 memset(rtmsg, 0, sizeof(*rtmsg));
1831 rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1832 rtmsg->rtmsg_src_len = r->rtm_src_len;
1833 rtmsg->rtmsg_flags = RTF_UP;
1834 if (r->rtm_type == RTN_UNREACHABLE)
1835 rtmsg->rtmsg_flags |= RTF_REJECT;
1837 if (rta[RTA_GATEWAY-1]) {
1838 if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1840 memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1841 rtmsg->rtmsg_flags |= RTF_GATEWAY;
1843 if (rta[RTA_DST-1]) {
/* Only the first ceil(plen/8) bytes of the prefix are significant. */
1844 if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1846 memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1848 if (rta[RTA_SRC-1]) {
1849 if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1851 memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1853 if (rta[RTA_OIF-1]) {
1854 if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1856 memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1858 if (rta[RTA_PRIORITY-1]) {
1859 if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1861 memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
/*
 * RTM_DELROUTE netlink handler: convert the message to an in6_rtmsg
 * and delete the route from the table selected by rtm_table (or the
 * RTA_TABLE attribute). The error return after a failed conversion
 * is missing from this extraction.
 */
1866 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1868 struct rtmsg *r = NLMSG_DATA(nlh);
1869 struct in6_rtmsg rtmsg;
1871 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1873 return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb),
1874 rtm_get_table(arg, r->rtm_table));
/*
 * RTM_NEWROUTE netlink handler: mirror of inet6_rtm_delroute(), adding
 * the converted route instead. The error return after a failed
 * conversion is missing from this extraction.
 */
1877 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1879 struct rtmsg *r = NLMSG_DATA(nlh);
1880 struct in6_rtmsg rtmsg;
1882 if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1884 return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb),
1885 rtm_get_table(arg, r->rtm_table));
/*
 * Serialize one rt6_info into an RTM_* netlink message on @skb.
 * Emits the rtmsg header plus RTA_TABLE, DST/SRC (exact 128-bit form
 * when the caller supplied a looked-up @dst/@src, otherwise the stored
 * prefix), IIF or PREFSRC, metrics, GATEWAY, OIF, PRIORITY and
 * CACHEINFO. With @prefix set, non-RTF_PREFIX_RT routes are skipped
 * (treated as success). Uses the NLMSG_NEW/RTA_PUT macros, so failure
 * jumps to the nlmsg_failure/rtattr_failure labels which trim the skb
 * back to @b and return an error (label lines and several surrounding
 * if/else lines are missing from this extraction).
 */
1888 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1889 struct in6_addr *dst, struct in6_addr *src,
1890 int iif, int type, u32 pid, u32 seq,
1891 int prefix, unsigned int flags)
1894 struct nlmsghdr *nlh;
1895 unsigned char *b = skb->tail;
1896 struct rta_cacheinfo ci;
1899 if (prefix) { /* user wants prefix routes only */
1900 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1901 /* success since this is not a prefix route */
1906 nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1907 rtm = NLMSG_DATA(nlh);
1908 rtm->rtm_family = AF_INET6;
1909 rtm->rtm_dst_len = rt->rt6i_dst.plen;
1910 rtm->rtm_src_len = rt->rt6i_src.plen;
1913 table = rt->rt6i_table->tb6_id;
1915 table = RT6_TABLE_UNSPEC;
1916 rtm->rtm_table = table;
1917 RTA_PUT_U32(skb, RTA_TABLE, table);
1918 if (rt->rt6i_flags&RTF_REJECT)
1919 rtm->rtm_type = RTN_UNREACHABLE;
1920 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1921 rtm->rtm_type = RTN_LOCAL;
1923 rtm->rtm_type = RTN_UNICAST;
1925 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1926 rtm->rtm_protocol = rt->rt6i_protocol;
/* Dynamic/addrconf/default routes override the stored protocol. */
1927 if (rt->rt6i_flags&RTF_DYNAMIC)
1928 rtm->rtm_protocol = RTPROT_REDIRECT;
1929 else if (rt->rt6i_flags & RTF_ADDRCONF)
1930 rtm->rtm_protocol = RTPROT_KERNEL;
1931 else if (rt->rt6i_flags&RTF_DEFAULT)
1932 rtm->rtm_protocol = RTPROT_RA;
1934 if (rt->rt6i_flags&RTF_CACHE)
1935 rtm->rtm_flags |= RTM_F_CLONED;
1938 RTA_PUT(skb, RTA_DST, 16, dst);
1939 rtm->rtm_dst_len = 128;
1940 } else if (rtm->rtm_dst_len)
1941 RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1942 #ifdef CONFIG_IPV6_SUBTREES
1944 RTA_PUT(skb, RTA_SRC, 16, src);
1945 rtm->rtm_src_len = 128;
1946 } else if (rtm->rtm_src_len)
1947 RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1950 RTA_PUT(skb, RTA_IIF, 4, &iif);
1952 struct in6_addr saddr_buf;
1953 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1954 RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1956 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1957 goto rtattr_failure;
1958 if (rt->u.dst.neighbour)
1959 RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1961 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1962 RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
/* Times are exported in clock ticks relative to now. */
1963 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1964 if (rt->rt6i_expires)
1965 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1968 ci.rta_used = rt->u.dst.__use;
1969 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1970 ci.rta_error = rt->u.dst.error;
1974 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1975 nlh->nlmsg_len = skb->tail - b;
/* Failure path: roll the skb back to where this message started. */
1980 skb_trim(skb, b - skb->data);
/*
 * fib6 dump callback: emit one route into the dump skb via
 * rt6_fill_node(). Honors the RTM_F_PREFIX filter flag when the
 * request header is large enough to carry a struct rtmsg; otherwise
 * dumps everything (the `prefix = 0` default line is missing from
 * this extraction).
 */
1984 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1986 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1989 if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1990 struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1991 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1995 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1996 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1997 prefix, NLM_F_MULTI);
/*
 * RTM_GETROUTE handler: build a flow from the RTA_SRC/RTA_DST/RTA_IIF/
 * RTA_OIF attributes, run an actual route lookup via ip6_route_output(),
 * serialize the result with rt6_fill_node() and unicast it back to the
 * requester. A dummy skb with reserved header room stands in for a
 * real packet so the routing code can attach the dst to it.
 * NOTE(review): many lines are missing from this extraction (attribute
 * presence checks, alloc-failure and bad-ifindex error paths, the
 * kfree_skb on fill failure, the final return) -- confirm before
 * relying on the exact error handling.
 */
2000 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2002 struct rtattr **rta = arg;
2005 struct sk_buff *skb;
2007 struct rt6_info *rt;
2009 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2013 /* Reserve room for dummy headers, this skb can pass
2014 through good chunk of routing engine.
2016 skb->mac.raw = skb->data;
2017 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2019 memset(&fl, 0, sizeof(fl));
2021 ipv6_addr_copy(&fl.fl6_src,
2022 (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
2024 ipv6_addr_copy(&fl.fl6_dst,
2025 (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
2028 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
2031 struct net_device *dev;
/* Input-interface lookup only validates the ifindex exists. */
2032 dev = __dev_get_by_index(iif);
2041 memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
2043 rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
/* Ownership of the dst reference moves to the skb here. */
2045 skb->dst = &rt->u.dst;
2047 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2048 err = rt6_fill_node(skb, rt,
2049 &fl.fl6_dst, &fl.fl6_src,
2051 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2052 nlh->nlmsg_seq, 0, 0);
2058 err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
/*
 * Broadcast a route change (@event, e.g. RTM_NEWROUTE/RTM_DELROUTE)
 * to the RTNLGRP_IPV6_ROUTE multicast group. Allocates a netlink skb
 * sized for an rtmsg plus 256 bytes of attributes, fills it and
 * notifies; on any failure reports the error to listeners via
 * rtnl_set_sk_err. Alloc-failure/fill-failure branch lines are
 * missing from this extraction.
 */
2066 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh,
2067 struct netlink_skb_parms *req)
2069 struct sk_buff *skb;
2070 u32 pid = req ? req->pid : 0;
2071 u32 seq = nlh ? nlh->nlmsg_seq : 0;
/* Worst-case attribute estimate; rt6_fill_node can emit this much. */
2072 int payload = sizeof(struct rtmsg) + 256;
2075 skb = nlmsg_new(nlmsg_total_size(payload), gfp_any());
2079 err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2085 err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2088 rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2095 #ifdef CONFIG_PROC_FS
/* Fixed width of one /proc/net/ipv6_route line; used for seeking. */
2097 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
/*
 * fib6 walker callback printing one route as a fixed-width line of
 * /proc/net/ipv6_route: dst hex + plen, src hex + plen (zeros when
 * CONFIG_IPV6_SUBTREES is off), nexthop hex, metric/refcnt/use/flags
 * and device name. Skips entries before the requested offset and stops
 * once the supplied buffer is full. Several arg->len increments and
 * the return statements are missing from this extraction.
 */
2108 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2110 struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2113 if (arg->skip < arg->offset / RT6_INFO_LEN) {
2118 if (arg->len >= arg->length)
2121 for (i=0; i<16; i++) {
2122 sprintf(arg->buffer + arg->len, "%02x",
2123 rt->rt6i_dst.addr.s6_addr[i]);
2126 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2129 #ifdef CONFIG_IPV6_SUBTREES
2130 for (i=0; i<16; i++) {
2131 sprintf(arg->buffer + arg->len, "%02x",
2132 rt->rt6i_src.addr.s6_addr[i]);
2135 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2138 sprintf(arg->buffer + arg->len,
2139 "00000000000000000000000000000000 00 ");
2143 if (rt->rt6i_nexthop) {
2144 for (i=0; i<16; i++) {
2145 sprintf(arg->buffer + arg->len, "%02x",
2146 rt->rt6i_nexthop->primary_key[i]);
2150 sprintf(arg->buffer + arg->len,
2151 "00000000000000000000000000000000");
2154 arg->len += sprintf(arg->buffer + arg->len,
2155 " %08x %08x %08x %08x %8s\n",
2156 rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2157 rt->u.dst.__use, rt->rt6i_flags,
2158 rt->rt6i_dev ? rt->rt6i_dev->name : "");
/*
 * Legacy procfs read handler for /proc/net/ipv6_route: walk all tables
 * with rt6_info_route filling `arg`, then adjust *start and the
 * reported length for the sub-line remainder of the requested offset.
 * The arg initializer members and the final return are missing from
 * this extraction.
 */
2162 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2164 struct rt6_proc_arg arg = {
2170 fib6_clean_all(rt6_info_route, 0, &arg);
2174 *start += offset % RT6_INFO_LEN;
2176 arg.len -= offset % RT6_INFO_LEN;
2178 if (arg.len > length)
/*
 * seq_file show handler for /proc/net/rt6_stats: one line of seven
 * hex counters -- fib node/route-node counts, allocation and entry
 * counts, cache size, live dst entries, and discarded routes.
 */
2186 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2188 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2189 rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2190 rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2191 rt6_stats.fib_rt_cache,
2192 atomic_read(&ip6_dst_ops.entries),
2193 rt6_stats.fib_discarded_routes);
/* Open handler: single_open() suffices since the stats fit one page. */
2198 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2200 return single_open(file, rt6_stats_seq_show, NULL);
/*
 * File operations for /proc/net/rt6_stats (the .read = seq_read member
 * is missing from this extraction).
 */
2203 static struct file_operations rt6_stats_seq_fops = {
2204 .owner = THIS_MODULE,
2205 .open = rt6_stats_seq_open,
2207 .llseek = seq_lseek,
2208 .release = single_release,
2210 #endif /* CONFIG_PROC_FS */
2212 #ifdef CONFIG_SYSCTL
/* Backing store for the net.ipv6.route.flush sysctl. */
2214 static int flush_delay;
/*
 * Handler for the "flush" sysctl: on write, parse the integer and run
 * the fib6 garbage collector -- a non-positive delay (~0UL) forces an
 * immediate full flush. The write/!write branch and return statements
 * are missing from this extraction.
 */
2217 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2218 void __user *buffer, size_t *lenp, loff_t *ppos)
2221 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2222 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
/*
 * net.ipv6.route.* sysctl table. Plain integers use proc_dointvec;
 * time-valued knobs are stored in jiffies and exposed in seconds via
 * proc_dointvec_jiffies (or milliseconds via the _ms variant for
 * gc_min_interval_ms, which aliases the same variable as
 * gc_min_interval). Per-entry .mode values, closing braces and the
 * terminating empty entry are missing from this extraction.
 */
2228 ctl_table ipv6_route_table[] = {
2230 .ctl_name = NET_IPV6_ROUTE_FLUSH,
2231 .procname = "flush",
2232 .data = &flush_delay,
2233 .maxlen = sizeof(int),
2235 .proc_handler = &ipv6_sysctl_rtcache_flush
2238 .ctl_name = NET_IPV6_ROUTE_GC_THRESH,
2239 .procname = "gc_thresh",
2240 .data = &ip6_dst_ops.gc_thresh,
2241 .maxlen = sizeof(int),
2243 .proc_handler = &proc_dointvec,
2246 .ctl_name = NET_IPV6_ROUTE_MAX_SIZE,
2247 .procname = "max_size",
2248 .data = &ip6_rt_max_size,
2249 .maxlen = sizeof(int),
2251 .proc_handler = &proc_dointvec,
2254 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2255 .procname = "gc_min_interval",
2256 .data = &ip6_rt_gc_min_interval,
2257 .maxlen = sizeof(int),
2259 .proc_handler = &proc_dointvec_jiffies,
2260 .strategy = &sysctl_jiffies,
2263 .ctl_name = NET_IPV6_ROUTE_GC_TIMEOUT,
2264 .procname = "gc_timeout",
2265 .data = &ip6_rt_gc_timeout,
2266 .maxlen = sizeof(int),
2268 .proc_handler = &proc_dointvec_jiffies,
2269 .strategy = &sysctl_jiffies,
2272 .ctl_name = NET_IPV6_ROUTE_GC_INTERVAL,
2273 .procname = "gc_interval",
2274 .data = &ip6_rt_gc_interval,
2275 .maxlen = sizeof(int),
2277 .proc_handler = &proc_dointvec_jiffies,
2278 .strategy = &sysctl_jiffies,
2281 .ctl_name = NET_IPV6_ROUTE_GC_ELASTICITY,
2282 .procname = "gc_elasticity",
2283 .data = &ip6_rt_gc_elasticity,
2284 .maxlen = sizeof(int),
2286 .proc_handler = &proc_dointvec_jiffies,
2287 .strategy = &sysctl_jiffies,
2290 .ctl_name = NET_IPV6_ROUTE_MTU_EXPIRES,
2291 .procname = "mtu_expires",
2292 .data = &ip6_rt_mtu_expires,
2293 .maxlen = sizeof(int),
2295 .proc_handler = &proc_dointvec_jiffies,
2296 .strategy = &sysctl_jiffies,
2299 .ctl_name = NET_IPV6_ROUTE_MIN_ADVMSS,
2300 .procname = "min_adv_mss",
2301 .data = &ip6_rt_min_advmss,
2302 .maxlen = sizeof(int),
2304 .proc_handler = &proc_dointvec_jiffies,
2305 .strategy = &sysctl_jiffies,
2308 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
/* Same variable as gc_min_interval, but tunable in milliseconds. */
2309 .procname = "gc_min_interval_ms",
2310 .data = &ip6_rt_gc_min_interval,
2311 .maxlen = sizeof(int),
2313 .proc_handler = &proc_dointvec_ms_jiffies,
2314 .strategy = &sysctl_ms_jiffies,
/*
 * Boot-time initialization of the IPv6 routing subsystem: create the
 * rt6_info slab cache (panic on failure -- routing cannot work without
 * it), register the /proc/net/ipv6_route and /proc/net/rt6_stats
 * entries, and (with multiple tables) initialize fib6 rules. The
 * fib6_init()/fib6_rules_init() calls and xfrm init present in the
 * full source are missing from this extraction.
 */
2321 void __init ip6_route_init(void)
2323 struct proc_dir_entry *p;
2325 ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2326 sizeof(struct rt6_info),
2327 0, SLAB_HWCACHE_ALIGN,
2329 if (!ip6_dst_ops.kmem_cachep)
2330 panic("cannot create ip6_dst_cache");
2333 #ifdef CONFIG_PROC_FS
2334 p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2336 p->owner = THIS_MODULE;
2338 proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2343 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/*
 * Teardown counterpart of ip6_route_init(): remove fib6 rules and the
 * proc entries, then destroy the dst slab cache. Intermediate cleanup
 * calls (e.g. fib6_gc_cleanup) present in the full source are missing
 * from this extraction.
 */
2348 void ip6_route_cleanup(void)
2350 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2351 fib6_rules_cleanup();
2353 #ifdef CONFIG_PROC_FS
2354 proc_net_remove("ipv6_route");
2355 proc_net_remove("rt6_stats");
2362 kmem_cache_destroy(ip6_dst_ops.kmem_cachep);