2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
18 * YOSHIFUJI Hideaki @USAGI
19 * reworked default router selection.
20 * - respect outgoing interface
21 * - select from (probably) reachable routers (i.e.
22 * routers in REACHABLE, STALE, DELAY or PROBE states).
23 * - always select the same router if it is (probably)
24 * reachable. otherwise, round-robin the list.
26 * Fixed routing subtrees.
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
54 #include <linux/rtnetlink.h>
57 #include <net/netevent.h>
58 #include <net/netlink.h>
60 #include <asm/uaccess.h>
63 #include <linux/sysctl.h>
66 /* Set to 3 to get tracing. */
70 #define RDBG(x) printk x
71 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
74 #define RT6_TRACE(x...) do { ; } while (0)
77 #define CLONE_OFFLINK_ROUTE 0
79 static int ip6_rt_max_size = 4096;
80 static int ip6_rt_gc_min_interval = HZ / 2;
81 static int ip6_rt_gc_timeout = 60*HZ;
82 int ip6_rt_gc_interval = 30*HZ;
83 static int ip6_rt_gc_elasticity = 9;
84 static int ip6_rt_mtu_expires = 10*60*HZ;
85 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
88 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
89 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
90 static void ip6_dst_destroy(struct dst_entry *);
91 static void ip6_dst_ifdown(struct dst_entry *,
92 struct net_device *dev, int how);
93 static int ip6_dst_gc(void);
95 static int ip6_pkt_discard(struct sk_buff *skb);
96 static int ip6_pkt_discard_out(struct sk_buff *skb);
97 static void ip6_link_failure(struct sk_buff *skb);
98 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100 #ifdef CONFIG_IPV6_ROUTE_INFO
101 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
102 struct in6_addr *gwaddr, int ifindex,
104 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
105 struct in6_addr *gwaddr, int ifindex);
/* dst_ops vtable: hooks IPv6 routing-cache entries (struct rt6_info) into
 * the generic dst cache layer.  NOTE(review): some initializer lines are
 * elided from this excerpt (kernel line numbers jump). */
108 static struct dst_ops ip6_dst_ops = {
110 .protocol = __constant_htons(ETH_P_IPV6),
113 .check = ip6_dst_check,
114 .destroy = ip6_dst_destroy,
115 .ifdown = ip6_dst_ifdown,
116 .negative_advice = ip6_negative_advice,
117 .link_failure = ip6_link_failure,
118 .update_pmtu = ip6_rt_update_pmtu,
119 .entry_size = sizeof(struct rt6_info),
/* Statically allocated "no route" entry: returned by lookups that fail.
 * Input/output handlers discard the packet; dst error is -ENETUNREACH.
 * Never freed (refcnt starts at 1, metric is the worst possible). */
122 struct rt6_info ip6_null_entry = {
125 .__refcnt = ATOMIC_INIT(1),
127 .dev = &loopback_dev,
129 .error = -ENETUNREACH,
130 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
131 .input = ip6_pkt_discard,
132 .output = ip6_pkt_discard_out,
134 .path = (struct dst_entry*)&ip6_null_entry,
137 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
138 .rt6i_metric = ~(u32) 0,
139 .rt6i_ref = ATOMIC_INIT(1),
142 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* Static "prohibited" route entry used by policy-routing actions
 * (CONFIG_IPV6_MULTIPLE_TABLES only).  NOTE(review): the .error field is
 * among the elided lines; presumably -EACCES — confirm in full source. */
144 struct rt6_info ip6_prohibit_entry = {
147 .__refcnt = ATOMIC_INIT(1),
149 .dev = &loopback_dev,
152 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
153 .input = ip6_pkt_discard,
154 .output = ip6_pkt_discard_out,
156 .path = (struct dst_entry*)&ip6_prohibit_entry,
159 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
160 .rt6i_metric = ~(u32) 0,
161 .rt6i_ref = ATOMIC_INIT(1),
/* Static "blackhole" route entry: silently discards matching traffic
 * (CONFIG_IPV6_MULTIPLE_TABLES only). */
164 struct rt6_info ip6_blk_hole_entry = {
167 .__refcnt = ATOMIC_INIT(1),
169 .dev = &loopback_dev,
172 .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
173 .input = ip6_pkt_discard,
174 .output = ip6_pkt_discard_out,
176 .path = (struct dst_entry*)&ip6_blk_hole_entry,
179 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
180 .rt6i_metric = ~(u32) 0,
181 .rt6i_ref = ATOMIC_INIT(1),
186 /* allocate dst with ip6_dst_ops */
/* Allocate a fresh rt6_info from the shared dst cache, wired to the
 * IPv6 dst_ops table above. */
187 static __inline__ struct rt6_info *ip6_dst_alloc(void)
189 return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
/* dst_ops->destroy hook: drop the inet6_dev reference held by the route.
 * NOTE(review): the in6_dev_put() call is among the elided lines. */
192 static void ip6_dst_destroy(struct dst_entry *dst)
194 struct rt6_info *rt = (struct rt6_info *)dst;
195 struct inet6_dev *idev = rt->rt6i_idev;
198 rt->rt6i_idev = NULL;
/* dst_ops->ifdown hook: when the route's device goes away, re-point the
 * route's inet6_dev at the loopback device so the entry stays valid until
 * it is garbage collected. */
203 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
206 struct rt6_info *rt = (struct rt6_info *)dst;
207 struct inet6_dev *idev = rt->rt6i_idev;
209 if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
210 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
211 if (loopback_idev != NULL) {
212 rt->rt6i_idev = loopback_idev;
/* True if the route carries an expiry (RTF_EXPIRES) and that deadline
 * has passed. */
218 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
220 return (rt->rt6i_flags & RTF_EXPIRES &&
221 time_after(jiffies, rt->rt6i_expires));
/* Multicast and link-local destinations are only meaningful relative to a
 * specific interface, so lookups for them must match the oif strictly. */
224 static inline int rt6_need_strict(struct in6_addr *daddr)
226 return (ipv6_addr_type(daddr) &
227 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
231 * Route lookup. Any table->tb6_lock is implied.
/* Walk the sibling list of routes sharing a fib6 node and pick the one
 * whose device matches oif; loopback routes match via their idev's
 * ifindex instead.  Falls back to ip6_null_entry when a strict match is
 * required but none is found.  NOTE(review): the strict/local fallback
 * returns are among the elided lines — consult the full source. */
234 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
238 struct rt6_info *local = NULL;
239 struct rt6_info *sprt;
242 for (sprt = rt; sprt; sprt = sprt->u.next) {
243 struct net_device *dev = sprt->rt6i_dev;
244 if (dev->ifindex == oif)
246 if (dev->flags & IFF_LOOPBACK) {
247 if (sprt->rt6i_idev == NULL ||
248 sprt->rt6i_idev->dev->ifindex != oif) {
251 if (local && (!oif ||
252 local->rt6i_idev->dev->ifindex == oif))
263 return &ip6_null_entry;
268 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Router Reachability Probing (CONFIG_IPV6_ROUTER_PREF): if the next-hop
 * neighbour is not in a VALID NUD state and the per-device probe interval
 * has elapsed, send a unicast-solicit NS to the router.  Updating
 * neigh->updated first rate-limits concurrent probes. */
269 static void rt6_probe(struct rt6_info *rt)
271 struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
273 * Okay, this does not seem to be appropriate
274 * for now, however, we need to check if it
275 * is really so; aka Router Reachability Probing.
277 * Router Reachability Probe MUST be rate-limited
278 * to no more than one per minute.
280 if (!neigh || (neigh->nud_state & NUD_VALID))
282 read_lock_bh(&neigh->lock);
283 if (!(neigh->nud_state & NUD_VALID) &&
284 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
285 struct in6_addr mcaddr;
286 struct in6_addr *target;
288 neigh->updated = jiffies;
289 read_unlock_bh(&neigh->lock);
291 target = (struct in6_addr *)&neigh->primary_key;
292 addrconf_addr_solict_mult(target, &mcaddr);
293 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
295 read_unlock_bh(&neigh->lock);
298 static inline void rt6_probe(struct rt6_info *rt)
305 * Default Router Selection (RFC 2461 6.3.6)
/* Score the route's device against the requested oif: non-zero when the
 * device matches (or no oif was given); loopback routes match through
 * their idev's ifindex. */
307 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
309 struct net_device *dev = rt->rt6i_dev;
310 if (!oif || dev->ifindex == oif)
312 if ((dev->flags & IFF_LOOPBACK) &&
313 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Score next-hop reachability for router selection: non-gateway and
 * NONEXTHOP routes count as reachable; otherwise consult the neighbour's
 * NUD state under its read lock. */
318 static int inline rt6_check_neigh(struct rt6_info *rt)
320 struct neighbour *neigh = rt->rt6i_nexthop;
322 if (rt->rt6i_flags & RTF_NONEXTHOP ||
323 !(rt->rt6i_flags & RTF_GATEWAY))
326 read_lock_bh(&neigh->lock);
327 if (neigh->nud_state & NUD_VALID)
329 read_unlock_bh(&neigh->lock);
/* Combine device match, RFC 4191 router preference, and neighbour
 * reachability into a single comparable score for rt6_select().
 * Returns a negative value when a strict requirement (IFACE/REACHABLE)
 * is not met. */
334 static int rt6_score_route(struct rt6_info *rt, int oif,
339 m = rt6_check_dev(rt, oif);
340 if (!m && (strict & RT6_LOOKUP_F_IFACE))
342 #ifdef CONFIG_IPV6_ROUTER_PREF
343 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
345 n = rt6_check_neigh(rt);
348 else if (!n && strict & RT6_LOOKUP_F_REACHABLE)
/* Default router selection (RFC 2461 6.3.6): scan all routes of the same
 * metric, skip expired ones, and keep the best-scoring candidate.  When
 * reachability was required but nothing matched, rotate the list head
 * (round-robin) under a private spinlock so the next lookup tries a
 * different router.  Returns ip6_null_entry when nothing matches. */
353 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
356 struct rt6_info *match = NULL, *last = NULL;
357 struct rt6_info *rt, *rt0 = *head;
361 RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
362 __FUNCTION__, head, head ? *head : NULL, oif);
364 for (rt = rt0, metric = rt0->rt6i_metric;
365 rt && rt->rt6i_metric == metric && (!last || rt != rt0);
369 if (rt6_check_expired(rt))
374 m = rt6_score_route(rt, oif, strict);
388 (strict & RT6_LOOKUP_F_REACHABLE) &&
389 last && last != rt0) {
390 /* no entries matched; do round-robin */
391 static DEFINE_SPINLOCK(lock);
394 rt0->u.next = last->u.next;
399 RT6_TRACE("%s() => %p, score=%d\n",
400 __FUNCTION__, match, mpri);
402 return (match ? match : &ip6_null_entry);
405 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option from a Router Advertisement
 * (RFC 4191, CONFIG_IPV6_ROUTE_INFO): validate option length against
 * prefix_len, clamp lifetime to avoid jiffies overflow, then add, update
 * or delete (lifetime == 0) the corresponding RTF_ROUTEINFO route. */
406 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
407 struct in6_addr *gwaddr)
409 struct route_info *rinfo = (struct route_info *) opt;
410 struct in6_addr prefix_buf, *prefix;
415 if (len < sizeof(struct route_info)) {
419 /* Sanity check for prefix_len and length */
420 if (rinfo->length > 3) {
422 } else if (rinfo->prefix_len > 128) {
424 } else if (rinfo->prefix_len > 64) {
425 if (rinfo->length < 2) {
428 } else if (rinfo->prefix_len > 0) {
429 if (rinfo->length < 1) {
434 pref = rinfo->route_pref;
435 if (pref == ICMPV6_ROUTER_PREF_INVALID)
436 pref = ICMPV6_ROUTER_PREF_MEDIUM;
/* NOTE(review): htonl here converts the wire-order lifetime to host
 * order on little-endian; presumably should read ntohl — same bits,
 * but confirm against the full source. */
438 lifetime = htonl(rinfo->lifetime);
439 if (lifetime == 0xffffffff) {
441 } else if (lifetime > 0x7fffffff/HZ) {
442 /* Avoid arithmetic overflow */
443 lifetime = 0x7fffffff/HZ - 1;
446 if (rinfo->length == 3)
447 prefix = (struct in6_addr *)rinfo->prefix;
449 /* this function is safe */
450 ipv6_addr_prefix(&prefix_buf,
451 (struct in6_addr *)rinfo->prefix,
453 prefix = &prefix_buf;
456 rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);
458 if (rt && !lifetime) {
464 rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
467 rt->rt6i_flags = RTF_ROUTEINFO |
468 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
471 if (lifetime == 0xffffffff) {
472 rt->rt6i_flags &= ~RTF_EXPIRES;
474 rt->rt6i_expires = jiffies + HZ * lifetime;
475 rt->rt6i_flags |= RTF_EXPIRES;
477 dst_release(&rt->u.dst);
/* Lookup-failure backtracking macro shared by the ip6_pol_route_*
 * functions: when the selected route is ip6_null_entry, climb toward the
 * tree root, re-descending into source-address subtrees (FIB6_SUBTREE)
 * on the way, until a node with route info is found or the root is hit.
 * Expects `rt` and `fn` in the caller's scope and jumps to caller labels
 * (elided in this excerpt). */
483 #define BACKTRACK(saddr) \
485 if (rt == &ip6_null_entry) { \
486 struct fib6_node *pn; \
488 if (fn->fn_flags & RTN_TL_ROOT) \
491 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
492 fn = fib6_lookup(pn->subtree, NULL, saddr); \
495 if (fn->fn_flags & RTN_RTINFO) \
/* Policy-rule lookup callback (no cloning): find the fib6 node for the
 * flow, match the device, backtrack on failure, and return the route
 * with a reference held and lastuse refreshed. */
501 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
502 struct flowi *fl, int flags)
504 struct fib6_node *fn;
507 read_lock_bh(&table->tb6_lock);
508 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
511 rt = rt6_device_match(rt, fl->oif, flags);
512 BACKTRACK(&fl->fl6_src);
514 dst_hold(&rt->u.dst);
515 read_unlock_bh(&table->tb6_lock);
517 rt->u.dst.lastuse = jiffies;
/* Public route lookup: build a flowi from daddr/saddr/oif and dispatch
 * through the policy-rule engine to ip6_pol_route_lookup.  `strict`
 * forces an interface match.  Caller must release the returned dst. */
524 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
535 struct dst_entry *dst;
536 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
539 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
540 flags |= RT6_LOOKUP_F_HAS_SADDR;
543 dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
545 return (struct rt6_info *) dst;
552 /* ip6_ins_rt is called with FREE table->tb6_lock.
553 It takes new route entry, the addition fails by any reason the
554 route is freed. In any case, if caller does not hold it, it may
/* Insert a route into its table under the table write lock.  On failure
 * fib6_add frees the entry (see comment above in the original file). */
558 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
561 struct fib6_table *table;
563 table = rt->rt6i_table;
564 write_lock_bh(&table->tb6_lock);
565 err = fib6_add(&table->tb6_root, rt, info);
566 write_unlock_bh(&table->tb6_lock);
/* Convenience wrapper: insert without netlink notification info. */
571 int ip6_ins_rt(struct rt6_info *rt)
573 return __ip6_ins_rt(rt, NULL);
/* Clone a connected (non-gateway) route into a /128 RTF_CACHE host route
 * for daddr: the destination itself becomes the gateway, RTF_ANYCAST is
 * set when daddr is the subnet-anycast address, and a neighbour entry is
 * attached.  With subtrees, the source key is narrowed to /128 too. */
576 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
577 struct in6_addr *saddr)
585 rt = ip6_rt_copy(ort);
588 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
589 if (rt->rt6i_dst.plen != 128 &&
590 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
591 rt->rt6i_flags |= RTF_ANYCAST;
592 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
595 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
596 rt->rt6i_dst.plen = 128;
597 rt->rt6i_flags |= RTF_CACHE;
598 rt->u.dst.flags |= DST_HOST;
600 #ifdef CONFIG_IPV6_SUBTREES
601 if (rt->rt6i_src.plen && saddr) {
602 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
603 rt->rt6i_src.plen = 128;
607 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
/* Clone a gatewayed/NONEXTHOP route into a /128 RTF_CACHE host route,
 * reusing (cloning) the original's neighbour entry instead of resolving
 * a new one. */
614 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
616 struct rt6_info *rt = ip6_rt_copy(ort);
618 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
619 rt->rt6i_dst.plen = 128;
620 rt->rt6i_flags |= RTF_CACHE;
621 rt->u.dst.flags |= DST_HOST;
622 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
/* Input-path policy lookup: select the best route (rt6_select, keyed on
 * the incoming interface iif), backtrack on failure, and — unless the
 * result is already an RTF_CACHE clone — create a per-destination clone
 * (COW for connected routes, plain clone for off-link when
 * CLONE_OFFLINK_ROUTE) and insert it outside the table lock.  The
 * relookup comment covers the race where another CPU inserted the clone
 * first.  NOTE(review): the retry/goto structure is partly elided here. */
627 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
628 struct flowi *fl, int flags)
630 struct fib6_node *fn;
631 struct rt6_info *rt, *nrt;
635 int reachable = RT6_LOOKUP_F_REACHABLE;
637 strict |= flags & RT6_LOOKUP_F_IFACE;
640 read_lock_bh(&table->tb6_lock);
643 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
646 rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
647 BACKTRACK(&fl->fl6_src);
648 if (rt == &ip6_null_entry ||
649 rt->rt6i_flags & RTF_CACHE)
652 dst_hold(&rt->u.dst);
653 read_unlock_bh(&table->tb6_lock);
655 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
656 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
658 #if CLONE_OFFLINK_ROUTE
659 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
665 dst_release(&rt->u.dst);
666 rt = nrt ? : &ip6_null_entry;
668 dst_hold(&rt->u.dst);
670 err = ip6_ins_rt(nrt);
679 * Race condition! In the gap, when table->tb6_lock was
680 * released someone could insert this route. Relookup.
682 dst_release(&rt->u.dst);
690 dst_hold(&rt->u.dst);
691 read_unlock_bh(&table->tb6_lock);
693 rt->u.dst.lastuse = jiffies;
/* Route an incoming packet: build a flowi from the IPv6 header and the
 * receiving device, force strict interface matching for multicast /
 * link-local destinations, and store the result in skb->dst. */
699 void ip6_route_input(struct sk_buff *skb)
701 struct ipv6hdr *iph = skb->nh.ipv6h;
702 int flags = RT6_LOOKUP_F_HAS_SADDR;
704 .iif = skb->dev->ifindex,
709 #ifdef CONFIG_IPV6_ROUTE_FWMARK
710 .fwmark = skb->nfmark,
712 .flowlabel = (* (u32 *) iph)&IPV6_FLOWINFO_MASK,
715 .proto = iph->nexthdr,
718 if (rt6_need_strict(&iph->daddr))
719 flags |= RT6_LOOKUP_F_IFACE;
721 skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
/* Output-path policy lookup: mirror of ip6_pol_route_input, but router
 * selection is keyed on the outgoing interface (fl->oif).  Same
 * clone-and-insert logic and same insertion race handling.
 * NOTE(review): the retry/goto structure is partly elided here. */
724 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
725 struct flowi *fl, int flags)
727 struct fib6_node *fn;
728 struct rt6_info *rt, *nrt;
732 int reachable = RT6_LOOKUP_F_REACHABLE;
734 strict |= flags & RT6_LOOKUP_F_IFACE;
737 read_lock_bh(&table->tb6_lock);
740 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
743 rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
744 BACKTRACK(&fl->fl6_src);
745 if (rt == &ip6_null_entry ||
746 rt->rt6i_flags & RTF_CACHE)
749 dst_hold(&rt->u.dst);
750 read_unlock_bh(&table->tb6_lock);
752 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
753 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
755 #if CLONE_OFFLINK_ROUTE
756 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
762 dst_release(&rt->u.dst);
763 rt = nrt ? : &ip6_null_entry;
765 dst_hold(&rt->u.dst);
767 err = ip6_ins_rt(nrt);
776 * Race condition! In the gap, when table->tb6_lock was
777 * released someone could insert this route. Relookup.
779 dst_release(&rt->u.dst);
787 dst_hold(&rt->u.dst);
788 read_unlock_bh(&table->tb6_lock);
790 rt->u.dst.lastuse = jiffies;
/* Route an outgoing flow: set strict-interface and has-saddr flags as
 * appropriate, then dispatch to ip6_pol_route_output via the rule
 * engine.  Returns a held dst_entry. */
795 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
799 if (rt6_need_strict(&fl->fl6_dst))
800 flags |= RT6_LOOKUP_F_IFACE;
802 if (!ipv6_addr_any(&fl->fl6_src))
803 flags |= RT6_LOOKUP_F_HAS_SADDR;
805 return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
810 * Destination cache support functions
/* dst_ops->check hook: a cached route is still valid only while its fib6
 * node's serial number matches the cookie taken at cache time. */
813 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
817 rt = (struct rt6_info *) dst;
819 if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
/* dst_ops->negative_advice hook: sockets report a dead path here; cached
 * clones are deleted (elided lines presumably call ip6_del_rt — confirm
 * in the full source). */
825 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
827 struct rt6_info *rt = (struct rt6_info *) dst;
830 if (rt->rt6i_flags & RTF_CACHE)
/* dst_ops->link_failure hook: send ICMPv6 address-unreachable back to the
 * sender, then expire the cached clone immediately, or invalidate the
 * fib6 node's serial number for default routes so cached users re-check. */
838 static void ip6_link_failure(struct sk_buff *skb)
842 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
844 rt = (struct rt6_info *) skb->dst;
846 if (rt->rt6i_flags&RTF_CACHE) {
847 dst_set_expires(&rt->u.dst, 0);
848 rt->rt6i_flags |= RTF_EXPIRES;
849 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
850 rt->rt6i_node->fn_sernum = -1;
/* dst_ops->update_pmtu hook: lower the cached MTU on a /128 host route.
 * Below IPV6_MIN_MTU (1280) the MTU is not reduced further; instead the
 * ALLFRAG feature is set so a fragment header is always emitted
 * (RFC 2460 behaviour for too-small Packet Too Big reports). */
854 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
856 struct rt6_info *rt6 = (struct rt6_info*)dst;
858 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
859 rt6->rt6i_flags |= RTF_MODIFIED;
860 if (mtu < IPV6_MIN_MTU) {
862 dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
864 dst->metrics[RTAX_MTU-1] = mtu;
865 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
869 static int ipv6_get_mtu(struct net_device *dev);
/* Derive the advertised TCP MSS metric from a link MTU: subtract the
 * IPv6 + TCP header sizes, clamp to the configured minimum, and cap per
 * the IPV6_MAXPLEN convention described in the comment below. */
871 static inline unsigned int ipv6_advmss(unsigned int mtu)
873 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
875 if (mtu < ip6_rt_min_advmss)
876 mtu = ip6_rt_min_advmss;
879 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
880 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
881 * IPV6_MAXPLEN is also valid and means: "any MSS,
882 * rely only on pmtu discovery"
884 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
889 static struct dst_entry *ndisc_dst_gc_list;
890 static DEFINE_SPINLOCK(ndisc_lock);
/* Allocate a standalone dst for neighbour-discovery packets (not in any
 * fib table): resolve/attach the neighbour, fill in hoplimit/MTU/advmss
 * metrics, chain the entry onto ndisc_dst_gc_list under ndisc_lock, and
 * kick the fib6 GC so it is eventually reclaimed. */
892 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
893 struct neighbour *neigh,
894 struct in6_addr *addr,
895 int (*output)(struct sk_buff *))
898 struct inet6_dev *idev = in6_dev_get(dev);
900 if (unlikely(idev == NULL))
903 rt = ip6_dst_alloc();
904 if (unlikely(rt == NULL)) {
913 neigh = ndisc_get_neigh(dev, addr);
916 rt->rt6i_idev = idev;
917 rt->rt6i_nexthop = neigh;
918 atomic_set(&rt->u.dst.__refcnt, 1);
919 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
920 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
921 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
922 rt->u.dst.output = output;
924 #if 0 /* there's no chance to use these for ndisc */
925 rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
928 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
929 rt->rt6i_dst.plen = 128;
932 spin_lock_bh(&ndisc_lock);
933 rt->u.dst.next = ndisc_dst_gc_list;
934 ndisc_dst_gc_list = &rt->u.dst;
935 spin_unlock_bh(&ndisc_lock);
937 fib6_force_start_gc();
940 return (struct dst_entry *)rt;
/* Garbage-collect the ndisc dst list: walk it under ndisc_lock and free
 * entries whose refcount has dropped to zero; `more` reports how many
 * remain (counting lines elided in this excerpt). */
943 int ndisc_dst_gc(int *more)
945 struct dst_entry *dst, *next, **pprev;
951 spin_lock_bh(&ndisc_lock);
952 pprev = &ndisc_dst_gc_list;
954 while ((dst = *pprev) != NULL) {
955 if (!atomic_read(&dst->__refcnt)) {
965 spin_unlock_bh(&ndisc_lock);
/* dst_ops->gc hook: rate-limited by ip6_rt_gc_min_interval unless the
 * cache exceeds ip6_rt_max_size; adaptively shrinks the expiry window
 * (`expire`) while under pressure and resets it when the cache is small.
 * Returns non-zero when the cache is still over the size limit. */
970 static int ip6_dst_gc(void)
972 static unsigned expire = 30*HZ;
973 static unsigned long last_gc;
974 unsigned long now = jiffies;
976 if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
977 atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
983 if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
984 expire = ip6_rt_gc_timeout>>1;
987 expire -= expire>>ip6_rt_gc_elasticity;
988 return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
991 /* Clean host part of a prefix. Not necessary in radix tree,
992 but results in cleaner routing tables.
994 Remove it only when all the things will work!
/* Read the device's IPv6 MTU (cnf.mtu6), defaulting to IPV6_MIN_MTU
 * when no inet6_dev exists.  NOTE(review): the in6_dev_put release is
 * among the elided lines. */
997 static int ipv6_get_mtu(struct net_device *dev)
999 int mtu = IPV6_MIN_MTU;
1000 struct inet6_dev *idev;
1002 idev = in6_dev_get(dev);
1004 mtu = idev->cnf.mtu6;
/* Read the device's configured hop limit, falling back to the global
 * ipv6_devconf default when the device has no inet6_dev. */
1010 int ipv6_get_hoplimit(struct net_device *dev)
1012 int hoplimit = ipv6_devconf.hop_limit;
1013 struct inet6_dev *idev;
1015 idev = in6_dev_get(dev);
1017 hoplimit = idev->cnf.hop_limit;
/* Build and insert a route from a fib6_config (netlink/ioctl path):
 * validates prefix lengths, resolves the device and gateway, promotes
 * loopback-bound routes to reject routes, resolves the next-hop
 * neighbour, applies metrics, and finally inserts via __ip6_ins_rt.
 * NOTE(review): error labels and several cleanup paths are elided from
 * this excerpt. */
1027 int ip6_route_add(struct fib6_config *cfg)
1030 struct rt6_info *rt = NULL;
1031 struct net_device *dev = NULL;
1032 struct inet6_dev *idev = NULL;
1033 struct fib6_table *table;
1036 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1038 #ifndef CONFIG_IPV6_SUBTREES
1039 if (cfg->fc_src_len)
1042 if (cfg->fc_ifindex) {
1044 dev = dev_get_by_index(cfg->fc_ifindex);
1047 idev = in6_dev_get(dev);
1052 if (cfg->fc_metric == 0)
1053 cfg->fc_metric = IP6_RT_PRIO_USER;
1055 table = fib6_new_table(cfg->fc_table);
1056 if (table == NULL) {
1061 rt = ip6_dst_alloc();
1068 rt->u.dst.obsolete = -1;
1069 rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1071 if (cfg->fc_protocol == RTPROT_UNSPEC)
1072 cfg->fc_protocol = RTPROT_BOOT;
1073 rt->rt6i_protocol = cfg->fc_protocol;
1075 addr_type = ipv6_addr_type(&cfg->fc_dst);
/* Input handler depends on destination type: multicast input vs forward. */
1077 if (addr_type & IPV6_ADDR_MULTICAST)
1078 rt->u.dst.input = ip6_mc_input;
1080 rt->u.dst.input = ip6_forward;
1082 rt->u.dst.output = ip6_output;
1084 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1085 rt->rt6i_dst.plen = cfg->fc_dst_len;
1086 if (rt->rt6i_dst.plen == 128)
1087 rt->u.dst.flags = DST_HOST;
1089 #ifdef CONFIG_IPV6_SUBTREES
1090 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1091 rt->rt6i_src.plen = cfg->fc_src_len;
1094 rt->rt6i_metric = cfg->fc_metric;
1096 /* We cannot add true routes via loopback here,
1097 they would result in kernel looping; promote them to reject routes
1099 if ((cfg->fc_flags & RTF_REJECT) ||
1100 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1101 /* hold loopback dev/idev if we haven't done so. */
1102 if (dev != &loopback_dev) {
1107 dev = &loopback_dev;
1109 idev = in6_dev_get(dev);
1115 rt->u.dst.output = ip6_pkt_discard_out;
1116 rt->u.dst.input = ip6_pkt_discard;
1117 rt->u.dst.error = -ENETUNREACH;
1118 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1122 if (cfg->fc_flags & RTF_GATEWAY) {
1123 struct in6_addr *gw_addr;
1126 gw_addr = &cfg->fc_gateway;
1127 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1128 gwa_type = ipv6_addr_type(gw_addr);
1130 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1131 struct rt6_info *grt;
1133 /* IPv6 strictly inhibits using not link-local
1134 addresses as nexthop address.
1135 Otherwise, router will not able to send redirects.
1136 It is very good, but in some (rare!) circumstances
1137 (SIT, PtP, NBMA NOARP links) it is handy to allow
1138 some exceptions. --ANK
1141 if (!(gwa_type&IPV6_ADDR_UNICAST))
/* Non-link-local gateway: it must itself be reachable via a
 * non-gateway route; inherit that route's device/idev. */
1144 grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1146 err = -EHOSTUNREACH;
1150 if (dev != grt->rt6i_dev) {
1151 dst_release(&grt->u.dst);
1155 dev = grt->rt6i_dev;
1156 idev = grt->rt6i_idev;
1158 in6_dev_hold(grt->rt6i_idev);
1160 if (!(grt->rt6i_flags&RTF_GATEWAY))
1162 dst_release(&grt->u.dst);
1168 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1176 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1177 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1178 if (IS_ERR(rt->rt6i_nexthop)) {
1179 err = PTR_ERR(rt->rt6i_nexthop);
1180 rt->rt6i_nexthop = NULL;
1185 rt->rt6i_flags = cfg->fc_flags;
/* Copy user-supplied metrics (RTAX_*) from the netlink attribute blob. */
1192 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1193 int type = nla->nla_type;
1196 if (type > RTAX_MAX) {
1201 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1206 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1207 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1208 if (!rt->u.dst.metrics[RTAX_MTU-1])
1209 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1210 if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1211 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1212 rt->u.dst.dev = dev;
1213 rt->rt6i_idev = idev;
1214 rt->rt6i_table = table;
1215 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1223 dst_free((struct dst_entry *) rt);
/* Delete a route from its table under the write lock; refuses to delete
 * the static ip6_null_entry.  Drops the caller's reference. */
1227 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1230 struct fib6_table *table;
1232 if (rt == &ip6_null_entry)
1235 table = rt->rt6i_table;
1236 write_lock_bh(&table->tb6_lock);
1238 err = fib6_del(rt, info);
1239 dst_release(&rt->u.dst);
1241 write_unlock_bh(&table->tb6_lock);
/* Convenience wrapper: delete without netlink notification info. */
1246 int ip6_del_rt(struct rt6_info *rt)
1248 return __ip6_del_rt(rt, NULL);
/* Delete the route matching a fib6_config: locate the exact fib6 node,
 * then scan its leaf list filtering by ifindex, gateway and metric where
 * the config specifies them.  Takes a reference before dropping the read
 * lock so __ip6_del_rt can re-take it as a write lock safely. */
1251 static int ip6_route_del(struct fib6_config *cfg)
1253 struct fib6_table *table;
1254 struct fib6_node *fn;
1255 struct rt6_info *rt;
1258 table = fib6_get_table(cfg->fc_table);
1262 read_lock_bh(&table->tb6_lock);
1264 fn = fib6_locate(&table->tb6_root,
1265 &cfg->fc_dst, cfg->fc_dst_len,
1266 &cfg->fc_src, cfg->fc_src_len);
1269 for (rt = fn->leaf; rt; rt = rt->u.next) {
1270 if (cfg->fc_ifindex &&
1271 (rt->rt6i_dev == NULL ||
1272 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1274 if (cfg->fc_flags & RTF_GATEWAY &&
1275 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1277 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1279 dst_hold(&rt->u.dst);
1280 read_unlock_bh(&table->tb6_lock);
1282 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1285 read_unlock_bh(&table->tb6_lock);
1293 struct ip6rd_flowi {
1295 struct in6_addr gateway;
/* Redirect validation lookup (RFC 2461): find the route currently used
 * for the destination and accept the redirect only if it came from that
 * route's next-hop gateway on the same interface.  Falls back through
 * BACKTRACK and returns ip6_null_entry when no candidate matches. */
1298 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1302 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1303 struct rt6_info *rt;
1304 struct fib6_node *fn;
1307 * Get the "current" route for this destination and
1308 * check if the redirect has come from approriate router.
1310 * RFC 2461 specifies that redirects should only be
1311 * accepted if they come from the nexthop to the target.
1312 * Due to the way the routes are chosen, this notion
1313 * is a bit fuzzy and one might need to check all possible
1317 read_lock_bh(&table->tb6_lock);
1318 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1320 for (rt = fn->leaf; rt; rt = rt->u.next) {
1322 * Current route is on-link; redirect is always invalid.
1324 * Seems, previous statement is not true. It could
1325 * be node, which looks for us as on-link (f.e. proxy ndisc)
1326 * But then router serving it might decide, that we should
1327 * know truth 8)8) --ANK (980726).
1329 if (rt6_check_expired(rt))
1331 if (!(rt->rt6i_flags & RTF_GATEWAY))
1333 if (fl->oif != rt->rt6i_dev->ifindex)
1335 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1341 rt = &ip6_null_entry;
1342 BACKTRACK(&fl->fl6_src);
1344 dst_hold(&rt->u.dst);
1346 read_unlock_bh(&table->tb6_lock);
/* Build an ip6rd_flowi (flowi plus the redirecting gateway address) and
 * dispatch the redirect-validation lookup through the rule engine. */
1351 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1352 struct in6_addr *src,
1353 struct in6_addr *gateway,
1354 struct net_device *dev)
1356 int flags = RT6_LOOKUP_F_HAS_SADDR;
1357 struct ip6rd_flowi rdfl = {
1359 .oif = dev->ifindex,
1367 .gateway = *gateway,
1370 if (rt6_need_strict(dest))
1371 flags |= RT6_LOOKUP_F_IFACE;
1373 return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
/* Handle a received ICMPv6 Redirect: validate the sender is our current
 * next-hop, update the neighbour cache with the supplied link-layer
 * address, then install a /128 RTF_CACHE route via the new gateway and
 * notify netevent listeners.  Duplicate redirects are ignored. */
1376 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1377 struct in6_addr *saddr,
1378 struct neighbour *neigh, u8 *lladdr, int on_link)
1380 struct rt6_info *rt, *nrt = NULL;
1381 struct netevent_redirect netevent;
1383 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1385 if (rt == &ip6_null_entry) {
1386 if (net_ratelimit())
1387 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1388 "for redirect target\n");
1393 * We have finally decided to accept it.
1396 neigh_update(neigh, lladdr, NUD_STALE,
1397 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1398 NEIGH_UPDATE_F_OVERRIDE|
1399 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1400 NEIGH_UPDATE_F_ISROUTER))
1404 * Redirect received -> path was valid.
1405 * Look, redirects are sent only in response to data packets,
1406 * so that this nexthop apparently is reachable. --ANK
1408 dst_confirm(&rt->u.dst);
1410 /* Duplicate redirect: silently ignore. */
1411 if (neigh == rt->u.dst.neighbour)
1414 nrt = ip6_rt_copy(rt);
1418 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1420 nrt->rt6i_flags &= ~RTF_GATEWAY;
1422 ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1423 nrt->rt6i_dst.plen = 128;
1424 nrt->u.dst.flags |= DST_HOST;
1426 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1427 nrt->rt6i_nexthop = neigh_clone(neigh);
1428 /* Reset pmtu, it may be better */
1429 nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1430 nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1432 if (ip6_ins_rt(nrt))
1435 netevent.old = &rt->u.dst;
1436 netevent.new = &nrt->u.dst;
1437 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1439 if (rt->rt6i_flags&RTF_CACHE) {
1445 dst_release(&rt->u.dst);
1450 * Handle ICMP "packet too big" messages
1451 * i.e. Path MTU discovery
/* Handle an ICMPv6 Packet Too Big report: look up the route for daddr,
 * ignore non-shrinking reports, clamp to IPV6_MIN_MTU (RFC 2460 —
 * ALLFRAG is set for sub-minimum reports), then either update an
 * existing RTF_CACHE host route in place or clone a new one (COW for
 * connected routes, plain clone for gatewayed/NONEXTHOP) that expires
 * after ip6_rt_mtu_expires so PMTU increases are rediscovered
 * (RFC 1981). */
1454 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1455 struct net_device *dev, u32 pmtu)
1457 struct rt6_info *rt, *nrt;
1460 rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1464 if (pmtu >= dst_mtu(&rt->u.dst))
1467 if (pmtu < IPV6_MIN_MTU) {
1469 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1470 * MTU (1280) and a fragment header should always be included
1471 * after a node receiving Too Big message reporting PMTU is
1472 * less than the IPv6 Minimum Link MTU.
1474 pmtu = IPV6_MIN_MTU;
1478 /* New mtu received -> path was valid.
1479 They are sent only in response to data packets,
1480 so that this nexthop apparently is reachable. --ANK
1482 dst_confirm(&rt->u.dst);
1484 /* Host route. If it is static, it would be better
1485 not to override it, but add new one, so that
1486 when cache entry will expire old pmtu
1487 would return automatically.
1489 if (rt->rt6i_flags & RTF_CACHE) {
1490 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1492 rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1493 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1494 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1499 Two cases are possible:
1500 1. It is connected route. Action: COW
1501 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1503 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1504 nrt = rt6_alloc_cow(rt, daddr, saddr);
1506 nrt = rt6_alloc_clone(rt, daddr);
1509 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1511 nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1513 /* According to RFC 1981, detecting PMTU increase shouldn't be
1514 * happened within 5 mins, the recommended timer is 10 mins.
1515 * Here this route expiration time is set to ip6_rt_mtu_expires
1516 * which is 10 mins. After 10 mins the decreased pmtu is expired
1517 * and detecting PMTU increase will be automatically happened.
1519 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1520 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1525 dst_release(&rt->u.dst);
1529 * Misc support functions
/* Duplicate a route for cloning: copies handlers, metrics, device/idev
 * (taking references), gateway, destination (and source under subtrees),
 * and table; clears expiry (RTF_EXPIRES dropped) and resets metric to 0
 * so the copy sorts ahead as a cache entry. */
1532 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1534 struct rt6_info *rt = ip6_dst_alloc();
1537 rt->u.dst.input = ort->u.dst.input;
1538 rt->u.dst.output = ort->u.dst.output;
1540 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1541 rt->u.dst.error = ort->u.dst.error;
1542 rt->u.dst.dev = ort->u.dst.dev;
1544 dev_hold(rt->u.dst.dev);
1545 rt->rt6i_idev = ort->rt6i_idev;
1547 in6_dev_hold(rt->rt6i_idev);
1548 rt->u.dst.lastuse = jiffies;
1549 rt->rt6i_expires = 0;
1551 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1552 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1553 rt->rt6i_metric = 0;
1555 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1556 #ifdef CONFIG_IPV6_SUBTREES
1557 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1559 rt->rt6i_table = ort->rt6i_table;
1564 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find an existing RTF_ROUTEINFO route (RA route-information option) in
 * RT6_TABLE_INFO matching prefix, interface and gateway; returns it with
 * a reference held, or NULL. */
1565 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1566 struct in6_addr *gwaddr, int ifindex)
1568 struct fib6_node *fn;
1569 struct rt6_info *rt = NULL;
1570 struct fib6_table *table;
1572 table = fib6_get_table(RT6_TABLE_INFO);
1576 write_lock_bh(&table->tb6_lock);
1577 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1581 for (rt = fn->leaf; rt; rt = rt->u.next) {
1582 if (rt->rt6i_dev->ifindex != ifindex)
1584 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1586 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1588 dst_hold(&rt->u.dst);
1592 write_unlock_bh(&table->tb6_lock);
/* Add an RA route-information route: build a fib6_config for
 * RT6_TABLE_INFO (a /0 prefix becomes a default route), insert it, then
 * re-look it up to return a referenced entry. */
1596 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1597 struct in6_addr *gwaddr, int ifindex,
1600 struct fib6_config cfg = {
1601 .fc_table = RT6_TABLE_INFO,
1603 .fc_ifindex = ifindex,
1604 .fc_dst_len = prefixlen,
1605 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1606 RTF_UP | RTF_PREF(pref),
1609 ipv6_addr_copy(&cfg.fc_dst, prefix);
1610 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1612 /* We should treat it as a default route if prefix length is 0. */
1614 cfg.fc_flags |= RTF_DEFAULT;
1616 ip6_route_add(&cfg);
1618 return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
/*
 * rt6_get_dflt_router - find the RA-learned default route whose gateway
 * is @addr and whose device is @dev in RT6_TABLE_DFLT.  On success a
 * dst reference is taken for the caller; returns NULL otherwise.
 * NOTE(review): loop break and final "return rt;" are missing lines in
 * this excerpt.
 */
1622 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1624 struct rt6_info *rt;
1625 struct fib6_table *table;
1627 table = fib6_get_table(RT6_TABLE_DFLT);
1631 write_lock_bh(&table->tb6_lock);
/* Default routers hang off the root leaf chain; require both
 * RTF_ADDRCONF and RTF_DEFAULT so manually added defaults don't match. */
1632 for (rt = table->tb6_root.leaf; rt; rt=rt->u.next) {
1633 if (dev == rt->rt6i_dev &&
1634 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1635 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1639 dst_hold(&rt->u.dst);
1640 write_unlock_bh(&table->tb6_lock);
/*
 * rt6_add_dflt_router - install a default route learned from a Router
 * Advertisement (gateway @gwaddr on @dev) into RT6_TABLE_DFLT, then
 * return it via rt6_get_dflt_router() (which takes the caller's dst
 * reference).  RTF_EXPIRES is set so the route ages out with the RA
 * router lifetime.
 * NOTE(review): the pref parameter line is missing from this excerpt.
 */
1644 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1645 struct net_device *dev,
1648 struct fib6_config cfg = {
1649 .fc_table = RT6_TABLE_DFLT,
1651 .fc_ifindex = dev->ifindex,
1652 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1653 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1656 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1658 ip6_route_add(&cfg);
1660 return rt6_get_dflt_router(gwaddr, dev);
/*
 * rt6_purge_dflt_routers - delete every autoconfigured default route.
 * For each matching entry a dst reference is taken and the read lock is
 * dropped before deletion; presumably the loop then restarts from the
 * head (the ip6_del_rt() call and restart goto are missing lines in
 * this excerpt -- confirm against the complete file).
 */
1663 void rt6_purge_dflt_routers(void)
1665 struct rt6_info *rt;
1666 struct fib6_table *table;
1668 /* NOTE: Keep consistent with rt6_get_dflt_router */
1669 table = fib6_get_table(RT6_TABLE_DFLT);
1674 read_lock_bh(&table->tb6_lock);
1675 for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
1676 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
/* Hold the route so it survives the lock drop needed for deletion. */
1677 dst_hold(&rt->u.dst);
1678 read_unlock_bh(&table->tb6_lock);
1683 read_unlock_bh(&table->tb6_lock);
/*
 * rtmsg_to_fib6_config - translate the legacy ioctl in6_rtmsg structure
 * into a fib6_config for ip6_route_add()/ip6_route_del().  Always
 * targets RT6_TABLE_MAIN; rtmsg_info carries the expiry time.
 */
1686 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1687 struct fib6_config *cfg)
1689 memset(cfg, 0, sizeof(*cfg));
1691 cfg->fc_table = RT6_TABLE_MAIN;
1692 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1693 cfg->fc_metric = rtmsg->rtmsg_metric;
1694 cfg->fc_expires = rtmsg->rtmsg_info;
1695 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1696 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1697 cfg->fc_flags = rtmsg->rtmsg_flags;
1699 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1700 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1701 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
/*
 * ipv6_route_ioctl - handle the legacy SIOCADDRT/SIOCDELRT route ioctls.
 * Requires CAP_NET_ADMIN, copies an in6_rtmsg from user space, converts
 * it with rtmsg_to_fib6_config() and dispatches to add/del.
 * NOTE(review): the switch statement, rtnl locking, EFAULT handling and
 * default/return paths are missing lines in this excerpt.
 */
1704 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1706 struct fib6_config cfg;
1707 struct in6_rtmsg rtmsg;
1711 case SIOCADDRT: /* Add a route */
1712 case SIOCDELRT: /* Delete a route */
1713 if (!capable(CAP_NET_ADMIN))
1715 err = copy_from_user(&rtmsg, arg,
1716 sizeof(struct in6_rtmsg));
1720 rtmsg_to_fib6_config(&rtmsg, &cfg);
1725 err = ip6_route_add(&cfg);
1728 err = ip6_route_del(&cfg);
1742 * Drop the packet on the floor
/*
 * ip6_pkt_discard - dst input handler for black-hole/reject routes.
 * Bumps the appropriate SNMP counter (in-addr-errors for unroutable
 * any/reserved destinations, otherwise out-no-routes) and sends an
 * ICMPv6 destination-unreachable/no-route before dropping the packet.
 * NOTE(review): the kfree_skb()/return tail is a missing line in this
 * excerpt.
 */
1745 static int ip6_pkt_discard(struct sk_buff *skb)
1747 int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1748 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1749 IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1751 IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1752 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
/*
 * ip6_pkt_discard_out - dst output handler for black-hole routes.
 * Fix up skb->dev from the attached dst (output path has it unset),
 * then reuse the input-side discard logic.
 */
1757 static int ip6_pkt_discard_out(struct sk_buff *skb)
1759 skb->dev = skb->dst->dev;
1760 return ip6_pkt_discard(skb);
1764 * Allocate a dst for local (unicast / anycast) address.
/*
 * addrconf_dst_alloc - allocate a host route (plen 128) for a local
 * unicast or anycast address, bound to the loopback device, and insert
 * it into RT6_TABLE_LOCAL.  Returns ERR_PTR(-ENOMEM) on allocation or
 * neighbour-lookup failure.
 * NOTE(review): the anycast-flag parameter, the "if (anycast)" line and
 * the final return are missing lines in this excerpt.
 */
1767 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1768 const struct in6_addr *addr,
1771 struct rt6_info *rt = ip6_dst_alloc();
1774 return ERR_PTR(-ENOMEM);
/* Local routes pin loopback_dev; released when the route dies. */
1776 dev_hold(&loopback_dev);
1779 rt->u.dst.flags = DST_HOST;
1780 rt->u.dst.input = ip6_input;
1781 rt->u.dst.output = ip6_output;
1782 rt->rt6i_dev = &loopback_dev;
1783 rt->rt6i_idev = idev;
1784 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1785 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1786 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
/* obsolete = -1: never returned stale by ip6_dst_check(). */
1787 rt->u.dst.obsolete = -1;
1789 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1791 rt->rt6i_flags |= RTF_ANYCAST;
1793 rt->rt6i_flags |= RTF_LOCAL;
1794 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1795 if (rt->rt6i_nexthop == NULL) {
1796 dst_free((struct dst_entry *) rt);
1797 return ERR_PTR(-ENOMEM);
1800 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1801 rt->rt6i_dst.plen = 128;
1802 rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
/* Hand the caller an initial reference. */
1804 atomic_set(&rt->u.dst.__refcnt, 1);
/*
 * fib6_ifdown - fib6_clean_all() callback: select routes for deletion
 * when their device matches @arg (or all routes when @arg is NULL),
 * always sparing the shared ip6_null_entry.  Presumably returns -1 to
 * delete and 0 to keep; the return lines are missing from this excerpt.
 */
1809 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1811 if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1812 rt != &ip6_null_entry) {
1813 RT6_TRACE("deleted by ifdown %p\n", rt);
/*
 * rt6_ifdown - purge all routes that go through @dev (called when the
 * device goes down or is unregistered) by walking every fib6 table
 * with the fib6_ifdown() selector.
 */
1819 void rt6_ifdown(struct net_device *dev)
1821 fib6_clean_all(fib6_ifdown, 0, dev);
/* Argument bundle passed through fib6_clean_all() to
 * rt6_mtu_change_route().  NOTE(review): the mtu member and closing
 * brace are missing lines in this excerpt. */
1824 struct rt6_mtu_change_arg
1826 struct net_device *dev;
/*
 * rt6_mtu_change_route - fib6_clean_all() callback that propagates a
 * device MTU change into the cached RTAX_MTU/RTAX_ADVMSS metrics of
 * routes using that device (unless the MTU metric is locked).
 * NOTE(review): excerpt is missing the idev NULL check and the braces
 * around the conditional update -- whether the ADVMSS assignment is
 * inside the if cannot be determined from this text.
 */
1830 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1832 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1833 struct inet6_dev *idev;
1835 /* In IPv6 pmtu discovery is not optional,
1836 so that RTAX_MTU lock cannot disable it.
1837 We still use this lock to block changes
1838 caused by addrconf/ndisc.
1841 idev = __in6_dev_get(arg->dev);
1845 /* For administrative MTU increase, there is no way to discover
1846 IPv6 PMTU increase, so PMTU increase should be updated here.
1847 Since RFC 1981 doesn't include administrative MTU increase
1848 update PMTU increase is a MUST. (i.e. jumbo frame)
1851 If new MTU is less than route PMTU, this new MTU will be the
1852 lowest MTU in the path, update the route PMTU to reflect PMTU
1853 decreases; if new MTU is greater than route PMTU, and the
1854 old MTU is the lowest MTU in the path, update the route PMTU
1855 to reflect the increase. In this case if the other nodes' MTU
1856 also have the lowest MTU, TOO BIG MESSAGE will be lead to
1859 if (rt->rt6i_dev == arg->dev &&
1860 !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1861 (dst_mtu(&rt->u.dst) > arg->mtu ||
1862 (dst_mtu(&rt->u.dst) < arg->mtu &&
1863 dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1864 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1865 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
/*
 * rt6_mtu_change - walk every fib6 table and update cached route
 * metrics after @dev's MTU changed to @mtu.
 * NOTE(review): the struct initializer fields are missing lines in
 * this excerpt.
 */
1869 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1871 struct rt6_mtu_change_arg arg = {
1876 fib6_clean_all(rt6_mtu_change_route, 0, &arg);
/* Netlink attribute validation policy for IPv6 RTM_* messages:
 * gateway must be a full in6_addr; OIF/IIF/PRIORITY are u32;
 * METRICS is a nested attribute block. */
1879 static struct nla_policy rtm_ipv6_policy[RTA_MAX+1] __read_mostly = {
1880 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
1881 [RTA_OIF] = { .type = NLA_U32 },
1882 [RTA_IIF] = { .type = NLA_U32 },
1883 [RTA_PRIORITY] = { .type = NLA_U32 },
1884 [RTA_METRICS] = { .type = NLA_NESTED },
/*
 * rtm_to_fib6_config - parse an RTM_NEWROUTE/RTM_DELROUTE netlink
 * request (header + attributes, validated by rtm_ipv6_policy) into a
 * fib6_config.  RTN_UNREACHABLE maps to RTF_REJECT; RTA_TABLE, when
 * present, overrides the header's rtm_table.
 * NOTE(review): excerpt is missing the rtm declaration, error-path
 * gotos and several "if (tb[...])" guard lines.
 */
1887 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1888 struct fib6_config *cfg)
1891 struct nlattr *tb[RTA_MAX+1];
1894 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1899 rtm = nlmsg_data(nlh);
1900 memset(cfg, 0, sizeof(*cfg));
1902 cfg->fc_table = rtm->rtm_table;
1903 cfg->fc_dst_len = rtm->rtm_dst_len;
1904 cfg->fc_src_len = rtm->rtm_src_len;
1905 cfg->fc_flags = RTF_UP;
1906 cfg->fc_protocol = rtm->rtm_protocol;
1908 if (rtm->rtm_type == RTN_UNREACHABLE)
1909 cfg->fc_flags |= RTF_REJECT;
/* Remember who asked, so notifications can echo pid/seq. */
1911 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1912 cfg->fc_nlinfo.nlh = nlh;
1914 if (tb[RTA_GATEWAY]) {
1915 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1916 cfg->fc_flags |= RTF_GATEWAY;
/* DST/SRC attributes carry only prefixlen-rounded bytes; reject
 * attributes shorter than the prefix requires. */
1920 int plen = (rtm->rtm_dst_len + 7) >> 3;
1922 if (nla_len(tb[RTA_DST]) < plen)
1925 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1929 int plen = (rtm->rtm_src_len + 7) >> 3;
1931 if (nla_len(tb[RTA_SRC]) < plen)
1934 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1938 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1940 if (tb[RTA_PRIORITY])
1941 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
1943 if (tb[RTA_METRICS]) {
/* fc_mx points into the skb -- valid only for this request. */
1944 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
1945 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
1949 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
/*
 * inet6_rtm_delroute - RTM_DELROUTE handler: parse the request into a
 * fib6_config and delete the matching route.
 * NOTE(review): the "if (err < 0)" guard before the delete is a
 * missing line in this excerpt.
 */
1956 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1958 struct fib6_config cfg;
1961 err = rtm_to_fib6_config(skb, nlh, &cfg);
1965 return ip6_route_del(&cfg);
/*
 * inet6_rtm_newroute - RTM_NEWROUTE handler: parse the request into a
 * fib6_config and add the route.  Mirrors inet6_rtm_delroute().
 * NOTE(review): the error-check guard is a missing line in this
 * excerpt.
 */
1968 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1970 struct fib6_config cfg;
1973 err = rtm_to_fib6_config(skb, nlh, &cfg);
1977 return ip6_route_add(&cfg);
/*
 * rt6_fill_node - serialise one rt6_info into a netlink RTM message on
 * @skb.  @dst/@src, when given, override the route's own keys (used by
 * getroute replies); @prefix limits dumps to RTF_PREFIX_RT routes.
 * Returns the nlmsg_end() result, or cancels the message when the skb
 * runs out of room (NLA_PUT jumps to nla_put_failure).
 * NOTE(review): this excerpt is missing several lines (the table/rtm
 * declarations, a few if/else lines, rtm_flags init, the
 * nla_put_failure label) -- the NLA_PUT control flow below relies on
 * them.
 */
1980 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1981 struct in6_addr *dst, struct in6_addr *src,
1982 int iif, int type, u32 pid, u32 seq,
1983 int prefix, unsigned int flags)
1986 struct nlmsghdr *nlh;
1987 struct rta_cacheinfo ci;
1990 if (prefix) { /* user wants prefix routes only */
1991 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1992 /* success since this is not a prefix route */
1997 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2001 rtm = nlmsg_data(nlh);
2002 rtm->rtm_family = AF_INET6;
2003 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2004 rtm->rtm_src_len = rt->rt6i_src.plen;
/* Advertise the owning table both in the header and as RTA_TABLE. */
2007 table = rt->rt6i_table->tb6_id;
2009 table = RT6_TABLE_UNSPEC;
2010 rtm->rtm_table = table;
2011 NLA_PUT_U32(skb, RTA_TABLE, table);
2012 if (rt->rt6i_flags&RTF_REJECT)
2013 rtm->rtm_type = RTN_UNREACHABLE;
2014 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2015 rtm->rtm_type = RTN_LOCAL;
2017 rtm->rtm_type = RTN_UNICAST;
2019 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2020 rtm->rtm_protocol = rt->rt6i_protocol;
/* Map origin flags onto the closest RTPROT_* value. */
2021 if (rt->rt6i_flags&RTF_DYNAMIC)
2022 rtm->rtm_protocol = RTPROT_REDIRECT;
2023 else if (rt->rt6i_flags & RTF_ADDRCONF)
2024 rtm->rtm_protocol = RTPROT_KERNEL;
2025 else if (rt->rt6i_flags&RTF_DEFAULT)
2026 rtm->rtm_protocol = RTPROT_RA;
2028 if (rt->rt6i_flags&RTF_CACHE)
2029 rtm->rtm_flags |= RTM_F_CLONED;
/* Caller-supplied dst/src mean "answer for this exact address",
 * hence the full /128 lengths. */
2032 NLA_PUT(skb, RTA_DST, 16, dst);
2033 rtm->rtm_dst_len = 128;
2034 } else if (rtm->rtm_dst_len)
2035 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2036 #ifdef CONFIG_IPV6_SUBTREES
2038 NLA_PUT(skb, RTA_SRC, 16, src);
2039 rtm->rtm_src_len = 128;
2040 } else if (rtm->rtm_src_len)
2041 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2044 NLA_PUT_U32(skb, RTA_IIF, iif);
2046 struct in6_addr saddr_buf;
2047 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
2048 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2051 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2052 goto nla_put_failure;
2054 if (rt->u.dst.neighbour)
2055 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2058 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2060 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
/* Cache statistics reported in userspace clock ticks. */
2061 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2062 if (rt->rt6i_expires)
2063 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
2066 ci.rta_used = rt->u.dst.__use;
2067 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2068 ci.rta_error = rt->u.dst.error;
2072 NLA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2074 return nlmsg_end(skb, nlh);
2077 return nlmsg_cancel(skb, nlh);
/*
 * rt6_dump_route - per-route callback for RTM_GETROUTE dumps: honours
 * the RTM_F_PREFIX filter when the request header is a full rtmsg,
 * then emits the route via rt6_fill_node() with NLM_F_MULTI.
 */
2080 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2082 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2085 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2086 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2087 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2091 return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2092 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2093 prefix, NLM_F_MULTI);
/*
 * inet6_rtm_getroute - RTM_GETROUTE handler: build a flow from the
 * request attributes, resolve it through ip6_route_output(), render
 * the result with rt6_fill_node() and unicast the reply to the sender.
 * NOTE(review): many guard lines are missing from this excerpt (errout
 * labels, iif device check, skb NULL check, fill-failure cleanup) --
 * do not infer error handling from this text alone.
 */
2096 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2098 struct nlattr *tb[RTA_MAX+1];
2099 struct rt6_info *rt;
2100 struct sk_buff *skb;
2105 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2110 memset(&fl, 0, sizeof(fl));
/* Source/destination attributes must be full 16-byte addresses. */
2113 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2116 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2120 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2123 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2127 iif = nla_get_u32(tb[RTA_IIF]);
2130 fl.oif = nla_get_u32(tb[RTA_OIF]);
2133 struct net_device *dev;
2134 dev = __dev_get_by_index(iif);
2141 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2147 /* Reserve room for dummy headers, this skb can pass
2148 through good chunk of routing engine.
2150 skb->mac.raw = skb->data;
2151 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
/* The reply skb adopts the dst reference from the lookup. */
2153 rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2154 skb->dst = &rt->u.dst;
2156 err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2157 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2158 nlh->nlmsg_seq, 0, 0);
2164 err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
/*
 * inet6_rt_notify - broadcast a route add/delete @event to the
 * RTNLGRP_IPV6_ROUTE multicast group, echoing the requester's pid/seq
 * when the change came from a netlink request (@info).
 * NOTE(review): the "if (info && info->nlh)" guard, skb NULL check and
 * errout/kfree paths are missing lines in this excerpt.
 */
2169 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2171 struct sk_buff *skb;
2172 u32 pid = 0, seq = 0;
2173 struct nlmsghdr *nlh = NULL;
/* 256 bytes of headroom for attributes beyond the rtmsg header. */
2174 int payload = sizeof(struct rtmsg) + 256;
2181 seq = nlh->nlmsg_seq;
2184 skb = nlmsg_new(nlmsg_total_size(payload), gfp_any());
2188 err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2194 err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2197 rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2204 #ifdef CONFIG_PROC_FS
2206 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
/*
 * rt6_info_route - fib6_clean_all() callback that formats one route as
 * a fixed-width (RT6_INFO_LEN) /proc/net/ipv6_route line into the
 * caller-supplied buffer: dst addr/plen, src addr/plen, next hop,
 * metric, refcnt, use count, flags and device name.
 * NOTE(review): skip/offset bookkeeping lines, arg->len increments
 * after the hex loops, and several else branches are missing from this
 * excerpt.
 */
2217 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2219 struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
/* Honour the read offset: skip whole lines already consumed. */
2222 if (arg->skip < arg->offset / RT6_INFO_LEN) {
2227 if (arg->len >= arg->length)
/* Destination prefix as 32 hex digits plus its prefix length. */
2230 for (i=0; i<16; i++) {
2231 sprintf(arg->buffer + arg->len, "%02x",
2232 rt->rt6i_dst.addr.s6_addr[i]);
2235 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2238 #ifdef CONFIG_IPV6_SUBTREES
2239 for (i=0; i<16; i++) {
2240 sprintf(arg->buffer + arg->len, "%02x",
2241 rt->rt6i_src.addr.s6_addr[i]);
2244 arg->len += sprintf(arg->buffer + arg->len, " %02x ",
/* Without subtrees the source column is a fixed all-zero field. */
2247 sprintf(arg->buffer + arg->len,
2248 "00000000000000000000000000000000 00 ");
2252 if (rt->rt6i_nexthop) {
2253 for (i=0; i<16; i++) {
2254 sprintf(arg->buffer + arg->len, "%02x",
2255 rt->rt6i_nexthop->primary_key[i]);
2259 sprintf(arg->buffer + arg->len,
2260 "00000000000000000000000000000000");
2263 arg->len += sprintf(arg->buffer + arg->len,
2264 " %08x %08x %08x %08x %8s\n",
2265 rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2266 rt->u.dst.__use, rt->rt6i_flags,
2267 rt->rt6i_dev ? rt->rt6i_dev->name : "");
/*
 * rt6_proc_info - legacy proc_net read handler for /proc/net/ipv6_route.
 * Walks all tables via rt6_info_route(), then adjusts *start/len for
 * the sub-line portion of the requested offset.
 * NOTE(review): the arg initializer fields and final return are
 * missing lines in this excerpt.
 */
2271 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2273 struct rt6_proc_arg arg = {
2279 fib6_clean_all(rt6_info_route, 0, &arg);
2283 *start += offset % RT6_INFO_LEN;
2285 arg.len -= offset % RT6_INFO_LEN;
2287 if (arg.len > length)
/*
 * rt6_stats_seq_show - single_open() show routine for
 * /proc/net/rt6_stats: dumps the global rt6_stats counters plus the
 * live dst-entry count as seven hex fields.
 */
2295 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2297 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2298 rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2299 rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2300 rt6_stats.fib_rt_cache,
2301 atomic_read(&ip6_dst_ops.entries),
2302 rt6_stats.fib_discarded_routes);
/* Open /proc/net/rt6_stats as a single-shot seq_file. */
2307 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2309 return single_open(file, rt6_stats_seq_show, NULL);
/* File operations for /proc/net/rt6_stats (single_open-based seq_file).
 * NOTE(review): the .read = seq_read line appears to be missing from
 * this excerpt. */
2312 static struct file_operations rt6_stats_seq_fops = {
2313 .owner = THIS_MODULE,
2314 .open = rt6_stats_seq_open,
2316 .llseek = seq_lseek,
2317 .release = single_release,
2319 #endif /* CONFIG_PROC_FS */
2321 #ifdef CONFIG_SYSCTL
/* Scratch value written through the "flush" sysctl below. */
2323 static int flush_delay;
/*
 * ipv6_sysctl_rtcache_flush - handler for net.ipv6.route.flush: on
 * write, parse the delay and force a fib6 garbage-collection pass
 * (a non-positive delay flushes everything via ~0UL).
 * NOTE(review): the write-vs-read branch and return lines are missing
 * from this excerpt.
 */
2326 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2327 void __user *buffer, size_t *lenp, loff_t *ppos)
2330 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2331 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
/*
 * net.ipv6.route.* sysctl table.  Jiffies-valued knobs use the
 * proc_dointvec_jiffies handler with the matching sysctl strategy;
 * gc_min_interval is additionally exposed in milliseconds.
 * NOTE(review): the .mode = 0644/0200 lines and entry-delimiting
 * braces are missing from this excerpt.
 */
2337 ctl_table ipv6_route_table[] = {
2339 .ctl_name = NET_IPV6_ROUTE_FLUSH,
2340 .procname = "flush",
2341 .data = &flush_delay,
2342 .maxlen = sizeof(int),
2344 .proc_handler = &ipv6_sysctl_rtcache_flush
2347 .ctl_name = NET_IPV6_ROUTE_GC_THRESH,
2348 .procname = "gc_thresh",
2349 .data = &ip6_dst_ops.gc_thresh,
2350 .maxlen = sizeof(int),
2352 .proc_handler = &proc_dointvec,
2355 .ctl_name = NET_IPV6_ROUTE_MAX_SIZE,
2356 .procname = "max_size",
2357 .data = &ip6_rt_max_size,
2358 .maxlen = sizeof(int),
2360 .proc_handler = &proc_dointvec,
2363 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2364 .procname = "gc_min_interval",
2365 .data = &ip6_rt_gc_min_interval,
2366 .maxlen = sizeof(int),
2368 .proc_handler = &proc_dointvec_jiffies,
2369 .strategy = &sysctl_jiffies,
2372 .ctl_name = NET_IPV6_ROUTE_GC_TIMEOUT,
2373 .procname = "gc_timeout",
2374 .data = &ip6_rt_gc_timeout,
2375 .maxlen = sizeof(int),
2377 .proc_handler = &proc_dointvec_jiffies,
2378 .strategy = &sysctl_jiffies,
2381 .ctl_name = NET_IPV6_ROUTE_GC_INTERVAL,
2382 .procname = "gc_interval",
2383 .data = &ip6_rt_gc_interval,
2384 .maxlen = sizeof(int),
2386 .proc_handler = &proc_dointvec_jiffies,
2387 .strategy = &sysctl_jiffies,
2390 .ctl_name = NET_IPV6_ROUTE_GC_ELASTICITY,
2391 .procname = "gc_elasticity",
2392 .data = &ip6_rt_gc_elasticity,
2393 .maxlen = sizeof(int),
2395 .proc_handler = &proc_dointvec_jiffies,
2396 .strategy = &sysctl_jiffies,
2399 .ctl_name = NET_IPV6_ROUTE_MTU_EXPIRES,
2400 .procname = "mtu_expires",
2401 .data = &ip6_rt_mtu_expires,
2402 .maxlen = sizeof(int),
2404 .proc_handler = &proc_dointvec_jiffies,
2405 .strategy = &sysctl_jiffies,
2408 .ctl_name = NET_IPV6_ROUTE_MIN_ADVMSS,
2409 .procname = "min_adv_mss",
2410 .data = &ip6_rt_min_advmss,
2411 .maxlen = sizeof(int),
2413 .proc_handler = &proc_dointvec_jiffies,
2414 .strategy = &sysctl_jiffies,
2417 .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2418 .procname = "gc_min_interval_ms",
/* Same variable as gc_min_interval, exposed in milliseconds. */
2419 .data = &ip6_rt_gc_min_interval,
2420 .maxlen = sizeof(int),
2422 .proc_handler = &proc_dointvec_ms_jiffies,
2423 .strategy = &sysctl_ms_jiffies,
/*
 * ip6_route_init - boot-time initialisation of the IPv6 routing layer:
 * create the dst slab cache (SLAB_PANIC: boot fails on OOM), register
 * the /proc entries, and presumably initialise fib6 and (with multiple
 * tables) the fib6 policy rules -- those calls are missing lines in
 * this excerpt.
 */
2430 void __init ip6_route_init(void)
2432 struct proc_dir_entry *p;
2434 ip6_dst_ops.kmem_cachep =
2435 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2436 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
2438 #ifdef CONFIG_PROC_FS
2439 p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2441 p->owner = THIS_MODULE;
2443 proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2448 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/*
 * ip6_route_cleanup - teardown mirror of ip6_route_init(): remove the
 * fib rules (multiple-table builds) and /proc entries, then destroy
 * the dst slab cache.  Additional cleanup calls (xfrm/fib6) appear to
 * be missing lines in this excerpt.
 */
2453 void ip6_route_cleanup(void)
2455 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2456 fib6_rules_cleanup();
2458 #ifdef CONFIG_PROC_FS
2459 proc_net_remove("ipv6_route");
2460 proc_net_remove("rt6_stats");
2467 kmem_cache_destroy(ip6_dst_ops.kmem_cachep);