]> bbs.cooldavid.org Git - net-next-2.6.git/blob - net/ipv6/route.c
[IPV6]: Convert /proc/net/ipv6_route to seq_file interface
[net-next-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  *      Ville Nuorvala
26  *              Fixed routing subtrees.
27  */
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <net/net_namespace.h>
44 #include <net/snmp.h>
45 #include <net/ipv6.h>
46 #include <net/ip6_fib.h>
47 #include <net/ip6_route.h>
48 #include <net/ndisc.h>
49 #include <net/addrconf.h>
50 #include <net/tcp.h>
51 #include <linux/rtnetlink.h>
52 #include <net/dst.h>
53 #include <net/xfrm.h>
54 #include <net/netevent.h>
55 #include <net/netlink.h>
56
57 #include <asm/uaccess.h>
58
59 #ifdef CONFIG_SYSCTL
60 #include <linux/sysctl.h>
61 #endif
62
/*
 * Debug tracing switches.  While RT6_DEBUG is below 3, RDBG() and
 * RT6_TRACE() expand to nothing; at 3 or above both forward to printk()
 * (RT6_TRACE prefixes its message with KERN_DEBUG).
 */
63 /* Set to 3 to get tracing. */
64 #define RT6_DEBUG 2
65
66 #if RT6_DEBUG >= 3
67 #define RDBG(x) printk x
68 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
69 #else
70 #define RDBG(x)
71 #define RT6_TRACE(x...) do { ; } while (0)
72 #endif
73
/*
 * Compile-time switch read by ip6_pol_route(): when non-zero, routes that
 * already have a nexthop are cloned with rt6_alloc_clone(); with the value
 * 0 used here they are returned as they are.
 */
74 #define CLONE_OFFLINK_ROUTE 0
75
/*
 * Routing tunables.  Values written as multiples of HZ are jiffies counts.
 * ip6_rt_min_advmss is the lower bound enforced by ipv6_advmss(), and
 * ip6_rt_gc_interval is the only one with external linkage.
 * NOTE(review): the consumers of the remaining variables are outside the
 * part of the file visible here; presumably the GC code and sysctl table -
 * confirm.  The 20 and 40 in ip6_rt_min_advmss look like the TCP and IPv6
 * header sizes - confirm.
 */
76 static int ip6_rt_max_size = 4096;
77 static int ip6_rt_gc_min_interval = HZ / 2;
78 static int ip6_rt_gc_timeout = 60*HZ;
79 int ip6_rt_gc_interval = 30*HZ;
80 static int ip6_rt_gc_elasticity = 9;
81 static int ip6_rt_mtu_expires = 10*60*HZ;
82 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
83
/*
 * Forward declarations.  These are needed because the static dst_ops
 * tables and the static ip6_null_entry below take the address of several
 * of these functions before their definitions appear.
 */
84 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
85 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
86 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
87 static void             ip6_dst_destroy(struct dst_entry *);
88 static void             ip6_dst_ifdown(struct dst_entry *,
89                                        struct net_device *dev, int how);
90 static int               ip6_dst_gc(void);
91
92 static int              ip6_pkt_discard(struct sk_buff *skb);
93 static int              ip6_pkt_discard_out(struct sk_buff *skb);
94 static void             ip6_link_failure(struct sk_buff *skb);
95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
96
/* Route Information option helpers, used by rt6_route_rcv(). */
97 #ifdef CONFIG_IPV6_ROUTE_INFO
98 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
99                                            struct in6_addr *gwaddr, int ifindex,
100                                            unsigned pref);
101 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
102                                            struct in6_addr *gwaddr, int ifindex);
103 #endif
104
/*
 * dst_ops for ordinary IPv6 route entries.  ip6_dst_alloc() allocates
 * rt6_info objects against this table, and the static ip6_null_entry
 * (plus the prohibit/blackhole entries under CONFIG_IPV6_MULTIPLE_TABLES)
 * reference it through their .ops member.
 */
105 static struct dst_ops ip6_dst_ops = {
106         .family                 =       AF_INET6,
107         .protocol               =       __constant_htons(ETH_P_IPV6),
108         .gc                     =       ip6_dst_gc,
109         .gc_thresh              =       1024,
110         .check                  =       ip6_dst_check,
111         .destroy                =       ip6_dst_destroy,
112         .ifdown                 =       ip6_dst_ifdown,
113         .negative_advice        =       ip6_negative_advice,
114         .link_failure           =       ip6_link_failure,
115         .update_pmtu            =       ip6_rt_update_pmtu,
116         .entry_size             =       sizeof(struct rt6_info),
117 };
118
119 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
120 {
121 }
122
/*
 * dst_ops for the blackhole copies built by ip6_dst_blackhole().  It shares
 * the destroy/check callbacks with ip6_dst_ops but swallows PMTU updates
 * (ip6_rt_blackhole_update_pmtu() is empty) and sets no gc, ifdown,
 * negative_advice or link_failure callbacks.
 */
123 static struct dst_ops ip6_dst_blackhole_ops = {
124         .family                 =       AF_INET6,
125         .protocol               =       __constant_htons(ETH_P_IPV6),
126         .destroy                =       ip6_dst_destroy,
127         .check                  =       ip6_dst_check,
128         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
129         .entry_size             =       sizeof(struct rt6_info),
130 };
131
/*
 * Static sentinel route handed out when a lookup finds nothing usable
 * (see rt6_device_match() and rt6_select()).  It reports -ENETUNREACH,
 * uses the packet-discard input/output handlers, is flagged
 * RTF_REJECT | RTF_NONEXTHOP and carries the largest possible metric.
 * Its dst.path points back at itself.
 */
132 struct rt6_info ip6_null_entry = {
133         .u = {
134                 .dst = {
135                         .__refcnt       = ATOMIC_INIT(1),
136                         .__use          = 1,
137                         .obsolete       = -1,
138                         .error          = -ENETUNREACH,
139                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
140                         .input          = ip6_pkt_discard,
141                         .output         = ip6_pkt_discard_out,
142                         .ops            = &ip6_dst_ops,
143                         .path           = (struct dst_entry*)&ip6_null_entry,
144                 }
145         },
146         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
147         .rt6i_metric    = ~(u32) 0,
148         .rt6i_ref       = ATOMIC_INIT(1),
149 };
150
151 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
152
/* Packet handlers referenced by the two static entries below. */
153 static int ip6_pkt_prohibit(struct sk_buff *skb);
154 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
155 static int ip6_pkt_blk_hole(struct sk_buff *skb);
156
/*
 * Static "prohibit" route: same layout as ip6_null_entry but reporting
 * -EACCES and using the ip6_pkt_prohibit{,_out} handlers.
 * NOTE(review): its users are not in the visible part of this file;
 * presumably the policy-routing rules code - confirm.
 */
157 struct rt6_info ip6_prohibit_entry = {
158         .u = {
159                 .dst = {
160                         .__refcnt       = ATOMIC_INIT(1),
161                         .__use          = 1,
162                         .obsolete       = -1,
163                         .error          = -EACCES,
164                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
165                         .input          = ip6_pkt_prohibit,
166                         .output         = ip6_pkt_prohibit_out,
167                         .ops            = &ip6_dst_ops,
168                         .path           = (struct dst_entry*)&ip6_prohibit_entry,
169                 }
170         },
171         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
172         .rt6i_metric    = ~(u32) 0,
173         .rt6i_ref       = ATOMIC_INIT(1),
174 };
175
/*
 * Static "blackhole" route: reports -EINVAL and routes both input and
 * output through the single ip6_pkt_blk_hole() handler.
 * NOTE(review): its users are not in the visible part of this file;
 * presumably the policy-routing rules code - confirm.
 */
176 struct rt6_info ip6_blk_hole_entry = {
177         .u = {
178                 .dst = {
179                         .__refcnt       = ATOMIC_INIT(1),
180                         .__use          = 1,
181                         .obsolete       = -1,
182                         .error          = -EINVAL,
183                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
184                         .input          = ip6_pkt_blk_hole,
185                         .output         = ip6_pkt_blk_hole,
186                         .ops            = &ip6_dst_ops,
187                         .path           = (struct dst_entry*)&ip6_blk_hole_entry,
188                 }
189         },
190         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
191         .rt6i_metric    = ~(u32) 0,
192         .rt6i_ref       = ATOMIC_INIT(1),
193 };
194
195 #endif
196
197 /* allocate dst with ip6_dst_ops */
198 static __inline__ struct rt6_info *ip6_dst_alloc(void)
199 {
200         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
201 }
202
203 static void ip6_dst_destroy(struct dst_entry *dst)
204 {
205         struct rt6_info *rt = (struct rt6_info *)dst;
206         struct inet6_dev *idev = rt->rt6i_idev;
207
208         if (idev != NULL) {
209                 rt->rt6i_idev = NULL;
210                 in6_dev_put(idev);
211         }
212 }
213
214 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
215                            int how)
216 {
217         struct rt6_info *rt = (struct rt6_info *)dst;
218         struct inet6_dev *idev = rt->rt6i_idev;
219
220         if (dev != init_net.loopback_dev && idev != NULL && idev->dev == dev) {
221                 struct inet6_dev *loopback_idev = in6_dev_get(init_net.loopback_dev);
222                 if (loopback_idev != NULL) {
223                         rt->rt6i_idev = loopback_idev;
224                         in6_dev_put(idev);
225                 }
226         }
227 }
228
229 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
230 {
231         return (rt->rt6i_flags & RTF_EXPIRES &&
232                 time_after(jiffies, rt->rt6i_expires));
233 }
234
235 static inline int rt6_need_strict(struct in6_addr *daddr)
236 {
237         return (ipv6_addr_type(daddr) &
238                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
239 }
240
241 /*
242  *      Route lookup. Any table->tb6_lock is implied.
243  */
244
245 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
246                                                     int oif,
247                                                     int strict)
248 {
249         struct rt6_info *local = NULL;
250         struct rt6_info *sprt;
251
252         if (oif) {
253                 for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
254                         struct net_device *dev = sprt->rt6i_dev;
255                         if (dev->ifindex == oif)
256                                 return sprt;
257                         if (dev->flags & IFF_LOOPBACK) {
258                                 if (sprt->rt6i_idev == NULL ||
259                                     sprt->rt6i_idev->dev->ifindex != oif) {
260                                         if (strict && oif)
261                                                 continue;
262                                         if (local && (!oif ||
263                                                       local->rt6i_idev->dev->ifindex == oif))
264                                                 continue;
265                                 }
266                                 local = sprt;
267                         }
268                 }
269
270                 if (local)
271                         return local;
272
273                 if (strict)
274                         return &ip6_null_entry;
275         }
276         return rt;
277 }
278
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * If the nexthop neighbour of @rt is not in a NUD_VALID state, send a
 * Neighbour Solicitation for it to the solicited-node multicast address.
 * Probes are rate-limited per neighbour: one is sent only when more than
 * cnf.rtr_probe_interval jiffies have passed since neigh->updated, and
 * neigh->updated is refreshed each time a probe goes out.
 *
 * @rt may be NULL, in which case nothing happens.
 *
 * neigh->updated is written here, so neigh->lock is taken for writing;
 * holding it only for reading would let two CPUs pass the rate-limit test
 * and update the timestamp concurrently.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
        struct in6_addr mcaddr;
        struct in6_addr *target;

        /* cheap unlocked test first; re-checked under the lock below */
        if (!neigh || (neigh->nud_state & NUD_VALID))
                return;

        write_lock_bh(&neigh->lock);
        if ((neigh->nud_state & NUD_VALID) ||
            !time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                write_unlock_bh(&neigh->lock);
                return;
        }
        neigh->updated = jiffies;
        write_unlock_bh(&neigh->lock);

        /* the solicitation itself is sent without holding neigh->lock */
        target = (struct in6_addr *)&neigh->primary_key;
        addrconf_addr_solict_mult(target, &mcaddr);
        ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no reachability probes are sent. */
static inline void rt6_probe(struct rt6_info *rt)
{
        return;
}
#endif
314
315 /*
316  * Default Router Selection (RFC 2461 6.3.6)
317  */
318 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
319 {
320         struct net_device *dev = rt->rt6i_dev;
321         if (!oif || dev->ifindex == oif)
322                 return 2;
323         if ((dev->flags & IFF_LOOPBACK) &&
324             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
325                 return 1;
326         return 0;
327 }
328
329 static inline int rt6_check_neigh(struct rt6_info *rt)
330 {
331         struct neighbour *neigh = rt->rt6i_nexthop;
332         int m = 0;
333         if (rt->rt6i_flags & RTF_NONEXTHOP ||
334             !(rt->rt6i_flags & RTF_GATEWAY))
335                 m = 1;
336         else if (neigh) {
337                 read_lock_bh(&neigh->lock);
338                 if (neigh->nud_state & NUD_VALID)
339                         m = 2;
340                 else if (!(neigh->nud_state & NUD_FAILED))
341                         m = 1;
342                 read_unlock_bh(&neigh->lock);
343         }
344         return m;
345 }
346
347 static int rt6_score_route(struct rt6_info *rt, int oif,
348                            int strict)
349 {
350         int m, n;
351
352         m = rt6_check_dev(rt, oif);
353         if (!m && (strict & RT6_LOOKUP_F_IFACE))
354                 return -1;
355 #ifdef CONFIG_IPV6_ROUTER_PREF
356         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
357 #endif
358         n = rt6_check_neigh(rt);
359         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
360                 return -1;
361         return m;
362 }
363
364 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
365                                    int *mpri, struct rt6_info *match)
366 {
367         int m;
368
369         if (rt6_check_expired(rt))
370                 goto out;
371
372         m = rt6_score_route(rt, oif, strict);
373         if (m < 0)
374                 goto out;
375
376         if (m > *mpri) {
377                 if (strict & RT6_LOOKUP_F_REACHABLE)
378                         rt6_probe(match);
379                 *mpri = m;
380                 match = rt;
381         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
382                 rt6_probe(rt);
383         }
384
385 out:
386         return match;
387 }
388
389 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
390                                      struct rt6_info *rr_head,
391                                      u32 metric, int oif, int strict)
392 {
393         struct rt6_info *rt, *match;
394         int mpri = -1;
395
396         match = NULL;
397         for (rt = rr_head; rt && rt->rt6i_metric == metric;
398              rt = rt->u.dst.rt6_next)
399                 match = find_match(rt, oif, strict, &mpri, match);
400         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
401              rt = rt->u.dst.rt6_next)
402                 match = find_match(rt, oif, strict, &mpri, match);
403
404         return match;
405 }
406
407 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
408 {
409         struct rt6_info *match, *rt0;
410
411         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
412                   __FUNCTION__, fn->leaf, oif);
413
414         rt0 = fn->rr_ptr;
415         if (!rt0)
416                 fn->rr_ptr = rt0 = fn->leaf;
417
418         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
419
420         if (!match &&
421             (strict & RT6_LOOKUP_F_REACHABLE)) {
422                 struct rt6_info *next = rt0->u.dst.rt6_next;
423
424                 /* no entries matched; do round-robin */
425                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
426                         next = fn->leaf;
427
428                 if (next != rt0)
429                         fn->rr_ptr = next;
430         }
431
432         RT6_TRACE("%s() => %p\n",
433                   __FUNCTION__, match);
434
435         return (match ? match : &ip6_null_entry);
436 }
437
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option received on @dev from router @gwaddr.
 *
 * @opt points at the option and @len is its length in bytes.  The option
 * is rejected with -EINVAL when it is shorter than struct route_info or
 * when its length field (in 8-octet units) does not fit its prefix length.
 *
 * A lifetime of 0 deletes an existing route; a non-zero lifetime adds the
 * route or refreshes the flags and preference of an existing one.  A
 * lifetime of 0xffffffff means "never expires"; other values are clamped
 * so that the conversion to jiffies cannot overflow.
 *
 * Returns 0 on success (including when nothing had to be done).
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  struct in6_addr *gwaddr)
{
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr masked_prefix;
        struct in6_addr *prefix;
        struct rt6_info *rt;
        unsigned int pref;
        u32 lifetime;

        if (len < sizeof(struct route_info))
                return -EINVAL;

        /* Sanity check for prefix_len and length */
        if (rinfo->length > 3 || rinfo->prefix_len > 128)
                return -EINVAL;
        if (rinfo->prefix_len > 64 && rinfo->length < 2)
                return -EINVAL;
        if (rinfo->prefix_len > 0 && rinfo->length < 1)
                return -EINVAL;

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                pref = ICMPV6_ROUTER_PREF_MEDIUM;

        lifetime = ntohl(rinfo->lifetime);
        /* 0xffffffff is "infinity"; clamp the rest to avoid overflow */
        if (lifetime != 0xffffffff && lifetime > 0x7fffffff/HZ)
                lifetime = 0x7fffffff/HZ - 1;

        if (rinfo->length == 3) {
                /* the full 128-bit prefix is present in the option */
                prefix = (struct in6_addr *)rinfo->prefix;
        } else {
                /* this function is safe */
                ipv6_addr_prefix(&masked_prefix,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &masked_prefix;
        }

        rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

        if (rt && !lifetime) {
                ip6_del_rt(rt);
                rt = NULL;
        }

        if (!rt && lifetime)
                rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
                                        pref);
        else if (rt)
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

        if (!rt)
                return 0;

        if (lifetime == 0xffffffff) {
                rt->rt6i_flags &= ~RTF_EXPIRES;
        } else {
                rt->rt6i_expires = jiffies + HZ * lifetime;
                rt->rt6i_flags |= RTF_EXPIRES;
        }
        dst_release(&rt->u.dst);

        return 0;
}
#endif
515
/*
 * BACKTRACK(saddr) - climb the FIB tree after a failed selection.
 *
 * Expands inside a lookup function and relies on that function's local
 * variables `rt' and `fn' and on its labels `restart' and `out'.
 *
 * It does nothing unless rt is &ip6_null_entry.  Otherwise it moves from
 * fn towards the root: when the parent has a source subtree that is not
 * the node we came from, the subtree is searched with fib6_lookup() using
 * @saddr; otherwise the parent itself becomes the current node.  The walk
 * jumps to `restart' as soon as the current node has RTN_RTINFO set, and
 * to `out' (with rt still the null entry) when a node flagged RTN_TL_ROOT
 * is reached.
 */
516 #define BACKTRACK(saddr) \
517 do { \
518         if (rt == &ip6_null_entry) { \
519                 struct fib6_node *pn; \
520                 while (1) { \
521                         if (fn->fn_flags & RTN_TL_ROOT) \
522                                 goto out; \
523                         pn = fn->parent; \
524                         if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
525                                 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
526                         else \
527                                 fn = pn; \
528                         if (fn->fn_flags & RTN_RTINFO) \
529                                 goto restart; \
530                 } \
531         } \
532 } while(0)
533
534 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
535                                              struct flowi *fl, int flags)
536 {
537         struct fib6_node *fn;
538         struct rt6_info *rt;
539
540         read_lock_bh(&table->tb6_lock);
541         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
542 restart:
543         rt = fn->leaf;
544         rt = rt6_device_match(rt, fl->oif, flags);
545         BACKTRACK(&fl->fl6_src);
546 out:
547         dst_hold(&rt->u.dst);
548         read_unlock_bh(&table->tb6_lock);
549
550         rt->u.dst.lastuse = jiffies;
551         rt->u.dst.__use++;
552
553         return rt;
554
555 }
556
557 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
558                             int oif, int strict)
559 {
560         struct flowi fl = {
561                 .oif = oif,
562                 .nl_u = {
563                         .ip6_u = {
564                                 .daddr = *daddr,
565                         },
566                 },
567         };
568         struct dst_entry *dst;
569         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
570
571         if (saddr) {
572                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
573                 flags |= RT6_LOOKUP_F_HAS_SADDR;
574         }
575
576         dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
577         if (dst->error == 0)
578                 return (struct rt6_info *) dst;
579
580         dst_release(dst);
581
582         return NULL;
583 }
584
585 EXPORT_SYMBOL(rt6_lookup);
586
587 /* ip6_ins_rt is called with FREE table->tb6_lock.
588    It takes new route entry, the addition fails by any reason the
589    route is freed. In any case, if caller does not hold it, it may
590    be destroyed.
591  */
592
593 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
594 {
595         int err;
596         struct fib6_table *table;
597
598         table = rt->rt6i_table;
599         write_lock_bh(&table->tb6_lock);
600         err = fib6_add(&table->tb6_root, rt, info);
601         write_unlock_bh(&table->tb6_lock);
602
603         return err;
604 }
605
606 int ip6_ins_rt(struct rt6_info *rt)
607 {
608         return __ip6_ins_rt(rt, NULL);
609 }
610
611 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
612                                       struct in6_addr *saddr)
613 {
614         struct rt6_info *rt;
615
616         /*
617          *      Clone the route.
618          */
619
620         rt = ip6_rt_copy(ort);
621
622         if (rt) {
623                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
624                         if (rt->rt6i_dst.plen != 128 &&
625                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
626                                 rt->rt6i_flags |= RTF_ANYCAST;
627                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
628                 }
629
630                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
631                 rt->rt6i_dst.plen = 128;
632                 rt->rt6i_flags |= RTF_CACHE;
633                 rt->u.dst.flags |= DST_HOST;
634
635 #ifdef CONFIG_IPV6_SUBTREES
636                 if (rt->rt6i_src.plen && saddr) {
637                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
638                         rt->rt6i_src.plen = 128;
639                 }
640 #endif
641
642                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
643
644         }
645
646         return rt;
647 }
648
649 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
650 {
651         struct rt6_info *rt = ip6_rt_copy(ort);
652         if (rt) {
653                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
654                 rt->rt6i_dst.plen = 128;
655                 rt->rt6i_flags |= RTF_CACHE;
656                 rt->u.dst.flags |= DST_HOST;
657                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
658         }
659         return rt;
660 }
661
/*
 * Common route resolution for the input and output paths.
 *
 * Looks up @fl in @table for interface @oif and returns a route with a
 * reference held and its use statistics updated.  The lookup is first
 * tried with RT6_LOOKUP_F_REACHABLE (unless ipv6_devconf.forwarding is
 * set); if that ends at the `out' label - i.e. with the null entry or an
 * RTF_CACHE route - the flag is cleared and the lookup is repeated once
 * without it.
 *
 * A selected route that is neither the null entry nor a cache entry, has
 * no nexthop and lacks RTF_NONEXTHOP is copied with rt6_alloc_cow() and
 * the copy is inserted into the table.  Because the table lock is dropped
 * around the copy, the insertion may fail; the whole lookup is then
 * retried, at most `attempts' times in total.
 */
662 static struct rt6_info *ip6_pol_route(struct fib6_table *table, int oif,
663                                             struct flowi *fl, int flags)
664 {
665         struct fib6_node *fn;
666         struct rt6_info *rt, *nrt;
667         int strict = 0;
668         int attempts = 3;
669         int err;
670         int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
671
672         strict |= flags & RT6_LOOKUP_F_IFACE;
673
674 relookup:
675         read_lock_bh(&table->tb6_lock);
676
677 restart_2:
678         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
679
680 restart:
681         rt = rt6_select(fn, oif, strict | reachable);
        /* may jump to restart (try a parent node) or to out (root reached) */
682         BACKTRACK(&fl->fl6_src);
683         if (rt == &ip6_null_entry ||
684             rt->rt6i_flags & RTF_CACHE)
685                 goto out;
686
        /* pin the route, then drop the table lock before allocating a copy */
687         dst_hold(&rt->u.dst);
688         read_unlock_bh(&table->tb6_lock);
689
690         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
691                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
692         else {
693 #if CLONE_OFFLINK_ROUTE
694                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
695 #else
                /* cloning disabled: hand back the held route itself */
696                 goto out2;
697 #endif
698         }
699
        /* switch from the original route to the copy (or the null entry) */
700         dst_release(&rt->u.dst);
701         rt = nrt ? : &ip6_null_entry;
702
703         dst_hold(&rt->u.dst);
704         if (nrt) {
705                 err = ip6_ins_rt(nrt);
706                 if (!err)
707                         goto out2;
708         }
709
        /* out of retries: return what we hold (copy or null entry) */
710         if (--attempts <= 0)
711                 goto out2;
712
713         /*
714          * Race condition! In the gap, when table->tb6_lock was
715          * released someone could insert this route.  Relookup.
716          */
717         dst_release(&rt->u.dst);
718         goto relookup;
719
720 out:
        /* still under the table read lock here; retry once without REACHABLE */
721         if (reachable) {
722                 reachable = 0;
723                 goto restart_2;
724         }
725         dst_hold(&rt->u.dst);
726         read_unlock_bh(&table->tb6_lock);
727 out2:
728         rt->u.dst.lastuse = jiffies;
729         rt->u.dst.__use++;
730
731         return rt;
732 }
733
734 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
735                                             struct flowi *fl, int flags)
736 {
737         return ip6_pol_route(table, fl->iif, fl, flags);
738 }
739
740 void ip6_route_input(struct sk_buff *skb)
741 {
742         struct ipv6hdr *iph = ipv6_hdr(skb);
743         int flags = RT6_LOOKUP_F_HAS_SADDR;
744         struct flowi fl = {
745                 .iif = skb->dev->ifindex,
746                 .nl_u = {
747                         .ip6_u = {
748                                 .daddr = iph->daddr,
749                                 .saddr = iph->saddr,
750                                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
751                         },
752                 },
753                 .mark = skb->mark,
754                 .proto = iph->nexthdr,
755         };
756
757         if (rt6_need_strict(&iph->daddr))
758                 flags |= RT6_LOOKUP_F_IFACE;
759
760         skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
761 }
762
763 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
764                                              struct flowi *fl, int flags)
765 {
766         return ip6_pol_route(table, fl->oif, fl, flags);
767 }
768
769 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
770 {
771         int flags = 0;
772
773         if (rt6_need_strict(&fl->fl6_dst))
774                 flags |= RT6_LOOKUP_F_IFACE;
775
776         if (!ipv6_addr_any(&fl->fl6_src))
777                 flags |= RT6_LOOKUP_F_HAS_SADDR;
778
779         return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
780 }
781
782 EXPORT_SYMBOL(ip6_route_output);
783
/*
 * input/output handler installed on blackhole dsts by ip6_dst_blackhole():
 * frees the packet and reports success.
 */
static int ip6_blackhole_output(struct sk_buff *skb)
{
        kfree_skb(skb);

        return 0;
}
789
790 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
791 {
792         struct rt6_info *ort = (struct rt6_info *) *dstp;
793         struct rt6_info *rt = (struct rt6_info *)
794                 dst_alloc(&ip6_dst_blackhole_ops);
795         struct dst_entry *new = NULL;
796
797         if (rt) {
798                 new = &rt->u.dst;
799
800                 atomic_set(&new->__refcnt, 1);
801                 new->__use = 1;
802                 new->input = ip6_blackhole_output;
803                 new->output = ip6_blackhole_output;
804
805                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
806                 new->dev = ort->u.dst.dev;
807                 if (new->dev)
808                         dev_hold(new->dev);
809                 rt->rt6i_idev = ort->rt6i_idev;
810                 if (rt->rt6i_idev)
811                         in6_dev_hold(rt->rt6i_idev);
812                 rt->rt6i_expires = 0;
813
814                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
815                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
816                 rt->rt6i_metric = 0;
817
818                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
819 #ifdef CONFIG_IPV6_SUBTREES
820                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
821 #endif
822
823                 dst_free(new);
824         }
825
826         dst_release(*dstp);
827         *dstp = new;
828         return (new ? 0 : -ENOMEM);
829 }
830 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
831
832 /*
833  *      Destination cache support functions
834  */
835
836 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
837 {
838         struct rt6_info *rt;
839
840         rt = (struct rt6_info *) dst;
841
842         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
843                 return dst;
844
845         return NULL;
846 }
847
848 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
849 {
850         struct rt6_info *rt = (struct rt6_info *) dst;
851
852         if (rt) {
853                 if (rt->rt6i_flags & RTF_CACHE)
854                         ip6_del_rt(rt);
855                 else
856                         dst_release(dst);
857         }
858         return NULL;
859 }
860
861 static void ip6_link_failure(struct sk_buff *skb)
862 {
863         struct rt6_info *rt;
864
865         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
866
867         rt = (struct rt6_info *) skb->dst;
868         if (rt) {
869                 if (rt->rt6i_flags&RTF_CACHE) {
870                         dst_set_expires(&rt->u.dst, 0);
871                         rt->rt6i_flags |= RTF_EXPIRES;
872                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
873                         rt->rt6i_node->fn_sernum = -1;
874         }
875 }
876
877 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
878 {
879         struct rt6_info *rt6 = (struct rt6_info*)dst;
880
881         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
882                 rt6->rt6i_flags |= RTF_MODIFIED;
883                 if (mtu < IPV6_MIN_MTU) {
884                         mtu = IPV6_MIN_MTU;
885                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
886                 }
887                 dst->metrics[RTAX_MTU-1] = mtu;
888                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
889         }
890 }
891
892 static int ipv6_get_mtu(struct net_device *dev);
893
894 static inline unsigned int ipv6_advmss(unsigned int mtu)
895 {
896         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
897
898         if (mtu < ip6_rt_min_advmss)
899                 mtu = ip6_rt_min_advmss;
900
901         /*
902          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
903          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
904          * IPV6_MAXPLEN is also valid and means: "any MSS,
905          * rely only on pmtu discovery"
906          */
907         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
908                 mtu = IPV6_MAXPLEN;
909         return mtu;
910 }
911
912 static struct dst_entry *ndisc_dst_gc_list;
913 static DEFINE_SPINLOCK(ndisc_lock);
914
915 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
916                                   struct neighbour *neigh,
917                                   struct in6_addr *addr,
918                                   int (*output)(struct sk_buff *))
919 {
920         struct rt6_info *rt;
921         struct inet6_dev *idev = in6_dev_get(dev);
922
923         if (unlikely(idev == NULL))
924                 return NULL;
925
926         rt = ip6_dst_alloc();
927         if (unlikely(rt == NULL)) {
928                 in6_dev_put(idev);
929                 goto out;
930         }
931
932         dev_hold(dev);
933         if (neigh)
934                 neigh_hold(neigh);
935         else
936                 neigh = ndisc_get_neigh(dev, addr);
937
938         rt->rt6i_dev      = dev;
939         rt->rt6i_idev     = idev;
940         rt->rt6i_nexthop  = neigh;
941         atomic_set(&rt->u.dst.__refcnt, 1);
942         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
943         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
944         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
945         rt->u.dst.output  = output;
946
947 #if 0   /* there's no chance to use these for ndisc */
948         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
949                                 ? DST_HOST
950                                 : 0;
951         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
952         rt->rt6i_dst.plen = 128;
953 #endif
954
955         spin_lock_bh(&ndisc_lock);
956         rt->u.dst.next = ndisc_dst_gc_list;
957         ndisc_dst_gc_list = &rt->u.dst;
958         spin_unlock_bh(&ndisc_lock);
959
960         fib6_force_start_gc();
961
962 out:
963         return &rt->u.dst;
964 }
965
966 int ndisc_dst_gc(int *more)
967 {
968         struct dst_entry *dst, *next, **pprev;
969         int freed;
970
971         next = NULL;
972         freed = 0;
973
974         spin_lock_bh(&ndisc_lock);
975         pprev = &ndisc_dst_gc_list;
976
977         while ((dst = *pprev) != NULL) {
978                 if (!atomic_read(&dst->__refcnt)) {
979                         *pprev = dst->next;
980                         dst_free(dst);
981                         freed++;
982                 } else {
983                         pprev = &dst->next;
984                         (*more)++;
985                 }
986         }
987
988         spin_unlock_bh(&ndisc_lock);
989
990         return freed;
991 }
992
993 static int ip6_dst_gc(void)
994 {
995         static unsigned expire = 30*HZ;
996         static unsigned long last_gc;
997         unsigned long now = jiffies;
998
999         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
1000             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
1001                 goto out;
1002
1003         expire++;
1004         fib6_run_gc(expire);
1005         last_gc = now;
1006         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
1007                 expire = ip6_rt_gc_timeout>>1;
1008
1009 out:
1010         expire -= expire>>ip6_rt_gc_elasticity;
1011         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
1012 }
1013
1014 /* Clean host part of a prefix. Not necessary in radix tree,
1015    but results in cleaner routing tables.
1016
1017    Remove it only when all the things will work!
1018  */
1019
1020 static int ipv6_get_mtu(struct net_device *dev)
1021 {
1022         int mtu = IPV6_MIN_MTU;
1023         struct inet6_dev *idev;
1024
1025         idev = in6_dev_get(dev);
1026         if (idev) {
1027                 mtu = idev->cnf.mtu6;
1028                 in6_dev_put(idev);
1029         }
1030         return mtu;
1031 }
1032
1033 int ipv6_get_hoplimit(struct net_device *dev)
1034 {
1035         int hoplimit = ipv6_devconf.hop_limit;
1036         struct inet6_dev *idev;
1037
1038         idev = in6_dev_get(dev);
1039         if (idev) {
1040                 hoplimit = idev->cnf.hop_limit;
1041                 in6_dev_put(idev);
1042         }
1043         return hoplimit;
1044 }
1045
1046 /*
1047  *
1048  */
1049
1050 int ip6_route_add(struct fib6_config *cfg)
1051 {
1052         int err;
1053         struct rt6_info *rt = NULL;
1054         struct net_device *dev = NULL;
1055         struct inet6_dev *idev = NULL;
1056         struct fib6_table *table;
1057         int addr_type;
1058
1059         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1060                 return -EINVAL;
1061 #ifndef CONFIG_IPV6_SUBTREES
1062         if (cfg->fc_src_len)
1063                 return -EINVAL;
1064 #endif
1065         if (cfg->fc_ifindex) {
1066                 err = -ENODEV;
1067                 dev = dev_get_by_index(&init_net, cfg->fc_ifindex);
1068                 if (!dev)
1069                         goto out;
1070                 idev = in6_dev_get(dev);
1071                 if (!idev)
1072                         goto out;
1073         }
1074
1075         if (cfg->fc_metric == 0)
1076                 cfg->fc_metric = IP6_RT_PRIO_USER;
1077
1078         table = fib6_new_table(cfg->fc_table);
1079         if (table == NULL) {
1080                 err = -ENOBUFS;
1081                 goto out;
1082         }
1083
1084         rt = ip6_dst_alloc();
1085
1086         if (rt == NULL) {
1087                 err = -ENOMEM;
1088                 goto out;
1089         }
1090
1091         rt->u.dst.obsolete = -1;
1092         rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1093
1094         if (cfg->fc_protocol == RTPROT_UNSPEC)
1095                 cfg->fc_protocol = RTPROT_BOOT;
1096         rt->rt6i_protocol = cfg->fc_protocol;
1097
1098         addr_type = ipv6_addr_type(&cfg->fc_dst);
1099
1100         if (addr_type & IPV6_ADDR_MULTICAST)
1101                 rt->u.dst.input = ip6_mc_input;
1102         else
1103                 rt->u.dst.input = ip6_forward;
1104
1105         rt->u.dst.output = ip6_output;
1106
1107         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1108         rt->rt6i_dst.plen = cfg->fc_dst_len;
1109         if (rt->rt6i_dst.plen == 128)
1110                rt->u.dst.flags = DST_HOST;
1111
1112 #ifdef CONFIG_IPV6_SUBTREES
1113         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1114         rt->rt6i_src.plen = cfg->fc_src_len;
1115 #endif
1116
1117         rt->rt6i_metric = cfg->fc_metric;
1118
1119         /* We cannot add true routes via loopback here,
1120            they would result in kernel looping; promote them to reject routes
1121          */
1122         if ((cfg->fc_flags & RTF_REJECT) ||
1123             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1124                 /* hold loopback dev/idev if we haven't done so. */
1125                 if (dev != init_net.loopback_dev) {
1126                         if (dev) {
1127                                 dev_put(dev);
1128                                 in6_dev_put(idev);
1129                         }
1130                         dev = init_net.loopback_dev;
1131                         dev_hold(dev);
1132                         idev = in6_dev_get(dev);
1133                         if (!idev) {
1134                                 err = -ENODEV;
1135                                 goto out;
1136                         }
1137                 }
1138                 rt->u.dst.output = ip6_pkt_discard_out;
1139                 rt->u.dst.input = ip6_pkt_discard;
1140                 rt->u.dst.error = -ENETUNREACH;
1141                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1142                 goto install_route;
1143         }
1144
1145         if (cfg->fc_flags & RTF_GATEWAY) {
1146                 struct in6_addr *gw_addr;
1147                 int gwa_type;
1148
1149                 gw_addr = &cfg->fc_gateway;
1150                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1151                 gwa_type = ipv6_addr_type(gw_addr);
1152
1153                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1154                         struct rt6_info *grt;
1155
1156                         /* IPv6 strictly inhibits using not link-local
1157                            addresses as nexthop address.
1158                            Otherwise, router will not able to send redirects.
1159                            It is very good, but in some (rare!) circumstances
1160                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1161                            some exceptions. --ANK
1162                          */
1163                         err = -EINVAL;
1164                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1165                                 goto out;
1166
1167                         grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1168
1169                         err = -EHOSTUNREACH;
1170                         if (grt == NULL)
1171                                 goto out;
1172                         if (dev) {
1173                                 if (dev != grt->rt6i_dev) {
1174                                         dst_release(&grt->u.dst);
1175                                         goto out;
1176                                 }
1177                         } else {
1178                                 dev = grt->rt6i_dev;
1179                                 idev = grt->rt6i_idev;
1180                                 dev_hold(dev);
1181                                 in6_dev_hold(grt->rt6i_idev);
1182                         }
1183                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1184                                 err = 0;
1185                         dst_release(&grt->u.dst);
1186
1187                         if (err)
1188                                 goto out;
1189                 }
1190                 err = -EINVAL;
1191                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1192                         goto out;
1193         }
1194
1195         err = -ENODEV;
1196         if (dev == NULL)
1197                 goto out;
1198
1199         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1200                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1201                 if (IS_ERR(rt->rt6i_nexthop)) {
1202                         err = PTR_ERR(rt->rt6i_nexthop);
1203                         rt->rt6i_nexthop = NULL;
1204                         goto out;
1205                 }
1206         }
1207
1208         rt->rt6i_flags = cfg->fc_flags;
1209
1210 install_route:
1211         if (cfg->fc_mx) {
1212                 struct nlattr *nla;
1213                 int remaining;
1214
1215                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1216                         int type = nla_type(nla);
1217
1218                         if (type) {
1219                                 if (type > RTAX_MAX) {
1220                                         err = -EINVAL;
1221                                         goto out;
1222                                 }
1223
1224                                 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1225                         }
1226                 }
1227         }
1228
1229         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1230                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1231         if (!rt->u.dst.metrics[RTAX_MTU-1])
1232                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1233         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1234                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1235         rt->u.dst.dev = dev;
1236         rt->rt6i_idev = idev;
1237         rt->rt6i_table = table;
1238         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1239
1240 out:
1241         if (dev)
1242                 dev_put(dev);
1243         if (idev)
1244                 in6_dev_put(idev);
1245         if (rt)
1246                 dst_free(&rt->u.dst);
1247         return err;
1248 }
1249
1250 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1251 {
1252         int err;
1253         struct fib6_table *table;
1254
1255         if (rt == &ip6_null_entry)
1256                 return -ENOENT;
1257
1258         table = rt->rt6i_table;
1259         write_lock_bh(&table->tb6_lock);
1260
1261         err = fib6_del(rt, info);
1262         dst_release(&rt->u.dst);
1263
1264         write_unlock_bh(&table->tb6_lock);
1265
1266         return err;
1267 }
1268
1269 int ip6_del_rt(struct rt6_info *rt)
1270 {
1271         return __ip6_del_rt(rt, NULL);
1272 }
1273
1274 static int ip6_route_del(struct fib6_config *cfg)
1275 {
1276         struct fib6_table *table;
1277         struct fib6_node *fn;
1278         struct rt6_info *rt;
1279         int err = -ESRCH;
1280
1281         table = fib6_get_table(cfg->fc_table);
1282         if (table == NULL)
1283                 return err;
1284
1285         read_lock_bh(&table->tb6_lock);
1286
1287         fn = fib6_locate(&table->tb6_root,
1288                          &cfg->fc_dst, cfg->fc_dst_len,
1289                          &cfg->fc_src, cfg->fc_src_len);
1290
1291         if (fn) {
1292                 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1293                         if (cfg->fc_ifindex &&
1294                             (rt->rt6i_dev == NULL ||
1295                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1296                                 continue;
1297                         if (cfg->fc_flags & RTF_GATEWAY &&
1298                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1299                                 continue;
1300                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1301                                 continue;
1302                         dst_hold(&rt->u.dst);
1303                         read_unlock_bh(&table->tb6_lock);
1304
1305                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1306                 }
1307         }
1308         read_unlock_bh(&table->tb6_lock);
1309
1310         return err;
1311 }
1312
1313 /*
1314  *      Handle redirects
1315  */
1316 struct ip6rd_flowi {        /* flow key extended with the gateway that sent the redirect */
1317         struct flowi fl;        /* kept first: __ip6_route_redirect() casts its struct flowi * back to this type */
1318         struct in6_addr gateway;        /* compared with rt6i_gateway of candidate routes */
1319 };
1320
1321 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1322                                              struct flowi *fl,
1323                                              int flags)
1324 {
1325         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1326         struct rt6_info *rt;
1327         struct fib6_node *fn;
1328
1329         /*
1330          * Get the "current" route for this destination and
1331          * check if the redirect has come from approriate router.
1332          *
1333          * RFC 2461 specifies that redirects should only be
1334          * accepted if they come from the nexthop to the target.
1335          * Due to the way the routes are chosen, this notion
1336          * is a bit fuzzy and one might need to check all possible
1337          * routes.
1338          */
1339
1340         read_lock_bh(&table->tb6_lock);
1341         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1342 restart:
1343         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1344                 /*
1345                  * Current route is on-link; redirect is always invalid.
1346                  *
1347                  * Seems, previous statement is not true. It could
1348                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1349                  * But then router serving it might decide, that we should
1350                  * know truth 8)8) --ANK (980726).
1351                  */
1352                 if (rt6_check_expired(rt))
1353                         continue;
1354                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1355                         continue;
1356                 if (fl->oif != rt->rt6i_dev->ifindex)
1357                         continue;
1358                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1359                         continue;
1360                 break;
1361         }
1362
1363         if (!rt)
1364                 rt = &ip6_null_entry;
1365         BACKTRACK(&fl->fl6_src);
1366 out:
1367         dst_hold(&rt->u.dst);
1368
1369         read_unlock_bh(&table->tb6_lock);
1370
1371         return rt;
1372 };
1373
1374 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1375                                            struct in6_addr *src,
1376                                            struct in6_addr *gateway,
1377                                            struct net_device *dev)
1378 {
1379         int flags = RT6_LOOKUP_F_HAS_SADDR;
1380         struct ip6rd_flowi rdfl = {
1381                 .fl = {
1382                         .oif = dev->ifindex,
1383                         .nl_u = {
1384                                 .ip6_u = {
1385                                         .daddr = *dest,
1386                                         .saddr = *src,
1387                                 },
1388                         },
1389                 },
1390                 .gateway = *gateway,
1391         };
1392
1393         if (rt6_need_strict(dest))
1394                 flags |= RT6_LOOKUP_F_IFACE;
1395
1396         return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
1397 }
1398
1399 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1400                   struct in6_addr *saddr,
1401                   struct neighbour *neigh, u8 *lladdr, int on_link)
1402 {
1403         struct rt6_info *rt, *nrt = NULL;
1404         struct netevent_redirect netevent;
1405
1406         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1407
1408         if (rt == &ip6_null_entry) {
1409                 if (net_ratelimit())
1410                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1411                                "for redirect target\n");
1412                 goto out;
1413         }
1414
1415         /*
1416          *      We have finally decided to accept it.
1417          */
1418
1419         neigh_update(neigh, lladdr, NUD_STALE,
1420                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1421                      NEIGH_UPDATE_F_OVERRIDE|
1422                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1423                                      NEIGH_UPDATE_F_ISROUTER))
1424                      );
1425
1426         /*
1427          * Redirect received -> path was valid.
1428          * Look, redirects are sent only in response to data packets,
1429          * so that this nexthop apparently is reachable. --ANK
1430          */
1431         dst_confirm(&rt->u.dst);
1432
1433         /* Duplicate redirect: silently ignore. */
1434         if (neigh == rt->u.dst.neighbour)
1435                 goto out;
1436
1437         nrt = ip6_rt_copy(rt);
1438         if (nrt == NULL)
1439                 goto out;
1440
1441         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1442         if (on_link)
1443                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1444
1445         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1446         nrt->rt6i_dst.plen = 128;
1447         nrt->u.dst.flags |= DST_HOST;
1448
1449         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1450         nrt->rt6i_nexthop = neigh_clone(neigh);
1451         /* Reset pmtu, it may be better */
1452         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1453         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1454
1455         if (ip6_ins_rt(nrt))
1456                 goto out;
1457
1458         netevent.old = &rt->u.dst;
1459         netevent.new = &nrt->u.dst;
1460         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1461
1462         if (rt->rt6i_flags&RTF_CACHE) {
1463                 ip6_del_rt(rt);
1464                 return;
1465         }
1466
1467 out:
1468         dst_release(&rt->u.dst);
1469         return;
1470 }
1471
1472 /*
1473  *      Handle ICMP "packet too big" messages
1474  *      i.e. Path MTU discovery
1475  */
1476
1477 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1478                         struct net_device *dev, u32 pmtu)
1479 {
1480         struct rt6_info *rt, *nrt;
1481         int allfrag = 0;
1482
1483         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1484         if (rt == NULL)
1485                 return;
1486
1487         if (pmtu >= dst_mtu(&rt->u.dst))
1488                 goto out;
1489
1490         if (pmtu < IPV6_MIN_MTU) {
1491                 /*
1492                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1493                  * MTU (1280) and a fragment header should always be included
1494                  * after a node receiving Too Big message reporting PMTU is
1495                  * less than the IPv6 Minimum Link MTU.
1496                  */
1497                 pmtu = IPV6_MIN_MTU;
1498                 allfrag = 1;
1499         }
1500
1501         /* New mtu received -> path was valid.
1502            They are sent only in response to data packets,
1503            so that this nexthop apparently is reachable. --ANK
1504          */
1505         dst_confirm(&rt->u.dst);
1506
1507         /* Host route. If it is static, it would be better
1508            not to override it, but add new one, so that
1509            when cache entry will expire old pmtu
1510            would return automatically.
1511          */
1512         if (rt->rt6i_flags & RTF_CACHE) {
1513                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1514                 if (allfrag)
1515                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1516                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1517                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1518                 goto out;
1519         }
1520
1521         /* Network route.
1522            Two cases are possible:
1523            1. It is connected route. Action: COW
1524            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1525          */
1526         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1527                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1528         else
1529                 nrt = rt6_alloc_clone(rt, daddr);
1530
1531         if (nrt) {
1532                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1533                 if (allfrag)
1534                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1535
1536                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1537                  * happened within 5 mins, the recommended timer is 10 mins.
1538                  * Here this route expiration time is set to ip6_rt_mtu_expires
1539                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1540                  * and detecting PMTU increase will be automatically happened.
1541                  */
1542                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1543                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1544
1545                 ip6_ins_rt(nrt);
1546         }
1547 out:
1548         dst_release(&rt->u.dst);
1549 }
1550
1551 /*
1552  *      Misc support functions
1553  */
1554
1555 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1556 {
1557         struct rt6_info *rt = ip6_dst_alloc();
1558
1559         if (rt) {
1560                 rt->u.dst.input = ort->u.dst.input;
1561                 rt->u.dst.output = ort->u.dst.output;
1562
1563                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1564                 rt->u.dst.error = ort->u.dst.error;
1565                 rt->u.dst.dev = ort->u.dst.dev;
1566                 if (rt->u.dst.dev)
1567                         dev_hold(rt->u.dst.dev);
1568                 rt->rt6i_idev = ort->rt6i_idev;
1569                 if (rt->rt6i_idev)
1570                         in6_dev_hold(rt->rt6i_idev);
1571                 rt->u.dst.lastuse = jiffies;
1572                 rt->rt6i_expires = 0;
1573
1574                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1575                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1576                 rt->rt6i_metric = 0;
1577
1578                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1579 #ifdef CONFIG_IPV6_SUBTREES
1580                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1581 #endif
1582                 rt->rt6i_table = ort->rt6i_table;
1583         }
1584         return rt;
1585 }
1586
1587 #ifdef CONFIG_IPV6_ROUTE_INFO
1588 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1589                                            struct in6_addr *gwaddr, int ifindex)
1590 {
1591         struct fib6_node *fn;
1592         struct rt6_info *rt = NULL;
1593         struct fib6_table *table;
1594
1595         table = fib6_get_table(RT6_TABLE_INFO);
1596         if (table == NULL)
1597                 return NULL;
1598
1599         write_lock_bh(&table->tb6_lock);
1600         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1601         if (!fn)
1602                 goto out;
1603
1604         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1605                 if (rt->rt6i_dev->ifindex != ifindex)
1606                         continue;
1607                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1608                         continue;
1609                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1610                         continue;
1611                 dst_hold(&rt->u.dst);
1612                 break;
1613         }
1614 out:
1615         write_unlock_bh(&table->tb6_lock);
1616         return rt;
1617 }
1618
1619 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1620                                            struct in6_addr *gwaddr, int ifindex,
1621                                            unsigned pref)
1622 {
1623         struct fib6_config cfg = {
1624                 .fc_table       = RT6_TABLE_INFO,
1625                 .fc_metric      = 1024,
1626                 .fc_ifindex     = ifindex,
1627                 .fc_dst_len     = prefixlen,
1628                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1629                                   RTF_UP | RTF_PREF(pref),
1630         };
1631
1632         ipv6_addr_copy(&cfg.fc_dst, prefix);
1633         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1634
1635         /* We should treat it as a default route if prefix length is 0. */
1636         if (!prefixlen)
1637                 cfg.fc_flags |= RTF_DEFAULT;
1638
1639         ip6_route_add(&cfg);
1640
1641         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1642 }
1643 #endif
1644
1645 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1646 {
1647         struct rt6_info *rt;
1648         struct fib6_table *table;
1649
1650         table = fib6_get_table(RT6_TABLE_DFLT);
1651         if (table == NULL)
1652                 return NULL;
1653
1654         write_lock_bh(&table->tb6_lock);
1655         for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1656                 if (dev == rt->rt6i_dev &&
1657                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1658                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1659                         break;
1660         }
1661         if (rt)
1662                 dst_hold(&rt->u.dst);
1663         write_unlock_bh(&table->tb6_lock);
1664         return rt;
1665 }
1666
1667 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1668                                      struct net_device *dev,
1669                                      unsigned int pref)
1670 {
1671         struct fib6_config cfg = {
1672                 .fc_table       = RT6_TABLE_DFLT,
1673                 .fc_metric      = 1024,
1674                 .fc_ifindex     = dev->ifindex,
1675                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1676                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1677         };
1678
1679         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1680
1681         ip6_route_add(&cfg);
1682
1683         return rt6_get_dflt_router(gwaddr, dev);
1684 }
1685
1686 void rt6_purge_dflt_routers(void)
1687 {
1688         struct rt6_info *rt;
1689         struct fib6_table *table;
1690
1691         /* NOTE: Keep consistent with rt6_get_dflt_router */
1692         table = fib6_get_table(RT6_TABLE_DFLT);
1693         if (table == NULL)
1694                 return;
1695
1696 restart:
1697         read_lock_bh(&table->tb6_lock);
1698         for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1699                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1700                         dst_hold(&rt->u.dst);
1701                         read_unlock_bh(&table->tb6_lock);
1702                         ip6_del_rt(rt);
1703                         goto restart;
1704                 }
1705         }
1706         read_unlock_bh(&table->tb6_lock);
1707 }
1708
1709 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1710                                  struct fib6_config *cfg)
1711 {
1712         memset(cfg, 0, sizeof(*cfg));
1713
1714         cfg->fc_table = RT6_TABLE_MAIN;
1715         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1716         cfg->fc_metric = rtmsg->rtmsg_metric;
1717         cfg->fc_expires = rtmsg->rtmsg_info;
1718         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1719         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1720         cfg->fc_flags = rtmsg->rtmsg_flags;
1721
1722         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1723         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1724         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1725 }
1726
1727 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1728 {
1729         struct fib6_config cfg;
1730         struct in6_rtmsg rtmsg;
1731         int err;
1732
1733         switch(cmd) {
1734         case SIOCADDRT:         /* Add a route */
1735         case SIOCDELRT:         /* Delete a route */
1736                 if (!capable(CAP_NET_ADMIN))
1737                         return -EPERM;
1738                 err = copy_from_user(&rtmsg, arg,
1739                                      sizeof(struct in6_rtmsg));
1740                 if (err)
1741                         return -EFAULT;
1742
1743                 rtmsg_to_fib6_config(&rtmsg, &cfg);
1744
1745                 rtnl_lock();
1746                 switch (cmd) {
1747                 case SIOCADDRT:
1748                         err = ip6_route_add(&cfg);
1749                         break;
1750                 case SIOCDELRT:
1751                         err = ip6_route_del(&cfg);
1752                         break;
1753                 default:
1754                         err = -EINVAL;
1755                 }
1756                 rtnl_unlock();
1757
1758                 return err;
1759         }
1760
1761         return -EINVAL;
1762 }
1763
1764 /*
1765  *      Drop the packet on the floor
1766  */
1767
1768 static inline int ip6_pkt_drop(struct sk_buff *skb, int code,
1769                                int ipstats_mib_noroutes)
1770 {
1771         int type;
1772         switch (ipstats_mib_noroutes) {
1773         case IPSTATS_MIB_INNOROUTES:
1774                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1775                 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
1776                         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS);
1777                         break;
1778                 }
1779                 /* FALLTHROUGH */
1780         case IPSTATS_MIB_OUTNOROUTES:
1781                 IP6_INC_STATS(ip6_dst_idev(skb->dst), ipstats_mib_noroutes);
1782                 break;
1783         }
1784         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1785         kfree_skb(skb);
1786         return 0;
1787 }
1788
1789 static int ip6_pkt_discard(struct sk_buff *skb)
1790 {
1791         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1792 }
1793
1794 static int ip6_pkt_discard_out(struct sk_buff *skb)
1795 {
1796         skb->dev = skb->dst->dev;
1797         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1798 }
1799
1800 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1801
1802 static int ip6_pkt_prohibit(struct sk_buff *skb)
1803 {
1804         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1805 }
1806
1807 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1808 {
1809         skb->dev = skb->dst->dev;
1810         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1811 }
1812
/*
 *	Silently discard @skb: the buffer is freed, no counter is touched and
 *	no ICMPv6 error is generated.  Always returns 0.
 */
static int ip6_pkt_blk_hole(struct sk_buff *skb)
{
	kfree_skb(skb);

	return 0;
}
1818
1819 #endif
1820
1821 /*
1822  *      Allocate a dst for local (unicast / anycast) address.
1823  */
1824
1825 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1826                                     const struct in6_addr *addr,
1827                                     int anycast)
1828 {
1829         struct rt6_info *rt = ip6_dst_alloc();
1830
1831         if (rt == NULL)
1832                 return ERR_PTR(-ENOMEM);
1833
1834         dev_hold(init_net.loopback_dev);
1835         in6_dev_hold(idev);
1836
1837         rt->u.dst.flags = DST_HOST;
1838         rt->u.dst.input = ip6_input;
1839         rt->u.dst.output = ip6_output;
1840         rt->rt6i_dev = init_net.loopback_dev;
1841         rt->rt6i_idev = idev;
1842         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1843         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1844         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1845         rt->u.dst.obsolete = -1;
1846
1847         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1848         if (anycast)
1849                 rt->rt6i_flags |= RTF_ANYCAST;
1850         else
1851                 rt->rt6i_flags |= RTF_LOCAL;
1852         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1853         if (rt->rt6i_nexthop == NULL) {
1854                 dst_free(&rt->u.dst);
1855                 return ERR_PTR(-ENOMEM);
1856         }
1857
1858         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1859         rt->rt6i_dst.plen = 128;
1860         rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1861
1862         atomic_set(&rt->u.dst.__refcnt, 1);
1863
1864         return rt;
1865 }
1866
1867 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1868 {
1869         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1870             rt != &ip6_null_entry) {
1871                 RT6_TRACE("deleted by ifdown %p\n", rt);
1872                 return -1;
1873         }
1874         return 0;
1875 }
1876
1877 void rt6_ifdown(struct net_device *dev)
1878 {
1879         fib6_clean_all(fib6_ifdown, 0, dev);
1880 }
1881
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* its new MTU */
};
1887
1888 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1889 {
1890         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1891         struct inet6_dev *idev;
1892
1893         /* In IPv6 pmtu discovery is not optional,
1894            so that RTAX_MTU lock cannot disable it.
1895            We still use this lock to block changes
1896            caused by addrconf/ndisc.
1897         */
1898
1899         idev = __in6_dev_get(arg->dev);
1900         if (idev == NULL)
1901                 return 0;
1902
1903         /* For administrative MTU increase, there is no way to discover
1904            IPv6 PMTU increase, so PMTU increase should be updated here.
1905            Since RFC 1981 doesn't include administrative MTU increase
1906            update PMTU increase is a MUST. (i.e. jumbo frame)
1907          */
1908         /*
1909            If new MTU is less than route PMTU, this new MTU will be the
1910            lowest MTU in the path, update the route PMTU to reflect PMTU
1911            decreases; if new MTU is greater than route PMTU, and the
1912            old MTU is the lowest MTU in the path, update the route PMTU
1913            to reflect the increase. In this case if the other nodes' MTU
1914            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1915            PMTU discouvery.
1916          */
1917         if (rt->rt6i_dev == arg->dev &&
1918             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1919             (dst_mtu(&rt->u.dst) > arg->mtu ||
1920              (dst_mtu(&rt->u.dst) < arg->mtu &&
1921               dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
1922                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1923                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1924         }
1925         return 0;
1926 }
1927
1928 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1929 {
1930         struct rt6_mtu_change_arg arg = {
1931                 .dev = dev,
1932                 .mtu = mtu,
1933         };
1934
1935         fib6_clean_all(rt6_mtu_change_route, 0, &arg);
1936 }
1937
1938 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
1939         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
1940         [RTA_OIF]               = { .type = NLA_U32 },
1941         [RTA_IIF]               = { .type = NLA_U32 },
1942         [RTA_PRIORITY]          = { .type = NLA_U32 },
1943         [RTA_METRICS]           = { .type = NLA_NESTED },
1944 };
1945
1946 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1947                               struct fib6_config *cfg)
1948 {
1949         struct rtmsg *rtm;
1950         struct nlattr *tb[RTA_MAX+1];
1951         int err;
1952
1953         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1954         if (err < 0)
1955                 goto errout;
1956
1957         err = -EINVAL;
1958         rtm = nlmsg_data(nlh);
1959         memset(cfg, 0, sizeof(*cfg));
1960
1961         cfg->fc_table = rtm->rtm_table;
1962         cfg->fc_dst_len = rtm->rtm_dst_len;
1963         cfg->fc_src_len = rtm->rtm_src_len;
1964         cfg->fc_flags = RTF_UP;
1965         cfg->fc_protocol = rtm->rtm_protocol;
1966
1967         if (rtm->rtm_type == RTN_UNREACHABLE)
1968                 cfg->fc_flags |= RTF_REJECT;
1969
1970         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1971         cfg->fc_nlinfo.nlh = nlh;
1972
1973         if (tb[RTA_GATEWAY]) {
1974                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1975                 cfg->fc_flags |= RTF_GATEWAY;
1976         }
1977
1978         if (tb[RTA_DST]) {
1979                 int plen = (rtm->rtm_dst_len + 7) >> 3;
1980
1981                 if (nla_len(tb[RTA_DST]) < plen)
1982                         goto errout;
1983
1984                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1985         }
1986
1987         if (tb[RTA_SRC]) {
1988                 int plen = (rtm->rtm_src_len + 7) >> 3;
1989
1990                 if (nla_len(tb[RTA_SRC]) < plen)
1991                         goto errout;
1992
1993                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1994         }
1995
1996         if (tb[RTA_OIF])
1997                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1998
1999         if (tb[RTA_PRIORITY])
2000                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2001
2002         if (tb[RTA_METRICS]) {
2003                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2004                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2005         }
2006
2007         if (tb[RTA_TABLE])
2008                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2009
2010         err = 0;
2011 errout:
2012         return err;
2013 }
2014
2015 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2016 {
2017         struct fib6_config cfg;
2018         int err;
2019
2020         err = rtm_to_fib6_config(skb, nlh, &cfg);
2021         if (err < 0)
2022                 return err;
2023
2024         return ip6_route_del(&cfg);
2025 }
2026
2027 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2028 {
2029         struct fib6_config cfg;
2030         int err;
2031
2032         err = rtm_to_fib6_config(skb, nlh, &cfg);
2033         if (err < 0)
2034                 return err;
2035
2036         return ip6_route_add(&cfg);
2037 }
2038
2039 static inline size_t rt6_nlmsg_size(void)
2040 {
2041         return NLMSG_ALIGN(sizeof(struct rtmsg))
2042                + nla_total_size(16) /* RTA_SRC */
2043                + nla_total_size(16) /* RTA_DST */
2044                + nla_total_size(16) /* RTA_GATEWAY */
2045                + nla_total_size(16) /* RTA_PREFSRC */
2046                + nla_total_size(4) /* RTA_TABLE */
2047                + nla_total_size(4) /* RTA_IIF */
2048                + nla_total_size(4) /* RTA_OIF */
2049                + nla_total_size(4) /* RTA_PRIORITY */
2050                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2051                + nla_total_size(sizeof(struct rta_cacheinfo));
2052 }
2053
2054 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
2055                          struct in6_addr *dst, struct in6_addr *src,
2056                          int iif, int type, u32 pid, u32 seq,
2057                          int prefix, unsigned int flags)
2058 {
2059         struct rtmsg *rtm;
2060         struct nlmsghdr *nlh;
2061         long expires;
2062         u32 table;
2063
2064         if (prefix) {   /* user wants prefix routes only */
2065                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2066                         /* success since this is not a prefix route */
2067                         return 1;
2068                 }
2069         }
2070
2071         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2072         if (nlh == NULL)
2073                 return -EMSGSIZE;
2074
2075         rtm = nlmsg_data(nlh);
2076         rtm->rtm_family = AF_INET6;
2077         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2078         rtm->rtm_src_len = rt->rt6i_src.plen;
2079         rtm->rtm_tos = 0;
2080         if (rt->rt6i_table)
2081                 table = rt->rt6i_table->tb6_id;
2082         else
2083                 table = RT6_TABLE_UNSPEC;
2084         rtm->rtm_table = table;
2085         NLA_PUT_U32(skb, RTA_TABLE, table);
2086         if (rt->rt6i_flags&RTF_REJECT)
2087                 rtm->rtm_type = RTN_UNREACHABLE;
2088         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2089                 rtm->rtm_type = RTN_LOCAL;
2090         else
2091                 rtm->rtm_type = RTN_UNICAST;
2092         rtm->rtm_flags = 0;
2093         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2094         rtm->rtm_protocol = rt->rt6i_protocol;
2095         if (rt->rt6i_flags&RTF_DYNAMIC)
2096                 rtm->rtm_protocol = RTPROT_REDIRECT;
2097         else if (rt->rt6i_flags & RTF_ADDRCONF)
2098                 rtm->rtm_protocol = RTPROT_KERNEL;
2099         else if (rt->rt6i_flags&RTF_DEFAULT)
2100                 rtm->rtm_protocol = RTPROT_RA;
2101
2102         if (rt->rt6i_flags&RTF_CACHE)
2103                 rtm->rtm_flags |= RTM_F_CLONED;
2104
2105         if (dst) {
2106                 NLA_PUT(skb, RTA_DST, 16, dst);
2107                 rtm->rtm_dst_len = 128;
2108         } else if (rtm->rtm_dst_len)
2109                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2110 #ifdef CONFIG_IPV6_SUBTREES
2111         if (src) {
2112                 NLA_PUT(skb, RTA_SRC, 16, src);
2113                 rtm->rtm_src_len = 128;
2114         } else if (rtm->rtm_src_len)
2115                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2116 #endif
2117         if (iif)
2118                 NLA_PUT_U32(skb, RTA_IIF, iif);
2119         else if (dst) {
2120                 struct in6_addr saddr_buf;
2121                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
2122                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2123         }
2124
2125         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2126                 goto nla_put_failure;
2127
2128         if (rt->u.dst.neighbour)
2129                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2130
2131         if (rt->u.dst.dev)
2132                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2133
2134         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2135
2136         expires = rt->rt6i_expires ? rt->rt6i_expires - jiffies : 0;
2137         if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2138                                expires, rt->u.dst.error) < 0)
2139                 goto nla_put_failure;
2140
2141         return nlmsg_end(skb, nlh);
2142
2143 nla_put_failure:
2144         nlmsg_cancel(skb, nlh);
2145         return -EMSGSIZE;
2146 }
2147
2148 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2149 {
2150         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2151         int prefix;
2152
2153         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2154                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2155                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2156         } else
2157                 prefix = 0;
2158
2159         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2160                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2161                      prefix, NLM_F_MULTI);
2162 }
2163
2164 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2165 {
2166         struct nlattr *tb[RTA_MAX+1];
2167         struct rt6_info *rt;
2168         struct sk_buff *skb;
2169         struct rtmsg *rtm;
2170         struct flowi fl;
2171         int err, iif = 0;
2172
2173         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2174         if (err < 0)
2175                 goto errout;
2176
2177         err = -EINVAL;
2178         memset(&fl, 0, sizeof(fl));
2179
2180         if (tb[RTA_SRC]) {
2181                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2182                         goto errout;
2183
2184                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2185         }
2186
2187         if (tb[RTA_DST]) {
2188                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2189                         goto errout;
2190
2191                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2192         }
2193
2194         if (tb[RTA_IIF])
2195                 iif = nla_get_u32(tb[RTA_IIF]);
2196
2197         if (tb[RTA_OIF])
2198                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2199
2200         if (iif) {
2201                 struct net_device *dev;
2202                 dev = __dev_get_by_index(&init_net, iif);
2203                 if (!dev) {
2204                         err = -ENODEV;
2205                         goto errout;
2206                 }
2207         }
2208
2209         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2210         if (skb == NULL) {
2211                 err = -ENOBUFS;
2212                 goto errout;
2213         }
2214
2215         /* Reserve room for dummy headers, this skb can pass
2216            through good chunk of routing engine.
2217          */
2218         skb_reset_mac_header(skb);
2219         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2220
2221         rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2222         skb->dst = &rt->u.dst;
2223
2224         err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2225                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2226                             nlh->nlmsg_seq, 0, 0);
2227         if (err < 0) {
2228                 kfree_skb(skb);
2229                 goto errout;
2230         }
2231
2232         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2233 errout:
2234         return err;
2235 }
2236
2237 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2238 {
2239         struct sk_buff *skb;
2240         u32 pid = 0, seq = 0;
2241         struct nlmsghdr *nlh = NULL;
2242         int err = -ENOBUFS;
2243
2244         if (info) {
2245                 pid = info->pid;
2246                 nlh = info->nlh;
2247                 if (nlh)
2248                         seq = nlh->nlmsg_seq;
2249         }
2250
2251         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2252         if (skb == NULL)
2253                 goto errout;
2254
2255         err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2256         if (err < 0) {
2257                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2258                 WARN_ON(err == -EMSGSIZE);
2259                 kfree_skb(skb);
2260                 goto errout;
2261         }
2262         err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2263 errout:
2264         if (err < 0)
2265                 rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2266 }
2267
2268 /*
2269  *      /proc
2270  */
2271
2272 #ifdef CONFIG_PROC_FS
2273
2274 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2275
/*
 *	Buffer/offset bookkeeping for a /proc reader.
 *	NOTE(review): nothing in the visible seq_file based code references
 *	this structure; presumably a leftover from the pre-seq_file reader.
 *	Confirm against the rest of the file before removing it.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2284
2285 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2286 {
2287         struct seq_file *m = p_arg;
2288
2289         seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_dst.addr),
2290                    rt->rt6i_dst.plen);
2291
2292 #ifdef CONFIG_IPV6_SUBTREES
2293         seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_src.addr),
2294                    rt->rt6i_src.plen);
2295 #else
2296         seq_puts(m, "00000000000000000000000000000000 00 ");
2297 #endif
2298
2299         if (rt->rt6i_nexthop) {
2300                 seq_printf(m, NIP6_SEQFMT,
2301                            NIP6(*((struct in6_addr *)rt->rt6i_nexthop->primary_key)));
2302         } else {
2303                 seq_puts(m, "00000000000000000000000000000000");
2304         }
2305         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2306                    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2307                    rt->u.dst.__use, rt->rt6i_flags,
2308                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2309         return 0;
2310 }
2311
2312 static int ipv6_route_show(struct seq_file *m, void *v)
2313 {
2314         fib6_clean_all(rt6_info_route, 0, m);
2315         return 0;
2316 }
2317
2318 static int ipv6_route_open(struct inode *inode, struct file *file)
2319 {
2320         return single_open(file, ipv6_route_show, NULL);
2321 }
2322
2323 static const struct file_operations ipv6_route_proc_fops = {
2324         .owner          = THIS_MODULE,
2325         .open           = ipv6_route_open,
2326         .read           = seq_read,
2327         .llseek         = seq_lseek,
2328         .release        = single_release,
2329 };
2330
2331 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2332 {
2333         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2334                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2335                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2336                       rt6_stats.fib_rt_cache,
2337                       atomic_read(&ip6_dst_ops.entries),
2338                       rt6_stats.fib_discarded_routes);
2339
2340         return 0;
2341 }
2342
2343 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2344 {
2345         return single_open(file, rt6_stats_seq_show, NULL);
2346 }
2347
2348 static const struct file_operations rt6_stats_seq_fops = {
2349         .owner   = THIS_MODULE,
2350         .open    = rt6_stats_seq_open,
2351         .read    = seq_read,
2352         .llseek  = seq_lseek,
2353         .release = single_release,
2354 };
2355 #endif  /* CONFIG_PROC_FS */
2356
2357 #ifdef CONFIG_SYSCTL
2358
2359 static int flush_delay;
2360
2361 static
2362 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2363                               void __user *buffer, size_t *lenp, loff_t *ppos)
2364 {
2365         if (write) {
2366                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2367                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2368                 return 0;
2369         } else
2370                 return -EINVAL;
2371 }
2372
2373 ctl_table ipv6_route_table[] = {
2374         {
2375                 .procname       =       "flush",
2376                 .data           =       &flush_delay,
2377                 .maxlen         =       sizeof(int),
2378                 .mode           =       0200,
2379                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2380         },
2381         {
2382                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2383                 .procname       =       "gc_thresh",
2384                 .data           =       &ip6_dst_ops.gc_thresh,
2385                 .maxlen         =       sizeof(int),
2386                 .mode           =       0644,
2387                 .proc_handler   =       &proc_dointvec,
2388         },
2389         {
2390                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2391                 .procname       =       "max_size",
2392                 .data           =       &ip6_rt_max_size,
2393                 .maxlen         =       sizeof(int),
2394                 .mode           =       0644,
2395                 .proc_handler   =       &proc_dointvec,
2396         },
2397         {
2398                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2399                 .procname       =       "gc_min_interval",
2400                 .data           =       &ip6_rt_gc_min_interval,
2401                 .maxlen         =       sizeof(int),
2402                 .mode           =       0644,
2403                 .proc_handler   =       &proc_dointvec_jiffies,
2404                 .strategy       =       &sysctl_jiffies,
2405         },
2406         {
2407                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2408                 .procname       =       "gc_timeout",
2409                 .data           =       &ip6_rt_gc_timeout,
2410                 .maxlen         =       sizeof(int),
2411                 .mode           =       0644,
2412                 .proc_handler   =       &proc_dointvec_jiffies,
2413                 .strategy       =       &sysctl_jiffies,
2414         },
2415         {
2416                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2417                 .procname       =       "gc_interval",
2418                 .data           =       &ip6_rt_gc_interval,
2419                 .maxlen         =       sizeof(int),
2420                 .mode           =       0644,
2421                 .proc_handler   =       &proc_dointvec_jiffies,
2422                 .strategy       =       &sysctl_jiffies,
2423         },
2424         {
2425                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2426                 .procname       =       "gc_elasticity",
2427                 .data           =       &ip6_rt_gc_elasticity,
2428                 .maxlen         =       sizeof(int),
2429                 .mode           =       0644,
2430                 .proc_handler   =       &proc_dointvec_jiffies,
2431                 .strategy       =       &sysctl_jiffies,
2432         },
2433         {
2434                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2435                 .procname       =       "mtu_expires",
2436                 .data           =       &ip6_rt_mtu_expires,
2437                 .maxlen         =       sizeof(int),
2438                 .mode           =       0644,
2439                 .proc_handler   =       &proc_dointvec_jiffies,
2440                 .strategy       =       &sysctl_jiffies,
2441         },
2442         {
2443                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2444                 .procname       =       "min_adv_mss",
2445                 .data           =       &ip6_rt_min_advmss,
2446                 .maxlen         =       sizeof(int),
2447                 .mode           =       0644,
2448                 .proc_handler   =       &proc_dointvec_jiffies,
2449                 .strategy       =       &sysctl_jiffies,
2450         },
2451         {
2452                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2453                 .procname       =       "gc_min_interval_ms",
2454                 .data           =       &ip6_rt_gc_min_interval,
2455                 .maxlen         =       sizeof(int),
2456                 .mode           =       0644,
2457                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2458                 .strategy       =       &sysctl_ms_jiffies,
2459         },
2460         { .ctl_name = 0 }
2461 };
2462
2463 #endif
2464
2465 void __init ip6_route_init(void)
2466 {
2467         ip6_dst_ops.kmem_cachep =
2468                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2469                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2470         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops.kmem_cachep;
2471
2472         fib6_init();
2473         proc_net_fops_create(&init_net, "ipv6_route", 0, &ipv6_route_proc_fops);
2474         proc_net_fops_create(&init_net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2475 #ifdef CONFIG_XFRM
2476         xfrm6_init();
2477 #endif
2478 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2479         fib6_rules_init();
2480 #endif
2481
2482         __rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL);
2483         __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL);
2484         __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL);
2485 }
2486
2487 void ip6_route_cleanup(void)
2488 {
2489 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2490         fib6_rules_cleanup();
2491 #endif
2492 #ifdef CONFIG_PROC_FS
2493         proc_net_remove(&init_net, "ipv6_route");
2494         proc_net_remove(&init_net, "rt6_stats");
2495 #endif
2496 #ifdef CONFIG_XFRM
2497         xfrm6_fini();
2498 #endif
2499         rt6_ifdown(NULL);
2500         fib6_gc_cleanup();
2501         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2502 }