]> bbs.cooldavid.org Git - net-next-2.6.git/blob - net/ipv6/route.c
Merge branch 'upstream-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/linvil...
[net-next-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/init.h>
38 #include <linux/netlink.h>
39 #include <linux/if_arp.h>
40
41 #ifdef  CONFIG_PROC_FS
42 #include <linux/proc_fs.h>
43 #include <linux/seq_file.h>
44 #endif
45
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug verbosity for this file.  With RT6_DEBUG at 3 or above, RDBG()
 * and RT6_TRACE() expand to printk() calls; below that both expand to
 * nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG >= 3
#define RDBG(x)			printk x
#define RT6_TRACE(x...)		printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...)		do { ; } while (0)
#endif

/* Non-zero compiles in the rt6_alloc_clone() branch of the lookup paths. */
#define CLONE_OFFLINK_ROUTE 0

/*
 * Bits carried in the "strict" argument of rt6_select() and
 * rt6_score_route():
 *   _IFACE     - discard routes for which rt6_check_dev() returns 0
 *   _REACHABLE - discard routes for which rt6_check_neigh() returns 0
 */
#define RT6_SELECT_F_IFACE	0x1
#define RT6_SELECT_F_REACHABLE	0x2
79
80 static int ip6_rt_max_size = 4096;
81 static int ip6_rt_gc_min_interval = HZ / 2;
82 static int ip6_rt_gc_timeout = 60*HZ;
83 int ip6_rt_gc_interval = 30*HZ;
84 static int ip6_rt_gc_elasticity = 9;
85 static int ip6_rt_mtu_expires = 10*60*HZ;
86 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87
88 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(void);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct sk_buff *skb);
98 static void             ip6_link_failure(struct sk_buff *skb);
99 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103                                            struct in6_addr *gwaddr, int ifindex,
104                                            unsigned pref);
105 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106                                            struct in6_addr *gwaddr, int ifindex);
107 #endif
108
109 static struct dst_ops ip6_dst_ops = {
110         .family                 =       AF_INET6,
111         .protocol               =       __constant_htons(ETH_P_IPV6),
112         .gc                     =       ip6_dst_gc,
113         .gc_thresh              =       1024,
114         .check                  =       ip6_dst_check,
115         .destroy                =       ip6_dst_destroy,
116         .ifdown                 =       ip6_dst_ifdown,
117         .negative_advice        =       ip6_negative_advice,
118         .link_failure           =       ip6_link_failure,
119         .update_pmtu            =       ip6_rt_update_pmtu,
120         .entry_size             =       sizeof(struct rt6_info),
121 };
122
123 struct rt6_info ip6_null_entry = {
124         .u = {
125                 .dst = {
126                         .__refcnt       = ATOMIC_INIT(1),
127                         .__use          = 1,
128                         .dev            = &loopback_dev,
129                         .obsolete       = -1,
130                         .error          = -ENETUNREACH,
131                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
132                         .input          = ip6_pkt_discard,
133                         .output         = ip6_pkt_discard_out,
134                         .ops            = &ip6_dst_ops,
135                         .path           = (struct dst_entry*)&ip6_null_entry,
136                 }
137         },
138         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
139         .rt6i_metric    = ~(u32) 0,
140         .rt6i_ref       = ATOMIC_INIT(1),
141 };
142
143 struct fib6_node ip6_routing_table = {
144         .leaf           = &ip6_null_entry,
145         .fn_flags       = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
146 };
147
148 /* Protects all the ip6 fib */
149
150 DEFINE_RWLOCK(rt6_lock);
151
152
153 /* allocate dst with ip6_dst_ops */
154 static __inline__ struct rt6_info *ip6_dst_alloc(void)
155 {
156         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
157 }
158
159 static void ip6_dst_destroy(struct dst_entry *dst)
160 {
161         struct rt6_info *rt = (struct rt6_info *)dst;
162         struct inet6_dev *idev = rt->rt6i_idev;
163
164         if (idev != NULL) {
165                 rt->rt6i_idev = NULL;
166                 in6_dev_put(idev);
167         }       
168 }
169
170 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
171                            int how)
172 {
173         struct rt6_info *rt = (struct rt6_info *)dst;
174         struct inet6_dev *idev = rt->rt6i_idev;
175
176         if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
177                 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
178                 if (loopback_idev != NULL) {
179                         rt->rt6i_idev = loopback_idev;
180                         in6_dev_put(idev);
181                 }
182         }
183 }
184
185 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
186 {
187         return (rt->rt6i_flags & RTF_EXPIRES &&
188                 time_after(jiffies, rt->rt6i_expires));
189 }
190
191 /*
192  *      Route lookup. Any rt6_lock is implied.
193  */
194
195 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
196                                                     int oif,
197                                                     int strict)
198 {
199         struct rt6_info *local = NULL;
200         struct rt6_info *sprt;
201
202         if (oif) {
203                 for (sprt = rt; sprt; sprt = sprt->u.next) {
204                         struct net_device *dev = sprt->rt6i_dev;
205                         if (dev->ifindex == oif)
206                                 return sprt;
207                         if (dev->flags & IFF_LOOPBACK) {
208                                 if (sprt->rt6i_idev == NULL ||
209                                     sprt->rt6i_idev->dev->ifindex != oif) {
210                                         if (strict && oif)
211                                                 continue;
212                                         if (local && (!oif || 
213                                                       local->rt6i_idev->dev->ifindex == oif))
214                                                 continue;
215                                 }
216                                 local = sprt;
217                         }
218                 }
219
220                 if (local)
221                         return local;
222
223                 if (strict)
224                         return &ip6_null_entry;
225         }
226         return rt;
227 }
228
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * If @rt has a next-hop neighbour that is not in a NUD_VALID state and
 * the last update of that neighbour is older than the interface's
 * rtr_probe_interval, stamp the neighbour and send a Neighbour
 * Solicitation for it to its solicited-node multicast address.
 * A NULL @rt, or a route without next hop, is silently ignored.
 *
 * The original author's note: probing MUST be rate-limited to no more
 * than one per minute.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;
	int due;

	/* unlocked quick check; repeated under the lock below */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	read_lock_bh(&neigh->lock);
	due = !(neigh->nud_state & NUD_VALID) &&
	      time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
	if (due)
		neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	if (!due)
		return;

	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Router preference support is compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
264
265 /*
266  * Default Router Selection (RFC 2461 6.3.6)
267  */
268 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
269 {
270         struct net_device *dev = rt->rt6i_dev;
271         if (!oif || dev->ifindex == oif)
272                 return 2;
273         if ((dev->flags & IFF_LOOPBACK) &&
274             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
275                 return 1;
276         return 0;
277 }
278
279 static int inline rt6_check_neigh(struct rt6_info *rt)
280 {
281         struct neighbour *neigh = rt->rt6i_nexthop;
282         int m = 0;
283         if (rt->rt6i_flags & RTF_NONEXTHOP ||
284             !(rt->rt6i_flags & RTF_GATEWAY))
285                 m = 1;
286         else if (neigh) {
287                 read_lock_bh(&neigh->lock);
288                 if (neigh->nud_state & NUD_VALID)
289                         m = 2;
290                 read_unlock_bh(&neigh->lock);
291         }
292         return m;
293 }
294
295 static int rt6_score_route(struct rt6_info *rt, int oif,
296                            int strict)
297 {
298         int m, n;
299                 
300         m = rt6_check_dev(rt, oif);
301         if (!m && (strict & RT6_SELECT_F_IFACE))
302                 return -1;
303 #ifdef CONFIG_IPV6_ROUTER_PREF
304         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
305 #endif
306         n = rt6_check_neigh(rt);
307         if (n > 1)
308                 m |= 16;
309         else if (!n && strict & RT6_SELECT_F_REACHABLE)
310                 return -1;
311         return m;
312 }
313
314 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
315                                    int strict)
316 {
317         struct rt6_info *match = NULL, *last = NULL;
318         struct rt6_info *rt, *rt0 = *head;
319         u32 metric;
320         int mpri = -1;
321
322         RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
323                   __FUNCTION__, head, head ? *head : NULL, oif);
324
325         for (rt = rt0, metric = rt0->rt6i_metric;
326              rt && rt->rt6i_metric == metric && (!last || rt != rt0);
327              rt = rt->u.next) {
328                 int m;
329
330                 if (rt6_check_expired(rt))
331                         continue;
332
333                 last = rt;
334
335                 m = rt6_score_route(rt, oif, strict);
336                 if (m < 0)
337                         continue;
338
339                 if (m > mpri) {
340                         rt6_probe(match);
341                         match = rt;
342                         mpri = m;
343                 } else {
344                         rt6_probe(rt);
345                 }
346         }
347
348         if (!match &&
349             (strict & RT6_SELECT_F_REACHABLE) &&
350             last && last != rt0) {
351                 /* no entries matched; do round-robin */
352                 static DEFINE_SPINLOCK(lock);
353                 spin_lock(&lock);
354                 *head = rt0->u.next;
355                 rt0->u.next = last->u.next;
356                 last->u.next = rt0;
357                 spin_unlock(&lock);
358         }
359
360         RT6_TRACE("%s() => %p, score=%d\n",
361                   __FUNCTION__, match, mpri);
362
363         return (match ? match : &ip6_null_entry);
364 }
365
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process one route information option.
 *
 * @dev:    device the option was received on (its ifindex keys the route)
 * @opt:    start of the option, laid out as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising gateway
 *
 * Depending on the advertised lifetime the matching route (looked up
 * with rt6_get_route_info()) is deleted (lifetime 0), created, or has
 * its preference and expiry refreshed.  A lifetime of 0xffffffff means
 * "never expires".
 *
 * Returns 0 on success, -EINVAL if the option is too short or its
 * length/prefix_len fields are inconsistent.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	u32 lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/*
	 * Sanity check for prefix_len and length: the option length must
	 * be large enough to carry the number of prefix bits it claims.
	 */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		pref = ICMPV6_ROUTER_PREF_MEDIUM;

	/* the field arrives in network byte order: convert net -> host */
	lifetime = ntohl(rinfo->lifetime);
	if (lifetime == 0xffffffff) {
		/* infinity */
	} else if (lifetime > 0x7fffffff/HZ) {
		/* Avoid arithmetic overflow in HZ * lifetime below */
		lifetime = 0x7fffffff/HZ - 1;
	}

	if (rinfo->length == 3)
		/* full 128-bit prefix present: use it in place */
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(rt, NULL, NULL, NULL);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		/* existing route: only the preference bits are replaced */
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (lifetime == 0xffffffff) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		dst_release(&rt->u.dst);
	}
	return 0;
}
#endif
443
444 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
445                             int oif, int strict)
446 {
447         struct fib6_node *fn;
448         struct rt6_info *rt;
449
450         read_lock_bh(&rt6_lock);
451         fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
452         rt = rt6_device_match(fn->leaf, oif, strict);
453         dst_hold(&rt->u.dst);
454         rt->u.dst.__use++;
455         read_unlock_bh(&rt6_lock);
456
457         rt->u.dst.lastuse = jiffies;
458         if (rt->u.dst.error == 0)
459                 return rt;
460         dst_release(&rt->u.dst);
461         return NULL;
462 }
463
464 /* ip6_ins_rt is called with FREE rt6_lock.
465    It takes new route entry, the addition fails by any reason the
466    route is freed. In any case, if caller does not hold it, it may
467    be destroyed.
468  */
469
470 int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
471                 void *_rtattr, struct netlink_skb_parms *req)
472 {
473         int err;
474
475         write_lock_bh(&rt6_lock);
476         err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
477         write_unlock_bh(&rt6_lock);
478
479         return err;
480 }
481
482 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
483                                       struct in6_addr *saddr)
484 {
485         struct rt6_info *rt;
486
487         /*
488          *      Clone the route.
489          */
490
491         rt = ip6_rt_copy(ort);
492
493         if (rt) {
494                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
495                         if (rt->rt6i_dst.plen != 128 &&
496                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
497                                 rt->rt6i_flags |= RTF_ANYCAST;
498                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
499                 }
500
501                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
502                 rt->rt6i_dst.plen = 128;
503                 rt->rt6i_flags |= RTF_CACHE;
504                 rt->u.dst.flags |= DST_HOST;
505
506 #ifdef CONFIG_IPV6_SUBTREES
507                 if (rt->rt6i_src.plen && saddr) {
508                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
509                         rt->rt6i_src.plen = 128;
510                 }
511 #endif
512
513                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
514
515         }
516
517         return rt;
518 }
519
520 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
521 {
522         struct rt6_info *rt = ip6_rt_copy(ort);
523         if (rt) {
524                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
525                 rt->rt6i_dst.plen = 128;
526                 rt->rt6i_flags |= RTF_CACHE;
527                 if (rt->rt6i_flags & RTF_REJECT)
528                         rt->u.dst.error = ort->u.dst.error;
529                 rt->u.dst.flags |= DST_HOST;
530                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
531         }
532         return rt;
533 }
534
/*
 * BACKTRACK() - climb the FIB tree after a failed selection.
 *
 * Expects, in the expanding function, the variables "rt" and "fn" and
 * the labels "out" and "restart".  When rt is &ip6_null_entry, fn is
 * walked up through its parents:
 *   - reaching a node flagged RTN_ROOT jumps to "out";
 *   - reaching a node flagged RTN_RTINFO jumps to "restart" (fn now
 *     points at that node);
 *   - running out of parents falls through with fn == NULL.
 * When rt is anything else the macro does nothing.
 *
 * Wrapped in do { } while (0) so that "BACKTRACK();" is exactly one
 * statement and stays safe inside an unbraced if/else.
 */
#define BACKTRACK() \
do { \
	if (rt == &ip6_null_entry) { \
		while ((fn = fn->parent) != NULL) { \
			if (fn->fn_flags & RTN_ROOT) \
				goto out; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
545
546
547 void ip6_route_input(struct sk_buff *skb)
548 {
549         struct fib6_node *fn;
550         struct rt6_info *rt, *nrt;
551         int strict;
552         int attempts = 3;
553         int err;
554         int reachable = RT6_SELECT_F_REACHABLE;
555
556         strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
557
558 relookup:
559         read_lock_bh(&rt6_lock);
560
561 restart_2:
562         fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
563                          &skb->nh.ipv6h->saddr);
564
565 restart:
566         rt = rt6_select(&fn->leaf, skb->dev->ifindex, strict | reachable);
567         BACKTRACK();
568         if (rt == &ip6_null_entry ||
569             rt->rt6i_flags & RTF_CACHE)
570                 goto out;
571
572         dst_hold(&rt->u.dst);
573         read_unlock_bh(&rt6_lock);
574
575         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
576                 nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
577         else {
578 #if CLONE_OFFLINK_ROUTE
579                 nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
580 #else
581                 goto out2;
582 #endif
583         }
584
585         dst_release(&rt->u.dst);
586         rt = nrt ? : &ip6_null_entry;
587
588         dst_hold(&rt->u.dst);
589         if (nrt) {
590                 err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
591                 if (!err)
592                         goto out2;
593         }
594
595         if (--attempts <= 0)
596                 goto out2;
597
598         /*
599          * Race condition! In the gap, when rt6_lock was
600          * released someone could insert this route.  Relookup.
601          */
602         dst_release(&rt->u.dst);
603         goto relookup;
604
605 out:
606         if (reachable) {
607                 reachable = 0;
608                 goto restart_2;
609         }
610         dst_hold(&rt->u.dst);
611         read_unlock_bh(&rt6_lock);
612 out2:
613         rt->u.dst.lastuse = jiffies;
614         rt->u.dst.__use++;
615         skb->dst = (struct dst_entry *) rt;
616         return;
617 }
618
619 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
620 {
621         struct fib6_node *fn;
622         struct rt6_info *rt, *nrt;
623         int strict;
624         int attempts = 3;
625         int err;
626         int reachable = RT6_SELECT_F_REACHABLE;
627
628         strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
629
630 relookup:
631         read_lock_bh(&rt6_lock);
632
633 restart_2:
634         fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
635
636 restart:
637         rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
638         BACKTRACK();
639         if (rt == &ip6_null_entry ||
640             rt->rt6i_flags & RTF_CACHE)
641                 goto out;
642
643         dst_hold(&rt->u.dst);
644         read_unlock_bh(&rt6_lock);
645
646         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
647                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
648         else {
649 #if CLONE_OFFLINK_ROUTE
650                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
651 #else
652                 goto out2;
653 #endif
654         }
655
656         dst_release(&rt->u.dst);
657         rt = nrt ? : &ip6_null_entry;
658
659         dst_hold(&rt->u.dst);
660         if (nrt) {
661                 err = ip6_ins_rt(nrt, NULL, NULL, NULL);
662                 if (!err)
663                         goto out2;
664         }
665
666         if (--attempts <= 0)
667                 goto out2;
668
669         /*
670          * Race condition! In the gap, when rt6_lock was
671          * released someone could insert this route.  Relookup.
672          */
673         dst_release(&rt->u.dst);
674         goto relookup;
675
676 out:
677         if (reachable) {
678                 reachable = 0;
679                 goto restart_2;
680         }
681         dst_hold(&rt->u.dst);
682         read_unlock_bh(&rt6_lock);
683 out2:
684         rt->u.dst.lastuse = jiffies;
685         rt->u.dst.__use++;
686         return &rt->u.dst;
687 }
688
689
690 /*
691  *      Destination cache support functions
692  */
693
694 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
695 {
696         struct rt6_info *rt;
697
698         rt = (struct rt6_info *) dst;
699
700         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
701                 return dst;
702
703         return NULL;
704 }
705
706 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
707 {
708         struct rt6_info *rt = (struct rt6_info *) dst;
709
710         if (rt) {
711                 if (rt->rt6i_flags & RTF_CACHE)
712                         ip6_del_rt(rt, NULL, NULL, NULL);
713                 else
714                         dst_release(dst);
715         }
716         return NULL;
717 }
718
719 static void ip6_link_failure(struct sk_buff *skb)
720 {
721         struct rt6_info *rt;
722
723         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
724
725         rt = (struct rt6_info *) skb->dst;
726         if (rt) {
727                 if (rt->rt6i_flags&RTF_CACHE) {
728                         dst_set_expires(&rt->u.dst, 0);
729                         rt->rt6i_flags |= RTF_EXPIRES;
730                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
731                         rt->rt6i_node->fn_sernum = -1;
732         }
733 }
734
735 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
736 {
737         struct rt6_info *rt6 = (struct rt6_info*)dst;
738
739         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
740                 rt6->rt6i_flags |= RTF_MODIFIED;
741                 if (mtu < IPV6_MIN_MTU) {
742                         mtu = IPV6_MIN_MTU;
743                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
744                 }
745                 dst->metrics[RTAX_MTU-1] = mtu;
746                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
747         }
748 }
749
/*
 * Singly linked list (through dst->next) of the entries created by
 * ndisc_dst_alloc(); reaped by ndisc_dst_gc().
 * Protected by rt6_lock.
 */
static struct dst_entry *ndisc_dst_gc_list;

static int ipv6_get_mtu(struct net_device *dev);
753
754 static inline unsigned int ipv6_advmss(unsigned int mtu)
755 {
756         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
757
758         if (mtu < ip6_rt_min_advmss)
759                 mtu = ip6_rt_min_advmss;
760
761         /*
762          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 
763          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
764          * IPV6_MAXPLEN is also valid and means: "any MSS, 
765          * rely only on pmtu discovery"
766          */
767         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
768                 mtu = IPV6_MAXPLEN;
769         return mtu;
770 }
771
772 struct dst_entry *ndisc_dst_alloc(struct net_device *dev, 
773                                   struct neighbour *neigh,
774                                   struct in6_addr *addr,
775                                   int (*output)(struct sk_buff *))
776 {
777         struct rt6_info *rt;
778         struct inet6_dev *idev = in6_dev_get(dev);
779
780         if (unlikely(idev == NULL))
781                 return NULL;
782
783         rt = ip6_dst_alloc();
784         if (unlikely(rt == NULL)) {
785                 in6_dev_put(idev);
786                 goto out;
787         }
788
789         dev_hold(dev);
790         if (neigh)
791                 neigh_hold(neigh);
792         else
793                 neigh = ndisc_get_neigh(dev, addr);
794
795         rt->rt6i_dev      = dev;
796         rt->rt6i_idev     = idev;
797         rt->rt6i_nexthop  = neigh;
798         atomic_set(&rt->u.dst.__refcnt, 1);
799         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
800         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
801         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
802         rt->u.dst.output  = output;
803
804 #if 0   /* there's no chance to use these for ndisc */
805         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST 
806                                 ? DST_HOST 
807                                 : 0;
808         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
809         rt->rt6i_dst.plen = 128;
810 #endif
811
812         write_lock_bh(&rt6_lock);
813         rt->u.dst.next = ndisc_dst_gc_list;
814         ndisc_dst_gc_list = &rt->u.dst;
815         write_unlock_bh(&rt6_lock);
816
817         fib6_force_start_gc();
818
819 out:
820         return (struct dst_entry *)rt;
821 }
822
823 int ndisc_dst_gc(int *more)
824 {
825         struct dst_entry *dst, *next, **pprev;
826         int freed;
827
828         next = NULL;
829         pprev = &ndisc_dst_gc_list;
830         freed = 0;
831         while ((dst = *pprev) != NULL) {
832                 if (!atomic_read(&dst->__refcnt)) {
833                         *pprev = dst->next;
834                         dst_free(dst);
835                         freed++;
836                 } else {
837                         pprev = &dst->next;
838                         (*more)++;
839                 }
840         }
841
842         return freed;
843 }
844
845 static int ip6_dst_gc(void)
846 {
847         static unsigned expire = 30*HZ;
848         static unsigned long last_gc;
849         unsigned long now = jiffies;
850
851         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
852             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
853                 goto out;
854
855         expire++;
856         fib6_run_gc(expire);
857         last_gc = now;
858         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
859                 expire = ip6_rt_gc_timeout>>1;
860
861 out:
862         expire -= expire>>ip6_rt_gc_elasticity;
863         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
864 }
865
866 /* Clean host part of a prefix. Not necessary in radix tree,
867    but results in cleaner routing tables.
868
869    Remove it only when all the things will work!
870  */
871
872 static int ipv6_get_mtu(struct net_device *dev)
873 {
874         int mtu = IPV6_MIN_MTU;
875         struct inet6_dev *idev;
876
877         idev = in6_dev_get(dev);
878         if (idev) {
879                 mtu = idev->cnf.mtu6;
880                 in6_dev_put(idev);
881         }
882         return mtu;
883 }
884
885 int ipv6_get_hoplimit(struct net_device *dev)
886 {
887         int hoplimit = ipv6_devconf.hop_limit;
888         struct inet6_dev *idev;
889
890         idev = in6_dev_get(dev);
891         if (idev) {
892                 hoplimit = idev->cnf.hop_limit;
893                 in6_dev_put(idev);
894         }
895         return hoplimit;
896 }
897
898 /*
899  *
900  */
901
902 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, 
903                 void *_rtattr, struct netlink_skb_parms *req)
904 {
905         int err;
906         struct rtmsg *r;
907         struct rtattr **rta;
908         struct rt6_info *rt = NULL;
909         struct net_device *dev = NULL;
910         struct inet6_dev *idev = NULL;
911         int addr_type;
912
913         rta = (struct rtattr **) _rtattr;
914
915         if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
916                 return -EINVAL;
917 #ifndef CONFIG_IPV6_SUBTREES
918         if (rtmsg->rtmsg_src_len)
919                 return -EINVAL;
920 #endif
921         if (rtmsg->rtmsg_ifindex) {
922                 err = -ENODEV;
923                 dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
924                 if (!dev)
925                         goto out;
926                 idev = in6_dev_get(dev);
927                 if (!idev)
928                         goto out;
929         }
930
931         if (rtmsg->rtmsg_metric == 0)
932                 rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
933
934         rt = ip6_dst_alloc();
935
936         if (rt == NULL) {
937                 err = -ENOMEM;
938                 goto out;
939         }
940
941         rt->u.dst.obsolete = -1;
942         rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
943         if (nlh && (r = NLMSG_DATA(nlh))) {
944                 rt->rt6i_protocol = r->rtm_protocol;
945         } else {
946                 rt->rt6i_protocol = RTPROT_BOOT;
947         }
948
949         addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
950
951         if (addr_type & IPV6_ADDR_MULTICAST)
952                 rt->u.dst.input = ip6_mc_input;
953         else
954                 rt->u.dst.input = ip6_forward;
955
956         rt->u.dst.output = ip6_output;
957
958         ipv6_addr_prefix(&rt->rt6i_dst.addr, 
959                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
960         rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
961         if (rt->rt6i_dst.plen == 128)
962                rt->u.dst.flags = DST_HOST;
963
964 #ifdef CONFIG_IPV6_SUBTREES
965         ipv6_addr_prefix(&rt->rt6i_src.addr, 
966                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
967         rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
968 #endif
969
970         rt->rt6i_metric = rtmsg->rtmsg_metric;
971
972         /* We cannot add true routes via loopback here,
973            they would result in kernel looping; promote them to reject routes
974          */
975         if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
976             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
977                 /* hold loopback dev/idev if we haven't done so. */
978                 if (dev != &loopback_dev) {
979                         if (dev) {
980                                 dev_put(dev);
981                                 in6_dev_put(idev);
982                         }
983                         dev = &loopback_dev;
984                         dev_hold(dev);
985                         idev = in6_dev_get(dev);
986                         if (!idev) {
987                                 err = -ENODEV;
988                                 goto out;
989                         }
990                 }
991                 rt->u.dst.output = ip6_pkt_discard_out;
992                 rt->u.dst.input = ip6_pkt_discard;
993                 rt->u.dst.error = -ENETUNREACH;
994                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
995                 goto install_route;
996         }
997
998         if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
999                 struct in6_addr *gw_addr;
1000                 int gwa_type;
1001
1002                 gw_addr = &rtmsg->rtmsg_gateway;
1003                 ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
1004                 gwa_type = ipv6_addr_type(gw_addr);
1005
1006                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1007                         struct rt6_info *grt;
1008
1009                         /* IPv6 strictly inhibits using not link-local
1010                            addresses as nexthop address.
1011                            Otherwise, router will not able to send redirects.
1012                            It is very good, but in some (rare!) circumstances
1013                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1014                            some exceptions. --ANK
1015                          */
1016                         err = -EINVAL;
1017                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1018                                 goto out;
1019
1020                         grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
1021
1022                         err = -EHOSTUNREACH;
1023                         if (grt == NULL)
1024                                 goto out;
1025                         if (dev) {
1026                                 if (dev != grt->rt6i_dev) {
1027                                         dst_release(&grt->u.dst);
1028                                         goto out;
1029                                 }
1030                         } else {
1031                                 dev = grt->rt6i_dev;
1032                                 idev = grt->rt6i_idev;
1033                                 dev_hold(dev);
1034                                 in6_dev_hold(grt->rt6i_idev);
1035                         }
1036                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1037                                 err = 0;
1038                         dst_release(&grt->u.dst);
1039
1040                         if (err)
1041                                 goto out;
1042                 }
1043                 err = -EINVAL;
1044                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1045                         goto out;
1046         }
1047
1048         err = -ENODEV;
1049         if (dev == NULL)
1050                 goto out;
1051
1052         if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
1053                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1054                 if (IS_ERR(rt->rt6i_nexthop)) {
1055                         err = PTR_ERR(rt->rt6i_nexthop);
1056                         rt->rt6i_nexthop = NULL;
1057                         goto out;
1058                 }
1059         }
1060
1061         rt->rt6i_flags = rtmsg->rtmsg_flags;
1062
1063 install_route:
1064         if (rta && rta[RTA_METRICS-1]) {
1065                 int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
1066                 struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
1067
1068                 while (RTA_OK(attr, attrlen)) {
1069                         unsigned flavor = attr->rta_type;
1070                         if (flavor) {
1071                                 if (flavor > RTAX_MAX) {
1072                                         err = -EINVAL;
1073                                         goto out;
1074                                 }
1075                                 rt->u.dst.metrics[flavor-1] =
1076                                         *(u32 *)RTA_DATA(attr);
1077                         }
1078                         attr = RTA_NEXT(attr, attrlen);
1079                 }
1080         }
1081
1082         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1083                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1084         if (!rt->u.dst.metrics[RTAX_MTU-1])
1085                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1086         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1087                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1088         rt->u.dst.dev = dev;
1089         rt->rt6i_idev = idev;
1090         return ip6_ins_rt(rt, nlh, _rtattr, req);
1091
1092 out:
1093         if (dev)
1094                 dev_put(dev);
1095         if (idev)
1096                 in6_dev_put(idev);
1097         if (rt)
1098                 dst_free((struct dst_entry *) rt);
1099         return err;
1100 }
1101
1102 int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1103 {
1104         int err;
1105
1106         write_lock_bh(&rt6_lock);
1107
1108         err = fib6_del(rt, nlh, _rtattr, req);
1109         dst_release(&rt->u.dst);
1110
1111         write_unlock_bh(&rt6_lock);
1112
1113         return err;
1114 }
1115
1116 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1117 {
1118         struct fib6_node *fn;
1119         struct rt6_info *rt;
1120         int err = -ESRCH;
1121
1122         read_lock_bh(&rt6_lock);
1123
1124         fn = fib6_locate(&ip6_routing_table,
1125                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1126                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1127         
1128         if (fn) {
1129                 for (rt = fn->leaf; rt; rt = rt->u.next) {
1130                         if (rtmsg->rtmsg_ifindex &&
1131                             (rt->rt6i_dev == NULL ||
1132                              rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1133                                 continue;
1134                         if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1135                             !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1136                                 continue;
1137                         if (rtmsg->rtmsg_metric &&
1138                             rtmsg->rtmsg_metric != rt->rt6i_metric)
1139                                 continue;
1140                         dst_hold(&rt->u.dst);
1141                         read_unlock_bh(&rt6_lock);
1142
1143                         return ip6_del_rt(rt, nlh, _rtattr, req);
1144                 }
1145         }
1146         read_unlock_bh(&rt6_lock);
1147
1148         return err;
1149 }
1150
1151 /*
1152  *      Handle redirects
1153  */
1154 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1155                   struct neighbour *neigh, u8 *lladdr, int on_link)
1156 {
1157         struct rt6_info *rt, *nrt = NULL;
1158         int strict;
1159         struct fib6_node *fn;
1160         struct netevent_redirect netevent;
1161
1162         /*
1163          * Get the "current" route for this destination and
1164          * check if the redirect has come from approriate router.
1165          *
1166          * RFC 2461 specifies that redirects should only be
1167          * accepted if they come from the nexthop to the target.
1168          * Due to the way the routes are chosen, this notion
1169          * is a bit fuzzy and one might need to check all possible
1170          * routes.
1171          */
1172         strict = ipv6_addr_type(dest) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL);
1173
1174         read_lock_bh(&rt6_lock);
1175         fn = fib6_lookup(&ip6_routing_table, dest, NULL);
1176 restart:
1177         for (rt = fn->leaf; rt; rt = rt->u.next) {
1178                 /*
1179                  * Current route is on-link; redirect is always invalid.
1180                  *
1181                  * Seems, previous statement is not true. It could
1182                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1183                  * But then router serving it might decide, that we should
1184                  * know truth 8)8) --ANK (980726).
1185                  */
1186                 if (rt6_check_expired(rt))
1187                         continue;
1188                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1189                         continue;
1190                 if (neigh->dev != rt->rt6i_dev)
1191                         continue;
1192                 if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway))
1193                         continue;
1194                 break;
1195         }
1196         if (rt)
1197                 dst_hold(&rt->u.dst);
1198         else if (strict) {
1199                 while ((fn = fn->parent) != NULL) {
1200                         if (fn->fn_flags & RTN_ROOT)
1201                                 break;
1202                         if (fn->fn_flags & RTN_RTINFO)
1203                                 goto restart;
1204                 }
1205         }
1206         read_unlock_bh(&rt6_lock);
1207
1208         if (!rt) {
1209                 if (net_ratelimit())
1210                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1211                                "for redirect target\n");
1212                 return;
1213         }
1214
1215         /*
1216          *      We have finally decided to accept it.
1217          */
1218
1219         neigh_update(neigh, lladdr, NUD_STALE, 
1220                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1221                      NEIGH_UPDATE_F_OVERRIDE|
1222                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1223                                      NEIGH_UPDATE_F_ISROUTER))
1224                      );
1225
1226         /*
1227          * Redirect received -> path was valid.
1228          * Look, redirects are sent only in response to data packets,
1229          * so that this nexthop apparently is reachable. --ANK
1230          */
1231         dst_confirm(&rt->u.dst);
1232
1233         /* Duplicate redirect: silently ignore. */
1234         if (neigh == rt->u.dst.neighbour)
1235                 goto out;
1236
1237         nrt = ip6_rt_copy(rt);
1238         if (nrt == NULL)
1239                 goto out;
1240
1241         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1242         if (on_link)
1243                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1244
1245         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1246         nrt->rt6i_dst.plen = 128;
1247         nrt->u.dst.flags |= DST_HOST;
1248
1249         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1250         nrt->rt6i_nexthop = neigh_clone(neigh);
1251         /* Reset pmtu, it may be better */
1252         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1253         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1254
1255         if (ip6_ins_rt(nrt, NULL, NULL, NULL))
1256                 goto out;
1257
1258         netevent.old = &rt->u.dst;
1259         netevent.new = &nrt->u.dst;
1260         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1261
1262         if (rt->rt6i_flags&RTF_CACHE) {
1263                 ip6_del_rt(rt, NULL, NULL, NULL);
1264                 return;
1265         }
1266
1267 out:
1268         dst_release(&rt->u.dst);
1269         return;
1270 }
1271
1272 /*
1273  *      Handle ICMP "packet too big" messages
1274  *      i.e. Path MTU discovery
1275  */
1276
1277 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1278                         struct net_device *dev, u32 pmtu)
1279 {
1280         struct rt6_info *rt, *nrt;
1281         int allfrag = 0;
1282
1283         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1284         if (rt == NULL)
1285                 return;
1286
1287         if (pmtu >= dst_mtu(&rt->u.dst))
1288                 goto out;
1289
1290         if (pmtu < IPV6_MIN_MTU) {
1291                 /*
1292                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link 
1293                  * MTU (1280) and a fragment header should always be included
1294                  * after a node receiving Too Big message reporting PMTU is
1295                  * less than the IPv6 Minimum Link MTU.
1296                  */
1297                 pmtu = IPV6_MIN_MTU;
1298                 allfrag = 1;
1299         }
1300
1301         /* New mtu received -> path was valid.
1302            They are sent only in response to data packets,
1303            so that this nexthop apparently is reachable. --ANK
1304          */
1305         dst_confirm(&rt->u.dst);
1306
1307         /* Host route. If it is static, it would be better
1308            not to override it, but add new one, so that
1309            when cache entry will expire old pmtu
1310            would return automatically.
1311          */
1312         if (rt->rt6i_flags & RTF_CACHE) {
1313                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1314                 if (allfrag)
1315                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1316                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1317                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1318                 goto out;
1319         }
1320
1321         /* Network route.
1322            Two cases are possible:
1323            1. It is connected route. Action: COW
1324            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1325          */
1326         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1327                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1328         else
1329                 nrt = rt6_alloc_clone(rt, daddr);
1330
1331         if (nrt) {
1332                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1333                 if (allfrag)
1334                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1335
1336                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1337                  * happened within 5 mins, the recommended timer is 10 mins.
1338                  * Here this route expiration time is set to ip6_rt_mtu_expires
1339                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1340                  * and detecting PMTU increase will be automatically happened.
1341                  */
1342                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1343                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1344
1345                 ip6_ins_rt(nrt, NULL, NULL, NULL);
1346         }
1347 out:
1348         dst_release(&rt->u.dst);
1349 }
1350
1351 /*
1352  *      Misc support functions
1353  */
1354
1355 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1356 {
1357         struct rt6_info *rt = ip6_dst_alloc();
1358
1359         if (rt) {
1360                 rt->u.dst.input = ort->u.dst.input;
1361                 rt->u.dst.output = ort->u.dst.output;
1362
1363                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1364                 rt->u.dst.dev = ort->u.dst.dev;
1365                 if (rt->u.dst.dev)
1366                         dev_hold(rt->u.dst.dev);
1367                 rt->rt6i_idev = ort->rt6i_idev;
1368                 if (rt->rt6i_idev)
1369                         in6_dev_hold(rt->rt6i_idev);
1370                 rt->u.dst.lastuse = jiffies;
1371                 rt->rt6i_expires = 0;
1372
1373                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1374                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1375                 rt->rt6i_metric = 0;
1376
1377                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1378 #ifdef CONFIG_IPV6_SUBTREES
1379                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1380 #endif
1381         }
1382         return rt;
1383 }
1384
1385 #ifdef CONFIG_IPV6_ROUTE_INFO
1386 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1387                                            struct in6_addr *gwaddr, int ifindex)
1388 {
1389         struct fib6_node *fn;
1390         struct rt6_info *rt = NULL;
1391
1392         write_lock_bh(&rt6_lock);
1393         fn = fib6_locate(&ip6_routing_table, prefix ,prefixlen, NULL, 0);
1394         if (!fn)
1395                 goto out;
1396
1397         for (rt = fn->leaf; rt; rt = rt->u.next) {
1398                 if (rt->rt6i_dev->ifindex != ifindex)
1399                         continue;
1400                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1401                         continue;
1402                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1403                         continue;
1404                 dst_hold(&rt->u.dst);
1405                 break;
1406         }
1407 out:
1408         write_unlock_bh(&rt6_lock);
1409         return rt;
1410 }
1411
1412 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1413                                            struct in6_addr *gwaddr, int ifindex,
1414                                            unsigned pref)
1415 {
1416         struct in6_rtmsg rtmsg;
1417
1418         memset(&rtmsg, 0, sizeof(rtmsg));
1419         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1420         ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1421         rtmsg.rtmsg_dst_len = prefixlen;
1422         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1423         rtmsg.rtmsg_metric = 1024;
1424         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
1425         /* We should treat it as a default route if prefix length is 0. */
1426         if (!prefixlen)
1427                 rtmsg.rtmsg_flags |= RTF_DEFAULT;
1428         rtmsg.rtmsg_ifindex = ifindex;
1429
1430         ip6_route_add(&rtmsg, NULL, NULL, NULL);
1431
1432         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1433 }
1434 #endif
1435
1436 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1437 {       
1438         struct rt6_info *rt;
1439         struct fib6_node *fn;
1440
1441         fn = &ip6_routing_table;
1442
1443         write_lock_bh(&rt6_lock);
1444         for (rt = fn->leaf; rt; rt=rt->u.next) {
1445                 if (dev == rt->rt6i_dev &&
1446                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1447                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1448                         break;
1449         }
1450         if (rt)
1451                 dst_hold(&rt->u.dst);
1452         write_unlock_bh(&rt6_lock);
1453         return rt;
1454 }
1455
1456 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1457                                      struct net_device *dev,
1458                                      unsigned int pref)
1459 {
1460         struct in6_rtmsg rtmsg;
1461
1462         memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1463         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1464         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1465         rtmsg.rtmsg_metric = 1024;
1466         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1467                             RTF_PREF(pref);
1468
1469         rtmsg.rtmsg_ifindex = dev->ifindex;
1470
1471         ip6_route_add(&rtmsg, NULL, NULL, NULL);
1472         return rt6_get_dflt_router(gwaddr, dev);
1473 }
1474
1475 void rt6_purge_dflt_routers(void)
1476 {
1477         struct rt6_info *rt;
1478
1479 restart:
1480         read_lock_bh(&rt6_lock);
1481         for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1482                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1483                         dst_hold(&rt->u.dst);
1484
1485                         read_unlock_bh(&rt6_lock);
1486
1487                         ip6_del_rt(rt, NULL, NULL, NULL);
1488
1489                         goto restart;
1490                 }
1491         }
1492         read_unlock_bh(&rt6_lock);
1493 }
1494
1495 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1496 {
1497         struct in6_rtmsg rtmsg;
1498         int err;
1499
1500         switch(cmd) {
1501         case SIOCADDRT:         /* Add a route */
1502         case SIOCDELRT:         /* Delete a route */
1503                 if (!capable(CAP_NET_ADMIN))
1504                         return -EPERM;
1505                 err = copy_from_user(&rtmsg, arg,
1506                                      sizeof(struct in6_rtmsg));
1507                 if (err)
1508                         return -EFAULT;
1509                         
1510                 rtnl_lock();
1511                 switch (cmd) {
1512                 case SIOCADDRT:
1513                         err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
1514                         break;
1515                 case SIOCDELRT:
1516                         err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
1517                         break;
1518                 default:
1519                         err = -EINVAL;
1520                 }
1521                 rtnl_unlock();
1522
1523                 return err;
1524         };
1525
1526         return -EINVAL;
1527 }
1528
1529 /*
1530  *      Drop the packet on the floor
1531  */
1532
1533 static int ip6_pkt_discard(struct sk_buff *skb)
1534 {
1535         IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1536         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1537         kfree_skb(skb);
1538         return 0;
1539 }
1540
1541 static int ip6_pkt_discard_out(struct sk_buff *skb)
1542 {
1543         skb->dev = skb->dst->dev;
1544         return ip6_pkt_discard(skb);
1545 }
1546
1547 /*
1548  *      Allocate a dst for local (unicast / anycast) address.
1549  */
1550
1551 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1552                                     const struct in6_addr *addr,
1553                                     int anycast)
1554 {
1555         struct rt6_info *rt = ip6_dst_alloc();
1556
1557         if (rt == NULL)
1558                 return ERR_PTR(-ENOMEM);
1559
1560         dev_hold(&loopback_dev);
1561         in6_dev_hold(idev);
1562
1563         rt->u.dst.flags = DST_HOST;
1564         rt->u.dst.input = ip6_input;
1565         rt->u.dst.output = ip6_output;
1566         rt->rt6i_dev = &loopback_dev;
1567         rt->rt6i_idev = idev;
1568         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1569         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1570         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1571         rt->u.dst.obsolete = -1;
1572
1573         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1574         if (anycast)
1575                 rt->rt6i_flags |= RTF_ANYCAST;
1576         else
1577                 rt->rt6i_flags |= RTF_LOCAL;
1578         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1579         if (rt->rt6i_nexthop == NULL) {
1580                 dst_free((struct dst_entry *) rt);
1581                 return ERR_PTR(-ENOMEM);
1582         }
1583
1584         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1585         rt->rt6i_dst.plen = 128;
1586
1587         atomic_set(&rt->u.dst.__refcnt, 1);
1588
1589         return rt;
1590 }
1591
1592 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1593 {
1594         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1595             rt != &ip6_null_entry) {
1596                 RT6_TRACE("deleted by ifdown %p\n", rt);
1597                 return -1;
1598         }
1599         return 0;
1600 }
1601
1602 void rt6_ifdown(struct net_device *dev)
1603 {
1604         write_lock_bh(&rt6_lock);
1605         fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1606         write_unlock_bh(&rt6_lock);
1607 }
1608
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* the new MTU */
};
1614
1615 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1616 {
1617         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1618         struct inet6_dev *idev;
1619
1620         /* In IPv6 pmtu discovery is not optional,
1621            so that RTAX_MTU lock cannot disable it.
1622            We still use this lock to block changes
1623            caused by addrconf/ndisc.
1624         */
1625
1626         idev = __in6_dev_get(arg->dev);
1627         if (idev == NULL)
1628                 return 0;
1629
1630         /* For administrative MTU increase, there is no way to discover
1631            IPv6 PMTU increase, so PMTU increase should be updated here.
1632            Since RFC 1981 doesn't include administrative MTU increase
1633            update PMTU increase is a MUST. (i.e. jumbo frame)
1634          */
1635         /*
1636            If new MTU is less than route PMTU, this new MTU will be the
1637            lowest MTU in the path, update the route PMTU to reflect PMTU
1638            decreases; if new MTU is greater than route PMTU, and the
1639            old MTU is the lowest MTU in the path, update the route PMTU
1640            to reflect the increase. In this case if the other nodes' MTU
1641            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1642            PMTU discouvery.
1643          */
1644         if (rt->rt6i_dev == arg->dev &&
1645             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1646             (dst_mtu(&rt->u.dst) > arg->mtu ||
1647              (dst_mtu(&rt->u.dst) < arg->mtu &&
1648               dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1649                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1650         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1651         return 0;
1652 }
1653
1654 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1655 {
1656         struct rt6_mtu_change_arg arg;
1657
1658         arg.dev = dev;
1659         arg.mtu = mtu;
1660         read_lock_bh(&rt6_lock);
1661         fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1662         read_unlock_bh(&rt6_lock);
1663 }
1664
1665 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1666                               struct in6_rtmsg *rtmsg)
1667 {
1668         memset(rtmsg, 0, sizeof(*rtmsg));
1669
1670         rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1671         rtmsg->rtmsg_src_len = r->rtm_src_len;
1672         rtmsg->rtmsg_flags = RTF_UP;
1673         if (r->rtm_type == RTN_UNREACHABLE)
1674                 rtmsg->rtmsg_flags |= RTF_REJECT;
1675
1676         if (rta[RTA_GATEWAY-1]) {
1677                 if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1678                         return -EINVAL;
1679                 memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1680                 rtmsg->rtmsg_flags |= RTF_GATEWAY;
1681         }
1682         if (rta[RTA_DST-1]) {
1683                 if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1684                         return -EINVAL;
1685                 memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1686         }
1687         if (rta[RTA_SRC-1]) {
1688                 if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1689                         return -EINVAL;
1690                 memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1691         }
1692         if (rta[RTA_OIF-1]) {
1693                 if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1694                         return -EINVAL;
1695                 memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1696         }
1697         if (rta[RTA_PRIORITY-1]) {
1698                 if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1699                         return -EINVAL;
1700                 memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1701         }
1702         return 0;
1703 }
1704
1705 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1706 {
1707         struct rtmsg *r = NLMSG_DATA(nlh);
1708         struct in6_rtmsg rtmsg;
1709
1710         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1711                 return -EINVAL;
1712         return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1713 }
1714
1715 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1716 {
1717         struct rtmsg *r = NLMSG_DATA(nlh);
1718         struct in6_rtmsg rtmsg;
1719
1720         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1721                 return -EINVAL;
1722         return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1723 }
1724
/* Pairs an sk_buff with a netlink callback; named for rtnetlink route dumps. */
struct rt6_rtnl_dump_arg {
	struct sk_buff *skb;
	struct netlink_callback *cb;
};
1730
1731 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1732                          struct in6_addr *dst, struct in6_addr *src,
1733                          int iif, int type, u32 pid, u32 seq,
1734                          int prefix, unsigned int flags)
1735 {
1736         struct rtmsg *rtm;
1737         struct nlmsghdr  *nlh;
1738         unsigned char    *b = skb->tail;
1739         struct rta_cacheinfo ci;
1740
1741         if (prefix) {   /* user wants prefix routes only */
1742                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1743                         /* success since this is not a prefix route */
1744                         return 1;
1745                 }
1746         }
1747
1748         nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1749         rtm = NLMSG_DATA(nlh);
1750         rtm->rtm_family = AF_INET6;
1751         rtm->rtm_dst_len = rt->rt6i_dst.plen;
1752         rtm->rtm_src_len = rt->rt6i_src.plen;
1753         rtm->rtm_tos = 0;
1754         rtm->rtm_table = RT_TABLE_MAIN;
1755         if (rt->rt6i_flags&RTF_REJECT)
1756                 rtm->rtm_type = RTN_UNREACHABLE;
1757         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1758                 rtm->rtm_type = RTN_LOCAL;
1759         else
1760                 rtm->rtm_type = RTN_UNICAST;
1761         rtm->rtm_flags = 0;
1762         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1763         rtm->rtm_protocol = rt->rt6i_protocol;
1764         if (rt->rt6i_flags&RTF_DYNAMIC)
1765                 rtm->rtm_protocol = RTPROT_REDIRECT;
1766         else if (rt->rt6i_flags & RTF_ADDRCONF)
1767                 rtm->rtm_protocol = RTPROT_KERNEL;
1768         else if (rt->rt6i_flags&RTF_DEFAULT)
1769                 rtm->rtm_protocol = RTPROT_RA;
1770
1771         if (rt->rt6i_flags&RTF_CACHE)
1772                 rtm->rtm_flags |= RTM_F_CLONED;
1773
1774         if (dst) {
1775                 RTA_PUT(skb, RTA_DST, 16, dst);
1776                 rtm->rtm_dst_len = 128;
1777         } else if (rtm->rtm_dst_len)
1778                 RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1779 #ifdef CONFIG_IPV6_SUBTREES
1780         if (src) {
1781                 RTA_PUT(skb, RTA_SRC, 16, src);
1782                 rtm->rtm_src_len = 128;
1783         } else if (rtm->rtm_src_len)
1784                 RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1785 #endif
1786         if (iif)
1787                 RTA_PUT(skb, RTA_IIF, 4, &iif);
1788         else if (dst) {
1789                 struct in6_addr saddr_buf;
1790                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1791                         RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1792         }
1793         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1794                 goto rtattr_failure;
1795         if (rt->u.dst.neighbour)
1796                 RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1797         if (rt->u.dst.dev)
1798                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1799         RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1800         ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1801         if (rt->rt6i_expires)
1802                 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1803         else
1804                 ci.rta_expires = 0;
1805         ci.rta_used = rt->u.dst.__use;
1806         ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1807         ci.rta_error = rt->u.dst.error;
1808         ci.rta_id = 0;
1809         ci.rta_ts = 0;
1810         ci.rta_tsage = 0;
1811         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1812         nlh->nlmsg_len = skb->tail - b;
1813         return skb->len;
1814
1815 nlmsg_failure:
1816 rtattr_failure:
1817         skb_trim(skb, b - skb->data);
1818         return -1;
1819 }
1820
1821 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1822 {
1823         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1824         int prefix;
1825
1826         if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1827                 struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1828                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1829         } else
1830                 prefix = 0;
1831
1832         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1833                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1834                      prefix, NLM_F_MULTI);
1835 }
1836
1837 static int fib6_dump_node(struct fib6_walker_t *w)
1838 {
1839         int res;
1840         struct rt6_info *rt;
1841
1842         for (rt = w->leaf; rt; rt = rt->u.next) {
1843                 res = rt6_dump_route(rt, w->args);
1844                 if (res < 0) {
1845                         /* Frame is full, suspend walking */
1846                         w->leaf = rt;
1847                         return 1;
1848                 }
1849                 BUG_TRAP(res!=0);
1850         }
1851         w->leaf = NULL;
1852         return 0;
1853 }
1854
1855 static void fib6_dump_end(struct netlink_callback *cb)
1856 {
1857         struct fib6_walker_t *w = (void*)cb->args[0];
1858
1859         if (w) {
1860                 cb->args[0] = 0;
1861                 fib6_walker_unlink(w);
1862                 kfree(w);
1863         }
1864         cb->done = (void*)cb->args[1];
1865         cb->args[1] = 0;
1866 }
1867
1868 static int fib6_dump_done(struct netlink_callback *cb)
1869 {
1870         fib6_dump_end(cb);
1871         return cb->done ? cb->done(cb) : 0;
1872 }
1873
1874 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1875 {
1876         struct rt6_rtnl_dump_arg arg;
1877         struct fib6_walker_t *w;
1878         int res;
1879
1880         arg.skb = skb;
1881         arg.cb = cb;
1882
1883         w = (void*)cb->args[0];
1884         if (w == NULL) {
1885                 /* New dump:
1886                  * 
1887                  * 1. hook callback destructor.
1888                  */
1889                 cb->args[1] = (long)cb->done;
1890                 cb->done = fib6_dump_done;
1891
1892                 /*
1893                  * 2. allocate and initialize walker.
1894                  */
1895                 w = kzalloc(sizeof(*w), GFP_ATOMIC);
1896                 if (w == NULL)
1897                         return -ENOMEM;
1898                 RT6_TRACE("dump<%p", w);
1899                 w->root = &ip6_routing_table;
1900                 w->func = fib6_dump_node;
1901                 w->args = &arg;
1902                 cb->args[0] = (long)w;
1903                 read_lock_bh(&rt6_lock);
1904                 res = fib6_walk(w);
1905                 read_unlock_bh(&rt6_lock);
1906         } else {
1907                 w->args = &arg;
1908                 read_lock_bh(&rt6_lock);
1909                 res = fib6_walk_continue(w);
1910                 read_unlock_bh(&rt6_lock);
1911         }
1912 #if RT6_DEBUG >= 3
1913         if (res <= 0 && skb->len == 0)
1914                 RT6_TRACE("%p>dump end\n", w);
1915 #endif
1916         res = res < 0 ? res : skb->len;
1917         /* res < 0 is an error. (really, impossible)
1918            res == 0 means that dump is complete, but skb still can contain data.
1919            res > 0 dump is not complete, but frame is full.
1920          */
1921         /* Destroy walker, if dump of this table is complete. */
1922         if (res <= 0)
1923                 fib6_dump_end(cb);
1924         return res;
1925 }
1926
1927 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1928 {
1929         struct rtattr **rta = arg;
1930         int iif = 0;
1931         int err = -ENOBUFS;
1932         struct sk_buff *skb;
1933         struct flowi fl;
1934         struct rt6_info *rt;
1935
1936         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1937         if (skb == NULL)
1938                 goto out;
1939
1940         /* Reserve room for dummy headers, this skb can pass
1941            through good chunk of routing engine.
1942          */
1943         skb->mac.raw = skb->data;
1944         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1945
1946         memset(&fl, 0, sizeof(fl));
1947         if (rta[RTA_SRC-1])
1948                 ipv6_addr_copy(&fl.fl6_src,
1949                                (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1950         if (rta[RTA_DST-1])
1951                 ipv6_addr_copy(&fl.fl6_dst,
1952                                (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1953
1954         if (rta[RTA_IIF-1])
1955                 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1956
1957         if (iif) {
1958                 struct net_device *dev;
1959                 dev = __dev_get_by_index(iif);
1960                 if (!dev) {
1961                         err = -ENODEV;
1962                         goto out_free;
1963                 }
1964         }
1965
1966         fl.oif = 0;
1967         if (rta[RTA_OIF-1])
1968                 memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1969
1970         rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1971
1972         skb->dst = &rt->u.dst;
1973
1974         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1975         err = rt6_fill_node(skb, rt, 
1976                             &fl.fl6_dst, &fl.fl6_src,
1977                             iif,
1978                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
1979                             nlh->nlmsg_seq, 0, 0);
1980         if (err < 0) {
1981                 err = -EMSGSIZE;
1982                 goto out_free;
1983         }
1984
1985         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1986         if (err > 0)
1987                 err = 0;
1988 out:
1989         return err;
1990 out_free:
1991         kfree_skb(skb);
1992         goto out;       
1993 }
1994
1995 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh, 
1996                         struct netlink_skb_parms *req)
1997 {
1998         struct sk_buff *skb;
1999         int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
2000         u32 pid = current->pid;
2001         u32 seq = 0;
2002
2003         if (req)
2004                 pid = req->pid;
2005         if (nlh)
2006                 seq = nlh->nlmsg_seq;
2007         
2008         skb = alloc_skb(size, gfp_any());
2009         if (!skb) {
2010                 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
2011                 return;
2012         }
2013         if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
2014                 kfree_skb(skb);
2015                 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
2016                 return;
2017         }
2018         NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
2019         netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
2020 }
2021
2022 /*
2023  *      /proc
2024  */
2025
2026 #ifdef CONFIG_PROC_FS
2027
/*
 * Nominal length in bytes of one record in /proc/net/ipv6_route.
 * rt6_info_route() and rt6_proc_info() use it to translate a byte offset
 * into a number of records to skip plus an offset inside a record.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* Bookkeeping shared between rt6_proc_info() and rt6_info_route(). */
struct rt6_proc_arg {
	char *buffer;	/* output buffer */
	int offset;	/* byte offset requested by the reader */
	int length;	/* number of bytes requested by the reader */
	int skip;	/* records skipped so far */
	int len;	/* bytes written to buffer so far */
};
2038
2039 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2040 {
2041         struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2042         int i;
2043
2044         if (arg->skip < arg->offset / RT6_INFO_LEN) {
2045                 arg->skip++;
2046                 return 0;
2047         }
2048
2049         if (arg->len >= arg->length)
2050                 return 0;
2051
2052         for (i=0; i<16; i++) {
2053                 sprintf(arg->buffer + arg->len, "%02x",
2054                         rt->rt6i_dst.addr.s6_addr[i]);
2055                 arg->len += 2;
2056         }
2057         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2058                             rt->rt6i_dst.plen);
2059
2060 #ifdef CONFIG_IPV6_SUBTREES
2061         for (i=0; i<16; i++) {
2062                 sprintf(arg->buffer + arg->len, "%02x",
2063                         rt->rt6i_src.addr.s6_addr[i]);
2064                 arg->len += 2;
2065         }
2066         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2067                             rt->rt6i_src.plen);
2068 #else
2069         sprintf(arg->buffer + arg->len,
2070                 "00000000000000000000000000000000 00 ");
2071         arg->len += 36;
2072 #endif
2073
2074         if (rt->rt6i_nexthop) {
2075                 for (i=0; i<16; i++) {
2076                         sprintf(arg->buffer + arg->len, "%02x",
2077                                 rt->rt6i_nexthop->primary_key[i]);
2078                         arg->len += 2;
2079                 }
2080         } else {
2081                 sprintf(arg->buffer + arg->len,
2082                         "00000000000000000000000000000000");
2083                 arg->len += 32;
2084         }
2085         arg->len += sprintf(arg->buffer + arg->len,
2086                             " %08x %08x %08x %08x %8s\n",
2087                             rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2088                             rt->u.dst.__use, rt->rt6i_flags, 
2089                             rt->rt6i_dev ? rt->rt6i_dev->name : "");
2090         return 0;
2091 }
2092
2093 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2094 {
2095         struct rt6_proc_arg arg;
2096         arg.buffer = buffer;
2097         arg.offset = offset;
2098         arg.length = length;
2099         arg.skip = 0;
2100         arg.len = 0;
2101
2102         read_lock_bh(&rt6_lock);
2103         fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
2104         read_unlock_bh(&rt6_lock);
2105
2106         *start = buffer;
2107         if (offset)
2108                 *start += offset % RT6_INFO_LEN;
2109
2110         arg.len -= offset % RT6_INFO_LEN;
2111
2112         if (arg.len > length)
2113                 arg.len = length;
2114         if (arg.len < 0)
2115                 arg.len = 0;
2116
2117         return arg.len;
2118 }
2119
2120 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2121 {
2122         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2123                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2124                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2125                       rt6_stats.fib_rt_cache,
2126                       atomic_read(&ip6_dst_ops.entries),
2127                       rt6_stats.fib_discarded_routes);
2128
2129         return 0;
2130 }
2131
2132 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2133 {
2134         return single_open(file, rt6_stats_seq_show, NULL);
2135 }
2136
2137 static struct file_operations rt6_stats_seq_fops = {
2138         .owner   = THIS_MODULE,
2139         .open    = rt6_stats_seq_open,
2140         .read    = seq_read,
2141         .llseek  = seq_lseek,
2142         .release = single_release,
2143 };
2144 #endif  /* CONFIG_PROC_FS */
2145
2146 #ifdef CONFIG_SYSCTL
2147
2148 static int flush_delay;
2149
2150 static
2151 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2152                               void __user *buffer, size_t *lenp, loff_t *ppos)
2153 {
2154         if (write) {
2155                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2156                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2157                 return 0;
2158         } else
2159                 return -EINVAL;
2160 }
2161
2162 ctl_table ipv6_route_table[] = {
2163         {
2164                 .ctl_name       =       NET_IPV6_ROUTE_FLUSH, 
2165                 .procname       =       "flush",
2166                 .data           =       &flush_delay,
2167                 .maxlen         =       sizeof(int),
2168                 .mode           =       0200,
2169                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2170         },
2171         {
2172                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2173                 .procname       =       "gc_thresh",
2174                 .data           =       &ip6_dst_ops.gc_thresh,
2175                 .maxlen         =       sizeof(int),
2176                 .mode           =       0644,
2177                 .proc_handler   =       &proc_dointvec,
2178         },
2179         {
2180                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2181                 .procname       =       "max_size",
2182                 .data           =       &ip6_rt_max_size,
2183                 .maxlen         =       sizeof(int),
2184                 .mode           =       0644,
2185                 .proc_handler   =       &proc_dointvec,
2186         },
2187         {
2188                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2189                 .procname       =       "gc_min_interval",
2190                 .data           =       &ip6_rt_gc_min_interval,
2191                 .maxlen         =       sizeof(int),
2192                 .mode           =       0644,
2193                 .proc_handler   =       &proc_dointvec_jiffies,
2194                 .strategy       =       &sysctl_jiffies,
2195         },
2196         {
2197                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2198                 .procname       =       "gc_timeout",
2199                 .data           =       &ip6_rt_gc_timeout,
2200                 .maxlen         =       sizeof(int),
2201                 .mode           =       0644,
2202                 .proc_handler   =       &proc_dointvec_jiffies,
2203                 .strategy       =       &sysctl_jiffies,
2204         },
2205         {
2206                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2207                 .procname       =       "gc_interval",
2208                 .data           =       &ip6_rt_gc_interval,
2209                 .maxlen         =       sizeof(int),
2210                 .mode           =       0644,
2211                 .proc_handler   =       &proc_dointvec_jiffies,
2212                 .strategy       =       &sysctl_jiffies,
2213         },
2214         {
2215                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2216                 .procname       =       "gc_elasticity",
2217                 .data           =       &ip6_rt_gc_elasticity,
2218                 .maxlen         =       sizeof(int),
2219                 .mode           =       0644,
2220                 .proc_handler   =       &proc_dointvec_jiffies,
2221                 .strategy       =       &sysctl_jiffies,
2222         },
2223         {
2224                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2225                 .procname       =       "mtu_expires",
2226                 .data           =       &ip6_rt_mtu_expires,
2227                 .maxlen         =       sizeof(int),
2228                 .mode           =       0644,
2229                 .proc_handler   =       &proc_dointvec_jiffies,
2230                 .strategy       =       &sysctl_jiffies,
2231         },
2232         {
2233                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2234                 .procname       =       "min_adv_mss",
2235                 .data           =       &ip6_rt_min_advmss,
2236                 .maxlen         =       sizeof(int),
2237                 .mode           =       0644,
2238                 .proc_handler   =       &proc_dointvec_jiffies,
2239                 .strategy       =       &sysctl_jiffies,
2240         },
2241         {
2242                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2243                 .procname       =       "gc_min_interval_ms",
2244                 .data           =       &ip6_rt_gc_min_interval,
2245                 .maxlen         =       sizeof(int),
2246                 .mode           =       0644,
2247                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2248                 .strategy       =       &sysctl_ms_jiffies,
2249         },
2250         { .ctl_name = 0 }
2251 };
2252
2253 #endif
2254
2255 void __init ip6_route_init(void)
2256 {
2257         struct proc_dir_entry *p;
2258
2259         ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2260                                                      sizeof(struct rt6_info),
2261                                                      0, SLAB_HWCACHE_ALIGN,
2262                                                      NULL, NULL);
2263         if (!ip6_dst_ops.kmem_cachep)
2264                 panic("cannot create ip6_dst_cache");
2265
2266         fib6_init();
2267 #ifdef  CONFIG_PROC_FS
2268         p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2269         if (p)
2270                 p->owner = THIS_MODULE;
2271
2272         proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2273 #endif
2274 #ifdef CONFIG_XFRM
2275         xfrm6_init();
2276 #endif
2277 }
2278
2279 void ip6_route_cleanup(void)
2280 {
2281 #ifdef CONFIG_PROC_FS
2282         proc_net_remove("ipv6_route");
2283         proc_net_remove("rt6_stats");
2284 #endif
2285 #ifdef CONFIG_XFRM
2286         xfrm6_fini();
2287 #endif
2288         rt6_ifdown(NULL);
2289         fib6_gc_cleanup();
2290         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2291 }