]> bbs.cooldavid.org Git - net-next-2.6.git/blob - net/ipv6/route.c
ce1f49b595b078c4ff3746906c29e9fbdb5efd31
[net-next-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/init.h>
38 #include <linux/netlink.h>
39 #include <linux/if_arp.h>
40
41 #ifdef  CONFIG_PROC_FS
42 #include <linux/proc_fs.h>
43 #include <linux/seq_file.h>
44 #endif
45
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Compile-time debug level for this file.  At level 3 and above the
 * RDBG()/RT6_TRACE() macros below expand to printk() calls; at lower
 * levels they expand to nothing.
 */
64 /* Set to 3 to get tracing. */
65 #define RT6_DEBUG 2
66
67 #if RT6_DEBUG >= 3
/* RDBG takes a fully parenthesised printk argument list: RDBG(("fmt", a)). */
68 #define RDBG(x) printk x
/* RT6_TRACE takes a format string literal followed by its arguments. */
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else
71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif
74
/*
 * When 0, ip6_route_input()/ip6_route_output() return the selected route
 * directly instead of calling rt6_alloc_clone() for routes that already
 * have a nexthop (or are RTF_NONEXTHOP).
 */
75 #define CLONE_OFFLINK_ROUTE 0
76
/*
 * Flags for the "strict" argument of rt6_score_route()/rt6_select():
 *   RT6_SELECT_F_IFACE     - reject routes whose device does not match oif
 *   RT6_SELECT_F_REACHABLE - reject routes for which rt6_check_neigh()
 *                            returns 0
 */
77 #define RT6_SELECT_F_IFACE      0x1
78 #define RT6_SELECT_F_REACHABLE  0x2
79
/*
 * Tunables for the IPv6 routing cache.  Time values are in jiffies.
 */
/* ip6_dst_gc() reports "still over limit" while entries exceed this. */
80 static int ip6_rt_max_size = 4096;
/* ip6_dst_gc() skips a run if the previous one was more recent than this
 * and the cache is not above ip6_rt_max_size. */
81 static int ip6_rt_gc_min_interval = HZ / 2;
/* Half of this value is used to reset the gc expiry in ip6_dst_gc() once
 * the entry count drops below gc_thresh. */
82 static int ip6_rt_gc_timeout = 60*HZ;
/* Not static; not referenced in this part of the file - presumably used by
 * the fib gc timer elsewhere (TODO confirm). */
83 int ip6_rt_gc_interval = 30*HZ;
/* Shift count used by ip6_dst_gc() to decay its expiry on every call. */
84 static int ip6_rt_gc_elasticity = 9;
/* Not referenced in this part of the file - presumably the lifetime of
 * learned path-MTU entries (TODO confirm). */
85 static int ip6_rt_mtu_expires = 10*60*HZ;
/* Lower bound applied by ipv6_advmss() to the advertised MSS. */
86 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87
/*
 * Forward declarations for helpers and dst_ops callbacks that are
 * referenced (ip6_dst_ops, ip6_null_entry) before their definitions.
 */
88 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(void);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct sk_buff *skb);
98 static void             ip6_link_failure(struct sk_buff *skb);
99 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100
/* Helpers used by rt6_route_rcv(); only built with CONFIG_IPV6_ROUTE_INFO. */
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103                                            struct in6_addr *gwaddr, int ifindex,
104                                            unsigned pref);
105 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106                                            struct in6_addr *gwaddr, int ifindex);
107 #endif
108
/*
 * dst_ops table for IPv6 routes: wires the callbacks defined in this file
 * into the generic destination cache and tells it that each entry is a
 * struct rt6_info.  ip6_dst_alloc() allocates from this table.
 */
109 static struct dst_ops ip6_dst_ops = {
110         .family                 =       AF_INET6,
111         .protocol               =       __constant_htons(ETH_P_IPV6),
112         .gc                     =       ip6_dst_gc,
113         .gc_thresh              =       1024,
114         .check                  =       ip6_dst_check,
115         .destroy                =       ip6_dst_destroy,
116         .ifdown                 =       ip6_dst_ifdown,
117         .negative_advice        =       ip6_negative_advice,
118         .link_failure           =       ip6_link_failure,
119         .update_pmtu            =       ip6_rt_update_pmtu,
120         .entry_size             =       sizeof(struct rt6_info),
121 };
122
/*
 * Sentinel "no route" entry.  Lookup helpers in this file return a pointer
 * to it instead of NULL when nothing matches.  It is a reject route bound
 * to the loopback device whose dst error is -ENETUNREACH and whose
 * input/output handlers are the packet-discard functions.  It starts with
 * one reference on both the dst and the fib refcount.
 */
123 struct rt6_info ip6_null_entry = {
124         .u = {
125                 .dst = {
126                         .__refcnt       = ATOMIC_INIT(1),
127                         .__use          = 1,
128                         .dev            = &loopback_dev,
129                         .obsolete       = -1,
130                         .error          = -ENETUNREACH,
131                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
132                         .input          = ip6_pkt_discard,
133                         .output         = ip6_pkt_discard_out,
134                         .ops            = &ip6_dst_ops,
135                         .path           = (struct dst_entry*)&ip6_null_entry,
136                 }
137         },
138         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
139         .rt6i_metric    = ~(u32) 0,
140         .rt6i_ref       = ATOMIC_INIT(1),
141 };
142
/*
 * Root node of the IPv6 FIB tree.  Its leaf is the null entry, so a lookup
 * that ends at the root yields ip6_null_entry.
 */
143 struct fib6_node ip6_routing_table = {
144         .leaf           = &ip6_null_entry,
145         .fn_flags       = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
146 };
147
/*
 * In this file, lookups take rt6_lock for reading and ip6_ins_rt() takes
 * it for writing, always with bottom halves disabled (the _bh variants).
 */
148 /* Protects all the ip6 fib */
149
150 DEFINE_RWLOCK(rt6_lock);
151
152
153 /* allocate dst with ip6_dst_ops */
154 static __inline__ struct rt6_info *ip6_dst_alloc(void)
155 {
156         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
157 }
158
159 static void ip6_dst_destroy(struct dst_entry *dst)
160 {
161         struct rt6_info *rt = (struct rt6_info *)dst;
162         struct inet6_dev *idev = rt->rt6i_idev;
163
164         if (idev != NULL) {
165                 rt->rt6i_idev = NULL;
166                 in6_dev_put(idev);
167         }       
168 }
169
170 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
171                            int how)
172 {
173         struct rt6_info *rt = (struct rt6_info *)dst;
174         struct inet6_dev *idev = rt->rt6i_idev;
175
176         if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
177                 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
178                 if (loopback_idev != NULL) {
179                         rt->rt6i_idev = loopback_idev;
180                         in6_dev_put(idev);
181                 }
182         }
183 }
184
185 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
186 {
187         return (rt->rt6i_flags & RTF_EXPIRES &&
188                 time_after(jiffies, rt->rt6i_expires));
189 }
190
191 /*
192  *      Route lookup. Any rt6_lock is implied.
193  */
194
195 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
196                                                     int oif,
197                                                     int strict)
198 {
199         struct rt6_info *local = NULL;
200         struct rt6_info *sprt;
201
202         if (oif) {
203                 for (sprt = rt; sprt; sprt = sprt->u.next) {
204                         struct net_device *dev = sprt->rt6i_dev;
205                         if (dev->ifindex == oif)
206                                 return sprt;
207                         if (dev->flags & IFF_LOOPBACK) {
208                                 if (sprt->rt6i_idev == NULL ||
209                                     sprt->rt6i_idev->dev->ifindex != oif) {
210                                         if (strict && oif)
211                                                 continue;
212                                         if (local && (!oif || 
213                                                       local->rt6i_idev->dev->ifindex == oif))
214                                                 continue;
215                                 }
216                                 local = sprt;
217                         }
218                 }
219
220                 if (local)
221                         return local;
222
223                 if (strict)
224                         return &ip6_null_entry;
225         }
226         return rt;
227 }
228
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the nexthop neighbour of @rt is not in a
 * NUD_VALID state and the last update is older than the interface's
 * rtr_probe_interval, send a Neighbour Solicitation to the router's
 * solicited-node multicast address.
 *
 * @rt may be NULL, in which case nothing happens.
 *
 * The rate-limit check and the update of neigh->updated are performed
 * under the *write* side of neigh->lock: the field is modified here, and
 * doing the test-and-set atomically keeps two concurrent callers from both
 * passing the check and both sending a probe.  The lock is dropped before
 * the solicitation is transmitted.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	write_lock_bh(&neigh->lock);
	if ((neigh->nud_state & NUD_VALID) ||
	    !time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		/* became valid meanwhile, or probed too recently */
		write_unlock_bh(&neigh->lock);
		return;
	}
	neigh->updated = jiffies;
	write_unlock_bh(&neigh->lock);

	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Router preference support disabled: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
	return;
}
#endif
264
265 /*
266  * Default Router Selection (RFC 2461 6.3.6)
267  */
268 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
269 {
270         struct net_device *dev = rt->rt6i_dev;
271         if (!oif || dev->ifindex == oif)
272                 return 2;
273         if ((dev->flags & IFF_LOOPBACK) &&
274             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
275                 return 1;
276         return 0;
277 }
278
279 static int inline rt6_check_neigh(struct rt6_info *rt)
280 {
281         struct neighbour *neigh = rt->rt6i_nexthop;
282         int m = 0;
283         if (rt->rt6i_flags & RTF_NONEXTHOP ||
284             !(rt->rt6i_flags & RTF_GATEWAY))
285                 m = 1;
286         else if (neigh) {
287                 read_lock_bh(&neigh->lock);
288                 if (neigh->nud_state & NUD_VALID)
289                         m = 2;
290                 read_unlock_bh(&neigh->lock);
291         }
292         return m;
293 }
294
295 static int rt6_score_route(struct rt6_info *rt, int oif,
296                            int strict)
297 {
298         int m, n;
299                 
300         m = rt6_check_dev(rt, oif);
301         if (!m && (strict & RT6_SELECT_F_IFACE))
302                 return -1;
303 #ifdef CONFIG_IPV6_ROUTER_PREF
304         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
305 #endif
306         n = rt6_check_neigh(rt);
307         if (n > 1)
308                 m |= 16;
309         else if (!n && strict & RT6_SELECT_F_REACHABLE)
310                 return -1;
311         return m;
312 }
313
314 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
315                                    int strict)
316 {
317         struct rt6_info *match = NULL, *last = NULL;
318         struct rt6_info *rt, *rt0 = *head;
319         u32 metric;
320         int mpri = -1;
321
322         RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
323                   __FUNCTION__, head, head ? *head : NULL, oif);
324
325         for (rt = rt0, metric = rt0->rt6i_metric;
326              rt && rt->rt6i_metric == metric && (!last || rt != rt0);
327              rt = rt->u.next) {
328                 int m;
329
330                 if (rt6_check_expired(rt))
331                         continue;
332
333                 last = rt;
334
335                 m = rt6_score_route(rt, oif, strict);
336                 if (m < 0)
337                         continue;
338
339                 if (m > mpri) {
340                         rt6_probe(match);
341                         match = rt;
342                         mpri = m;
343                 } else {
344                         rt6_probe(rt);
345                 }
346         }
347
348         if (!match &&
349             (strict & RT6_SELECT_F_REACHABLE) &&
350             last && last != rt0) {
351                 /* no entries matched; do round-robin */
352                 static DEFINE_SPINLOCK(lock);
353                 spin_lock(&lock);
354                 *head = rt0->u.next;
355                 rt0->u.next = last->u.next;
356                 last->u.next = rt0;
357                 spin_unlock(&lock);
358         }
359
360         RT6_TRACE("%s() => %p, score=%d\n",
361                   __FUNCTION__, match, mpri);
362
363         return (match ? match : &ip6_null_entry);
364 }
365
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Handle a Route Information option.
 *
 * @dev:    device the option arrived on (its ifindex keys the route)
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router, used as the gateway
 *
 * The option is validated, then the matching route-info route is deleted
 * (lifetime 0), created (no route yet, lifetime non-zero) or updated with
 * the new preference.  A lifetime of 0xffffffff means "never expires".
 *
 * Returns 0 on success and -EINVAL when the option is malformed.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	u32 lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		pref = ICMPV6_ROUTER_PREF_MEDIUM;

	/* The lifetime arrives in network byte order: convert to host order. */
	lifetime = ntohl(rinfo->lifetime);
	if (lifetime == 0xffffffff) {
		/* infinity */
	} else if (lifetime > 0x7fffffff/HZ) {
		/* Avoid arithmetic overflow */
		lifetime = 0x7fffffff/HZ - 1;
	}

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

	/* A zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt, NULL, NULL, NULL);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (lifetime == 0xffffffff) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		/* presumably drops the reference taken by the get/add helper
		 * above - TODO confirm against their definitions */
		dst_release(&rt->u.dst);
	}
	return 0;
}
#endif
443
444 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
445                             int oif, int strict)
446 {
447         struct fib6_node *fn;
448         struct rt6_info *rt;
449
450         read_lock_bh(&rt6_lock);
451         fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
452         rt = rt6_device_match(fn->leaf, oif, strict);
453         dst_hold(&rt->u.dst);
454         rt->u.dst.__use++;
455         read_unlock_bh(&rt6_lock);
456
457         rt->u.dst.lastuse = jiffies;
458         if (rt->u.dst.error == 0)
459                 return rt;
460         dst_release(&rt->u.dst);
461         return NULL;
462 }
463
464 /* ip6_ins_rt is called with FREE rt6_lock.
465    It takes new route entry, the addition fails by any reason the
466    route is freed. In any case, if caller does not hold it, it may
467    be destroyed.
468  */
469
470 int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
471                 void *_rtattr, struct netlink_skb_parms *req)
472 {
473         int err;
474
475         write_lock_bh(&rt6_lock);
476         err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
477         write_unlock_bh(&rt6_lock);
478
479         return err;
480 }
481
482 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
483                                       struct in6_addr *saddr)
484 {
485         struct rt6_info *rt;
486
487         /*
488          *      Clone the route.
489          */
490
491         rt = ip6_rt_copy(ort);
492
493         if (rt) {
494                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
495                         if (rt->rt6i_dst.plen != 128 &&
496                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
497                                 rt->rt6i_flags |= RTF_ANYCAST;
498                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
499                 }
500
501                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
502                 rt->rt6i_dst.plen = 128;
503                 rt->rt6i_flags |= RTF_CACHE;
504                 rt->u.dst.flags |= DST_HOST;
505
506 #ifdef CONFIG_IPV6_SUBTREES
507                 if (rt->rt6i_src.plen && saddr) {
508                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
509                         rt->rt6i_src.plen = 128;
510                 }
511 #endif
512
513                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
514
515         }
516
517         return rt;
518 }
519
520 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
521 {
522         struct rt6_info *rt = ip6_rt_copy(ort);
523         if (rt) {
524                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
525                 rt->rt6i_dst.plen = 128;
526                 rt->rt6i_flags |= RTF_CACHE;
527                 if (rt->rt6i_flags & RTF_REJECT)
528                         rt->u.dst.error = ort->u.dst.error;
529                 rt->u.dst.flags |= DST_HOST;
530                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
531         }
532         return rt;
533 }
534
/*
 * BACKTRACK() - climb the fib tree when the current node yielded no route.
 *
 * Expects the enclosing function to provide:
 *   rt       - result of the route selection on the current node
 *   fn       - current struct fib6_node pointer (updated by the macro)
 *   out:     - label reached when the walk hits a node flagged RTN_ROOT
 *   restart: - label that re-runs the selection on the new @fn
 *
 * If rt is &ip6_null_entry, fn is moved to successive parents; the first
 * parent flagged RTN_ROOT jumps to "out", the first one flagged RTN_RTINFO
 * jumps to "restart".  Otherwise, or when the parents run out, execution
 * falls through.
 *
 * Wrapped in do { } while (0) so that "BACKTRACK();" is a single statement
 * and stays correct inside an unbraced if/else.
 */
#define BACKTRACK() \
do { \
	if (rt == &ip6_null_entry) { \
		while ((fn = fn->parent) != NULL) { \
			if (fn->fn_flags & RTN_ROOT) { \
				goto out; \
			} \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
545
546
547 void ip6_route_input(struct sk_buff *skb)
548 {
549         struct fib6_node *fn;
550         struct rt6_info *rt, *nrt;
551         int strict;
552         int attempts = 3;
553         int err;
554         int reachable = RT6_SELECT_F_REACHABLE;
555
556         strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
557
558 relookup:
559         read_lock_bh(&rt6_lock);
560
561 restart_2:
562         fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
563                          &skb->nh.ipv6h->saddr);
564
565 restart:
566         rt = rt6_select(&fn->leaf, skb->dev->ifindex, strict | reachable);
567         BACKTRACK();
568         if (rt == &ip6_null_entry ||
569             rt->rt6i_flags & RTF_CACHE)
570                 goto out;
571
572         dst_hold(&rt->u.dst);
573         read_unlock_bh(&rt6_lock);
574
575         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
576                 nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
577         else {
578 #if CLONE_OFFLINK_ROUTE
579                 nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
580 #else
581                 goto out2;
582 #endif
583         }
584
585         dst_release(&rt->u.dst);
586         rt = nrt ? : &ip6_null_entry;
587
588         dst_hold(&rt->u.dst);
589         if (nrt) {
590                 err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
591                 if (!err)
592                         goto out2;
593         }
594
595         if (--attempts <= 0)
596                 goto out2;
597
598         /*
599          * Race condition! In the gap, when rt6_lock was
600          * released someone could insert this route.  Relookup.
601          */
602         dst_release(&rt->u.dst);
603         goto relookup;
604
605 out:
606         if (reachable) {
607                 reachable = 0;
608                 goto restart_2;
609         }
610         dst_hold(&rt->u.dst);
611         read_unlock_bh(&rt6_lock);
612 out2:
613         rt->u.dst.lastuse = jiffies;
614         rt->u.dst.__use++;
615         skb->dst = (struct dst_entry *) rt;
616         return;
617 }
618
619 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
620 {
621         struct fib6_node *fn;
622         struct rt6_info *rt, *nrt;
623         int strict;
624         int attempts = 3;
625         int err;
626         int reachable = RT6_SELECT_F_REACHABLE;
627
628         strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
629
630 relookup:
631         read_lock_bh(&rt6_lock);
632
633 restart_2:
634         fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
635
636 restart:
637         rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
638         BACKTRACK();
639         if (rt == &ip6_null_entry ||
640             rt->rt6i_flags & RTF_CACHE)
641                 goto out;
642
643         dst_hold(&rt->u.dst);
644         read_unlock_bh(&rt6_lock);
645
646         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
647                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
648         else {
649 #if CLONE_OFFLINK_ROUTE
650                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
651 #else
652                 goto out2;
653 #endif
654         }
655
656         dst_release(&rt->u.dst);
657         rt = nrt ? : &ip6_null_entry;
658
659         dst_hold(&rt->u.dst);
660         if (nrt) {
661                 err = ip6_ins_rt(nrt, NULL, NULL, NULL);
662                 if (!err)
663                         goto out2;
664         }
665
666         if (--attempts <= 0)
667                 goto out2;
668
669         /*
670          * Race condition! In the gap, when rt6_lock was
671          * released someone could insert this route.  Relookup.
672          */
673         dst_release(&rt->u.dst);
674         goto relookup;
675
676 out:
677         if (reachable) {
678                 reachable = 0;
679                 goto restart_2;
680         }
681         dst_hold(&rt->u.dst);
682         read_unlock_bh(&rt6_lock);
683 out2:
684         rt->u.dst.lastuse = jiffies;
685         rt->u.dst.__use++;
686         return &rt->u.dst;
687 }
688
689
690 /*
691  *      Destination cache support functions
692  */
693
694 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
695 {
696         struct rt6_info *rt;
697
698         rt = (struct rt6_info *) dst;
699
700         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
701                 return dst;
702
703         return NULL;
704 }
705
706 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
707 {
708         struct rt6_info *rt = (struct rt6_info *) dst;
709
710         if (rt) {
711                 if (rt->rt6i_flags & RTF_CACHE)
712                         ip6_del_rt(rt, NULL, NULL, NULL);
713                 else
714                         dst_release(dst);
715         }
716         return NULL;
717 }
718
719 static void ip6_link_failure(struct sk_buff *skb)
720 {
721         struct rt6_info *rt;
722
723         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
724
725         rt = (struct rt6_info *) skb->dst;
726         if (rt) {
727                 if (rt->rt6i_flags&RTF_CACHE) {
728                         dst_set_expires(&rt->u.dst, 0);
729                         rt->rt6i_flags |= RTF_EXPIRES;
730                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
731                         rt->rt6i_node->fn_sernum = -1;
732         }
733 }
734
735 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
736 {
737         struct rt6_info *rt6 = (struct rt6_info*)dst;
738
739         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
740                 rt6->rt6i_flags |= RTF_MODIFIED;
741                 if (mtu < IPV6_MIN_MTU) {
742                         mtu = IPV6_MIN_MTU;
743                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
744                 }
745                 dst->metrics[RTAX_MTU-1] = mtu;
746                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
747         }
748 }
749
750 static int ipv6_get_mtu(struct net_device *dev);
751
752 static inline unsigned int ipv6_advmss(unsigned int mtu)
753 {
754         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
755
756         if (mtu < ip6_rt_min_advmss)
757                 mtu = ip6_rt_min_advmss;
758
759         /*
760          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 
761          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
762          * IPV6_MAXPLEN is also valid and means: "any MSS, 
763          * rely only on pmtu discovery"
764          */
765         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
766                 mtu = IPV6_MAXPLEN;
767         return mtu;
768 }
769
770 static struct dst_entry *ndisc_dst_gc_list;
771 DEFINE_SPINLOCK(ndisc_lock);
772
773 struct dst_entry *ndisc_dst_alloc(struct net_device *dev, 
774                                   struct neighbour *neigh,
775                                   struct in6_addr *addr,
776                                   int (*output)(struct sk_buff *))
777 {
778         struct rt6_info *rt;
779         struct inet6_dev *idev = in6_dev_get(dev);
780
781         if (unlikely(idev == NULL))
782                 return NULL;
783
784         rt = ip6_dst_alloc();
785         if (unlikely(rt == NULL)) {
786                 in6_dev_put(idev);
787                 goto out;
788         }
789
790         dev_hold(dev);
791         if (neigh)
792                 neigh_hold(neigh);
793         else
794                 neigh = ndisc_get_neigh(dev, addr);
795
796         rt->rt6i_dev      = dev;
797         rt->rt6i_idev     = idev;
798         rt->rt6i_nexthop  = neigh;
799         atomic_set(&rt->u.dst.__refcnt, 1);
800         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
801         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
802         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
803         rt->u.dst.output  = output;
804
805 #if 0   /* there's no chance to use these for ndisc */
806         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST 
807                                 ? DST_HOST 
808                                 : 0;
809         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
810         rt->rt6i_dst.plen = 128;
811 #endif
812
813         spin_lock_bh(&ndisc_lock);
814         rt->u.dst.next = ndisc_dst_gc_list;
815         ndisc_dst_gc_list = &rt->u.dst;
816         spin_unlock_bh(&ndisc_lock);
817
818         fib6_force_start_gc();
819
820 out:
821         return (struct dst_entry *)rt;
822 }
823
824 int ndisc_dst_gc(int *more)
825 {
826         struct dst_entry *dst, *next, **pprev;
827         int freed;
828
829         next = NULL;
830         freed = 0;
831
832         spin_lock_bh(&ndisc_lock);
833         pprev = &ndisc_dst_gc_list;
834
835         while ((dst = *pprev) != NULL) {
836                 if (!atomic_read(&dst->__refcnt)) {
837                         *pprev = dst->next;
838                         dst_free(dst);
839                         freed++;
840                 } else {
841                         pprev = &dst->next;
842                         (*more)++;
843                 }
844         }
845
846         spin_unlock_bh(&ndisc_lock);
847
848         return freed;
849 }
850
851 static int ip6_dst_gc(void)
852 {
853         static unsigned expire = 30*HZ;
854         static unsigned long last_gc;
855         unsigned long now = jiffies;
856
857         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
858             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
859                 goto out;
860
861         expire++;
862         fib6_run_gc(expire);
863         last_gc = now;
864         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
865                 expire = ip6_rt_gc_timeout>>1;
866
867 out:
868         expire -= expire>>ip6_rt_gc_elasticity;
869         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
870 }
871
872 /* Clean host part of a prefix. Not necessary in radix tree,
873    but results in cleaner routing tables.
874
875    Remove it only when all the things will work!
876  */
877
878 static int ipv6_get_mtu(struct net_device *dev)
879 {
880         int mtu = IPV6_MIN_MTU;
881         struct inet6_dev *idev;
882
883         idev = in6_dev_get(dev);
884         if (idev) {
885                 mtu = idev->cnf.mtu6;
886                 in6_dev_put(idev);
887         }
888         return mtu;
889 }
890
891 int ipv6_get_hoplimit(struct net_device *dev)
892 {
893         int hoplimit = ipv6_devconf.hop_limit;
894         struct inet6_dev *idev;
895
896         idev = in6_dev_get(dev);
897         if (idev) {
898                 hoplimit = idev->cnf.hop_limit;
899                 in6_dev_put(idev);
900         }
901         return hoplimit;
902 }
903
904 /*
905  *
906  */
907
908 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, 
909                 void *_rtattr, struct netlink_skb_parms *req)
910 {
911         int err;
912         struct rtmsg *r;
913         struct rtattr **rta;
914         struct rt6_info *rt = NULL;
915         struct net_device *dev = NULL;
916         struct inet6_dev *idev = NULL;
917         int addr_type;
918
919         rta = (struct rtattr **) _rtattr;
920
921         if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
922                 return -EINVAL;
923 #ifndef CONFIG_IPV6_SUBTREES
924         if (rtmsg->rtmsg_src_len)
925                 return -EINVAL;
926 #endif
927         if (rtmsg->rtmsg_ifindex) {
928                 err = -ENODEV;
929                 dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
930                 if (!dev)
931                         goto out;
932                 idev = in6_dev_get(dev);
933                 if (!idev)
934                         goto out;
935         }
936
937         if (rtmsg->rtmsg_metric == 0)
938                 rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
939
940         rt = ip6_dst_alloc();
941
942         if (rt == NULL) {
943                 err = -ENOMEM;
944                 goto out;
945         }
946
947         rt->u.dst.obsolete = -1;
948         rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
949         if (nlh && (r = NLMSG_DATA(nlh))) {
950                 rt->rt6i_protocol = r->rtm_protocol;
951         } else {
952                 rt->rt6i_protocol = RTPROT_BOOT;
953         }
954
955         addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
956
957         if (addr_type & IPV6_ADDR_MULTICAST)
958                 rt->u.dst.input = ip6_mc_input;
959         else
960                 rt->u.dst.input = ip6_forward;
961
962         rt->u.dst.output = ip6_output;
963
964         ipv6_addr_prefix(&rt->rt6i_dst.addr, 
965                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
966         rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
967         if (rt->rt6i_dst.plen == 128)
968                rt->u.dst.flags = DST_HOST;
969
970 #ifdef CONFIG_IPV6_SUBTREES
971         ipv6_addr_prefix(&rt->rt6i_src.addr, 
972                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
973         rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
974 #endif
975
976         rt->rt6i_metric = rtmsg->rtmsg_metric;
977
978         /* We cannot add true routes via loopback here,
979            they would result in kernel looping; promote them to reject routes
980          */
981         if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
982             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
983                 /* hold loopback dev/idev if we haven't done so. */
984                 if (dev != &loopback_dev) {
985                         if (dev) {
986                                 dev_put(dev);
987                                 in6_dev_put(idev);
988                         }
989                         dev = &loopback_dev;
990                         dev_hold(dev);
991                         idev = in6_dev_get(dev);
992                         if (!idev) {
993                                 err = -ENODEV;
994                                 goto out;
995                         }
996                 }
997                 rt->u.dst.output = ip6_pkt_discard_out;
998                 rt->u.dst.input = ip6_pkt_discard;
999                 rt->u.dst.error = -ENETUNREACH;
1000                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1001                 goto install_route;
1002         }
1003
1004         if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
1005                 struct in6_addr *gw_addr;
1006                 int gwa_type;
1007
1008                 gw_addr = &rtmsg->rtmsg_gateway;
1009                 ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
1010                 gwa_type = ipv6_addr_type(gw_addr);
1011
1012                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1013                         struct rt6_info *grt;
1014
1015                         /* IPv6 strictly inhibits using not link-local
1016                            addresses as nexthop address.
1017                            Otherwise, router will not able to send redirects.
1018                            It is very good, but in some (rare!) circumstances
1019                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1020                            some exceptions. --ANK
1021                          */
1022                         err = -EINVAL;
1023                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1024                                 goto out;
1025
1026                         grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
1027
1028                         err = -EHOSTUNREACH;
1029                         if (grt == NULL)
1030                                 goto out;
1031                         if (dev) {
1032                                 if (dev != grt->rt6i_dev) {
1033                                         dst_release(&grt->u.dst);
1034                                         goto out;
1035                                 }
1036                         } else {
1037                                 dev = grt->rt6i_dev;
1038                                 idev = grt->rt6i_idev;
1039                                 dev_hold(dev);
1040                                 in6_dev_hold(grt->rt6i_idev);
1041                         }
1042                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1043                                 err = 0;
1044                         dst_release(&grt->u.dst);
1045
1046                         if (err)
1047                                 goto out;
1048                 }
1049                 err = -EINVAL;
1050                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1051                         goto out;
1052         }
1053
1054         err = -ENODEV;
1055         if (dev == NULL)
1056                 goto out;
1057
1058         if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
1059                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1060                 if (IS_ERR(rt->rt6i_nexthop)) {
1061                         err = PTR_ERR(rt->rt6i_nexthop);
1062                         rt->rt6i_nexthop = NULL;
1063                         goto out;
1064                 }
1065         }
1066
1067         rt->rt6i_flags = rtmsg->rtmsg_flags;
1068
1069 install_route:
1070         if (rta && rta[RTA_METRICS-1]) {
1071                 int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
1072                 struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
1073
1074                 while (RTA_OK(attr, attrlen)) {
1075                         unsigned flavor = attr->rta_type;
1076                         if (flavor) {
1077                                 if (flavor > RTAX_MAX) {
1078                                         err = -EINVAL;
1079                                         goto out;
1080                                 }
1081                                 rt->u.dst.metrics[flavor-1] =
1082                                         *(u32 *)RTA_DATA(attr);
1083                         }
1084                         attr = RTA_NEXT(attr, attrlen);
1085                 }
1086         }
1087
1088         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1089                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1090         if (!rt->u.dst.metrics[RTAX_MTU-1])
1091                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1092         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1093                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1094         rt->u.dst.dev = dev;
1095         rt->rt6i_idev = idev;
1096         return ip6_ins_rt(rt, nlh, _rtattr, req);
1097
1098 out:
1099         if (dev)
1100                 dev_put(dev);
1101         if (idev)
1102                 in6_dev_put(idev);
1103         if (rt)
1104                 dst_free((struct dst_entry *) rt);
1105         return err;
1106 }
1107
1108 int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1109 {
1110         int err;
1111
1112         write_lock_bh(&rt6_lock);
1113
1114         err = fib6_del(rt, nlh, _rtattr, req);
1115         dst_release(&rt->u.dst);
1116
1117         write_unlock_bh(&rt6_lock);
1118
1119         return err;
1120 }
1121
1122 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1123 {
1124         struct fib6_node *fn;
1125         struct rt6_info *rt;
1126         int err = -ESRCH;
1127
1128         read_lock_bh(&rt6_lock);
1129
1130         fn = fib6_locate(&ip6_routing_table,
1131                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1132                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1133         
1134         if (fn) {
1135                 for (rt = fn->leaf; rt; rt = rt->u.next) {
1136                         if (rtmsg->rtmsg_ifindex &&
1137                             (rt->rt6i_dev == NULL ||
1138                              rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1139                                 continue;
1140                         if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1141                             !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1142                                 continue;
1143                         if (rtmsg->rtmsg_metric &&
1144                             rtmsg->rtmsg_metric != rt->rt6i_metric)
1145                                 continue;
1146                         dst_hold(&rt->u.dst);
1147                         read_unlock_bh(&rt6_lock);
1148
1149                         return ip6_del_rt(rt, nlh, _rtattr, req);
1150                 }
1151         }
1152         read_unlock_bh(&rt6_lock);
1153
1154         return err;
1155 }
1156
1157 /*
1158  *      Handle redirects
1159  */
1160 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1161                   struct neighbour *neigh, u8 *lladdr, int on_link)
1162 {
1163         struct rt6_info *rt, *nrt = NULL;
1164         int strict;
1165         struct fib6_node *fn;
1166         struct netevent_redirect netevent;
1167
1168         /*
1169          * Get the "current" route for this destination and
1170          * check if the redirect has come from approriate router.
1171          *
1172          * RFC 2461 specifies that redirects should only be
1173          * accepted if they come from the nexthop to the target.
1174          * Due to the way the routes are chosen, this notion
1175          * is a bit fuzzy and one might need to check all possible
1176          * routes.
1177          */
1178         strict = ipv6_addr_type(dest) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL);
1179
1180         read_lock_bh(&rt6_lock);
1181         fn = fib6_lookup(&ip6_routing_table, dest, NULL);
1182 restart:
1183         for (rt = fn->leaf; rt; rt = rt->u.next) {
1184                 /*
1185                  * Current route is on-link; redirect is always invalid.
1186                  *
1187                  * Seems, previous statement is not true. It could
1188                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1189                  * But then router serving it might decide, that we should
1190                  * know truth 8)8) --ANK (980726).
1191                  */
1192                 if (rt6_check_expired(rt))
1193                         continue;
1194                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1195                         continue;
1196                 if (neigh->dev != rt->rt6i_dev)
1197                         continue;
1198                 if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway))
1199                         continue;
1200                 break;
1201         }
1202         if (rt)
1203                 dst_hold(&rt->u.dst);
1204         else if (strict) {
1205                 while ((fn = fn->parent) != NULL) {
1206                         if (fn->fn_flags & RTN_ROOT)
1207                                 break;
1208                         if (fn->fn_flags & RTN_RTINFO)
1209                                 goto restart;
1210                 }
1211         }
1212         read_unlock_bh(&rt6_lock);
1213
1214         if (!rt) {
1215                 if (net_ratelimit())
1216                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1217                                "for redirect target\n");
1218                 return;
1219         }
1220
1221         /*
1222          *      We have finally decided to accept it.
1223          */
1224
1225         neigh_update(neigh, lladdr, NUD_STALE, 
1226                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1227                      NEIGH_UPDATE_F_OVERRIDE|
1228                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1229                                      NEIGH_UPDATE_F_ISROUTER))
1230                      );
1231
1232         /*
1233          * Redirect received -> path was valid.
1234          * Look, redirects are sent only in response to data packets,
1235          * so that this nexthop apparently is reachable. --ANK
1236          */
1237         dst_confirm(&rt->u.dst);
1238
1239         /* Duplicate redirect: silently ignore. */
1240         if (neigh == rt->u.dst.neighbour)
1241                 goto out;
1242
1243         nrt = ip6_rt_copy(rt);
1244         if (nrt == NULL)
1245                 goto out;
1246
1247         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1248         if (on_link)
1249                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1250
1251         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1252         nrt->rt6i_dst.plen = 128;
1253         nrt->u.dst.flags |= DST_HOST;
1254
1255         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1256         nrt->rt6i_nexthop = neigh_clone(neigh);
1257         /* Reset pmtu, it may be better */
1258         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1259         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1260
1261         if (ip6_ins_rt(nrt, NULL, NULL, NULL))
1262                 goto out;
1263
1264         netevent.old = &rt->u.dst;
1265         netevent.new = &nrt->u.dst;
1266         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1267
1268         if (rt->rt6i_flags&RTF_CACHE) {
1269                 ip6_del_rt(rt, NULL, NULL, NULL);
1270                 return;
1271         }
1272
1273 out:
1274         dst_release(&rt->u.dst);
1275         return;
1276 }
1277
1278 /*
1279  *      Handle ICMP "packet too big" messages
1280  *      i.e. Path MTU discovery
1281  */
1282
1283 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1284                         struct net_device *dev, u32 pmtu)
1285 {
1286         struct rt6_info *rt, *nrt;
1287         int allfrag = 0;
1288
1289         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1290         if (rt == NULL)
1291                 return;
1292
1293         if (pmtu >= dst_mtu(&rt->u.dst))
1294                 goto out;
1295
1296         if (pmtu < IPV6_MIN_MTU) {
1297                 /*
1298                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link 
1299                  * MTU (1280) and a fragment header should always be included
1300                  * after a node receiving Too Big message reporting PMTU is
1301                  * less than the IPv6 Minimum Link MTU.
1302                  */
1303                 pmtu = IPV6_MIN_MTU;
1304                 allfrag = 1;
1305         }
1306
1307         /* New mtu received -> path was valid.
1308            They are sent only in response to data packets,
1309            so that this nexthop apparently is reachable. --ANK
1310          */
1311         dst_confirm(&rt->u.dst);
1312
1313         /* Host route. If it is static, it would be better
1314            not to override it, but add new one, so that
1315            when cache entry will expire old pmtu
1316            would return automatically.
1317          */
1318         if (rt->rt6i_flags & RTF_CACHE) {
1319                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1320                 if (allfrag)
1321                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1322                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1323                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1324                 goto out;
1325         }
1326
1327         /* Network route.
1328            Two cases are possible:
1329            1. It is connected route. Action: COW
1330            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1331          */
1332         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1333                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1334         else
1335                 nrt = rt6_alloc_clone(rt, daddr);
1336
1337         if (nrt) {
1338                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1339                 if (allfrag)
1340                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1341
1342                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1343                  * happened within 5 mins, the recommended timer is 10 mins.
1344                  * Here this route expiration time is set to ip6_rt_mtu_expires
1345                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1346                  * and detecting PMTU increase will be automatically happened.
1347                  */
1348                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1349                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1350
1351                 ip6_ins_rt(nrt, NULL, NULL, NULL);
1352         }
1353 out:
1354         dst_release(&rt->u.dst);
1355 }
1356
1357 /*
1358  *      Misc support functions
1359  */
1360
1361 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1362 {
1363         struct rt6_info *rt = ip6_dst_alloc();
1364
1365         if (rt) {
1366                 rt->u.dst.input = ort->u.dst.input;
1367                 rt->u.dst.output = ort->u.dst.output;
1368
1369                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1370                 rt->u.dst.dev = ort->u.dst.dev;
1371                 if (rt->u.dst.dev)
1372                         dev_hold(rt->u.dst.dev);
1373                 rt->rt6i_idev = ort->rt6i_idev;
1374                 if (rt->rt6i_idev)
1375                         in6_dev_hold(rt->rt6i_idev);
1376                 rt->u.dst.lastuse = jiffies;
1377                 rt->rt6i_expires = 0;
1378
1379                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1380                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1381                 rt->rt6i_metric = 0;
1382
1383                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1384 #ifdef CONFIG_IPV6_SUBTREES
1385                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1386 #endif
1387         }
1388         return rt;
1389 }
1390
1391 #ifdef CONFIG_IPV6_ROUTE_INFO
1392 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1393                                            struct in6_addr *gwaddr, int ifindex)
1394 {
1395         struct fib6_node *fn;
1396         struct rt6_info *rt = NULL;
1397
1398         write_lock_bh(&rt6_lock);
1399         fn = fib6_locate(&ip6_routing_table, prefix ,prefixlen, NULL, 0);
1400         if (!fn)
1401                 goto out;
1402
1403         for (rt = fn->leaf; rt; rt = rt->u.next) {
1404                 if (rt->rt6i_dev->ifindex != ifindex)
1405                         continue;
1406                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1407                         continue;
1408                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1409                         continue;
1410                 dst_hold(&rt->u.dst);
1411                 break;
1412         }
1413 out:
1414         write_unlock_bh(&rt6_lock);
1415         return rt;
1416 }
1417
1418 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1419                                            struct in6_addr *gwaddr, int ifindex,
1420                                            unsigned pref)
1421 {
1422         struct in6_rtmsg rtmsg;
1423
1424         memset(&rtmsg, 0, sizeof(rtmsg));
1425         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1426         ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1427         rtmsg.rtmsg_dst_len = prefixlen;
1428         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1429         rtmsg.rtmsg_metric = 1024;
1430         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
1431         /* We should treat it as a default route if prefix length is 0. */
1432         if (!prefixlen)
1433                 rtmsg.rtmsg_flags |= RTF_DEFAULT;
1434         rtmsg.rtmsg_ifindex = ifindex;
1435
1436         ip6_route_add(&rtmsg, NULL, NULL, NULL);
1437
1438         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1439 }
1440 #endif
1441
1442 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1443 {       
1444         struct rt6_info *rt;
1445         struct fib6_node *fn;
1446
1447         fn = &ip6_routing_table;
1448
1449         write_lock_bh(&rt6_lock);
1450         for (rt = fn->leaf; rt; rt=rt->u.next) {
1451                 if (dev == rt->rt6i_dev &&
1452                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1453                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1454                         break;
1455         }
1456         if (rt)
1457                 dst_hold(&rt->u.dst);
1458         write_unlock_bh(&rt6_lock);
1459         return rt;
1460 }
1461
1462 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1463                                      struct net_device *dev,
1464                                      unsigned int pref)
1465 {
1466         struct in6_rtmsg rtmsg;
1467
1468         memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1469         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1470         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1471         rtmsg.rtmsg_metric = 1024;
1472         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1473                             RTF_PREF(pref);
1474
1475         rtmsg.rtmsg_ifindex = dev->ifindex;
1476
1477         ip6_route_add(&rtmsg, NULL, NULL, NULL);
1478         return rt6_get_dflt_router(gwaddr, dev);
1479 }
1480
1481 void rt6_purge_dflt_routers(void)
1482 {
1483         struct rt6_info *rt;
1484
1485 restart:
1486         read_lock_bh(&rt6_lock);
1487         for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1488                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1489                         dst_hold(&rt->u.dst);
1490
1491                         read_unlock_bh(&rt6_lock);
1492
1493                         ip6_del_rt(rt, NULL, NULL, NULL);
1494
1495                         goto restart;
1496                 }
1497         }
1498         read_unlock_bh(&rt6_lock);
1499 }
1500
1501 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1502 {
1503         struct in6_rtmsg rtmsg;
1504         int err;
1505
1506         switch(cmd) {
1507         case SIOCADDRT:         /* Add a route */
1508         case SIOCDELRT:         /* Delete a route */
1509                 if (!capable(CAP_NET_ADMIN))
1510                         return -EPERM;
1511                 err = copy_from_user(&rtmsg, arg,
1512                                      sizeof(struct in6_rtmsg));
1513                 if (err)
1514                         return -EFAULT;
1515                         
1516                 rtnl_lock();
1517                 switch (cmd) {
1518                 case SIOCADDRT:
1519                         err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
1520                         break;
1521                 case SIOCDELRT:
1522                         err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
1523                         break;
1524                 default:
1525                         err = -EINVAL;
1526                 }
1527                 rtnl_unlock();
1528
1529                 return err;
1530         };
1531
1532         return -EINVAL;
1533 }
1534
1535 /*
1536  *      Drop the packet on the floor
1537  */
1538
1539 static int ip6_pkt_discard(struct sk_buff *skb)
1540 {
1541         int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1542         if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1543                 IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1544
1545         IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1546         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1547         kfree_skb(skb);
1548         return 0;
1549 }
1550
1551 static int ip6_pkt_discard_out(struct sk_buff *skb)
1552 {
1553         skb->dev = skb->dst->dev;
1554         return ip6_pkt_discard(skb);
1555 }
1556
1557 /*
1558  *      Allocate a dst for local (unicast / anycast) address.
1559  */
1560
1561 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1562                                     const struct in6_addr *addr,
1563                                     int anycast)
1564 {
1565         struct rt6_info *rt = ip6_dst_alloc();
1566
1567         if (rt == NULL)
1568                 return ERR_PTR(-ENOMEM);
1569
1570         dev_hold(&loopback_dev);
1571         in6_dev_hold(idev);
1572
1573         rt->u.dst.flags = DST_HOST;
1574         rt->u.dst.input = ip6_input;
1575         rt->u.dst.output = ip6_output;
1576         rt->rt6i_dev = &loopback_dev;
1577         rt->rt6i_idev = idev;
1578         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1579         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1580         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1581         rt->u.dst.obsolete = -1;
1582
1583         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1584         if (anycast)
1585                 rt->rt6i_flags |= RTF_ANYCAST;
1586         else
1587                 rt->rt6i_flags |= RTF_LOCAL;
1588         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1589         if (rt->rt6i_nexthop == NULL) {
1590                 dst_free((struct dst_entry *) rt);
1591                 return ERR_PTR(-ENOMEM);
1592         }
1593
1594         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1595         rt->rt6i_dst.plen = 128;
1596
1597         atomic_set(&rt->u.dst.__refcnt, 1);
1598
1599         return rt;
1600 }
1601
1602 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1603 {
1604         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1605             rt != &ip6_null_entry) {
1606                 RT6_TRACE("deleted by ifdown %p\n", rt);
1607                 return -1;
1608         }
1609         return 0;
1610 }
1611
1612 void rt6_ifdown(struct net_device *dev)
1613 {
1614         write_lock_bh(&rt6_lock);
1615         fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1616         write_unlock_bh(&rt6_lock);
1617 }
1618
/* Argument bundle passed through fib6_clean_tree() to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
        struct net_device *dev; /* device whose MTU changed */
        unsigned int mtu;       /* the new MTU */
};
1624
1625 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1626 {
1627         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1628         struct inet6_dev *idev;
1629
1630         /* In IPv6 pmtu discovery is not optional,
1631            so that RTAX_MTU lock cannot disable it.
1632            We still use this lock to block changes
1633            caused by addrconf/ndisc.
1634         */
1635
1636         idev = __in6_dev_get(arg->dev);
1637         if (idev == NULL)
1638                 return 0;
1639
1640         /* For administrative MTU increase, there is no way to discover
1641            IPv6 PMTU increase, so PMTU increase should be updated here.
1642            Since RFC 1981 doesn't include administrative MTU increase
1643            update PMTU increase is a MUST. (i.e. jumbo frame)
1644          */
1645         /*
1646            If new MTU is less than route PMTU, this new MTU will be the
1647            lowest MTU in the path, update the route PMTU to reflect PMTU
1648            decreases; if new MTU is greater than route PMTU, and the
1649            old MTU is the lowest MTU in the path, update the route PMTU
1650            to reflect the increase. In this case if the other nodes' MTU
1651            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1652            PMTU discouvery.
1653          */
1654         if (rt->rt6i_dev == arg->dev &&
1655             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1656             (dst_mtu(&rt->u.dst) > arg->mtu ||
1657              (dst_mtu(&rt->u.dst) < arg->mtu &&
1658               dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1659                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1660         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1661         return 0;
1662 }
1663
1664 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1665 {
1666         struct rt6_mtu_change_arg arg;
1667
1668         arg.dev = dev;
1669         arg.mtu = mtu;
1670         read_lock_bh(&rt6_lock);
1671         fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1672         read_unlock_bh(&rt6_lock);
1673 }
1674
1675 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1676                               struct in6_rtmsg *rtmsg)
1677 {
1678         memset(rtmsg, 0, sizeof(*rtmsg));
1679
1680         rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1681         rtmsg->rtmsg_src_len = r->rtm_src_len;
1682         rtmsg->rtmsg_flags = RTF_UP;
1683         if (r->rtm_type == RTN_UNREACHABLE)
1684                 rtmsg->rtmsg_flags |= RTF_REJECT;
1685
1686         if (rta[RTA_GATEWAY-1]) {
1687                 if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1688                         return -EINVAL;
1689                 memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1690                 rtmsg->rtmsg_flags |= RTF_GATEWAY;
1691         }
1692         if (rta[RTA_DST-1]) {
1693                 if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1694                         return -EINVAL;
1695                 memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1696         }
1697         if (rta[RTA_SRC-1]) {
1698                 if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1699                         return -EINVAL;
1700                 memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1701         }
1702         if (rta[RTA_OIF-1]) {
1703                 if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1704                         return -EINVAL;
1705                 memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1706         }
1707         if (rta[RTA_PRIORITY-1]) {
1708                 if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1709                         return -EINVAL;
1710                 memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1711         }
1712         return 0;
1713 }
1714
1715 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1716 {
1717         struct rtmsg *r = NLMSG_DATA(nlh);
1718         struct in6_rtmsg rtmsg;
1719
1720         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1721                 return -EINVAL;
1722         return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1723 }
1724
1725 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1726 {
1727         struct rtmsg *r = NLMSG_DATA(nlh);
1728         struct in6_rtmsg rtmsg;
1729
1730         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1731                 return -EINVAL;
1732         return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1733 }
1734
/* Pairs an output skb with its netlink callback; presumably the state for a netlink route dump (inferred from the name). */
struct rt6_rtnl_dump_arg {
        struct sk_buff *skb;            /* buffer the dump is written into */
        struct netlink_callback *cb;    /* netlink callback of the dump */
};
1740
1741 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1742                          struct in6_addr *dst, struct in6_addr *src,
1743                          int iif, int type, u32 pid, u32 seq,
1744                          int prefix, unsigned int flags)
1745 {
1746         struct rtmsg *rtm;
1747         struct nlmsghdr  *nlh;
1748         unsigned char    *b = skb->tail;
1749         struct rta_cacheinfo ci;
1750
1751         if (prefix) {   /* user wants prefix routes only */
1752                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1753                         /* success since this is not a prefix route */
1754                         return 1;
1755                 }
1756         }
1757
1758         nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1759         rtm = NLMSG_DATA(nlh);
1760         rtm->rtm_family = AF_INET6;
1761         rtm->rtm_dst_len = rt->rt6i_dst.plen;
1762         rtm->rtm_src_len = rt->rt6i_src.plen;
1763         rtm->rtm_tos = 0;
1764         rtm->rtm_table = RT_TABLE_MAIN;
1765         if (rt->rt6i_flags&RTF_REJECT)
1766                 rtm->rtm_type = RTN_UNREACHABLE;
1767         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1768                 rtm->rtm_type = RTN_LOCAL;
1769         else
1770                 rtm->rtm_type = RTN_UNICAST;
1771         rtm->rtm_flags = 0;
1772         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1773         rtm->rtm_protocol = rt->rt6i_protocol;
1774         if (rt->rt6i_flags&RTF_DYNAMIC)
1775                 rtm->rtm_protocol = RTPROT_REDIRECT;
1776         else if (rt->rt6i_flags & RTF_ADDRCONF)
1777                 rtm->rtm_protocol = RTPROT_KERNEL;
1778         else if (rt->rt6i_flags&RTF_DEFAULT)
1779                 rtm->rtm_protocol = RTPROT_RA;
1780
1781         if (rt->rt6i_flags&RTF_CACHE)
1782                 rtm->rtm_flags |= RTM_F_CLONED;
1783
1784         if (dst) {
1785                 RTA_PUT(skb, RTA_DST, 16, dst);
1786                 rtm->rtm_dst_len = 128;
1787         } else if (rtm->rtm_dst_len)
1788                 RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1789 #ifdef CONFIG_IPV6_SUBTREES
1790         if (src) {
1791                 RTA_PUT(skb, RTA_SRC, 16, src);
1792                 rtm->rtm_src_len = 128;
1793         } else if (rtm->rtm_src_len)
1794                 RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1795 #endif
1796         if (iif)
1797                 RTA_PUT(skb, RTA_IIF, 4, &iif);
1798         else if (dst) {
1799                 struct in6_addr saddr_buf;
1800                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1801                         RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1802         }
1803         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1804                 goto rtattr_failure;
1805         if (rt->u.dst.neighbour)
1806                 RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1807         if (rt->u.dst.dev)
1808                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1809         RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1810         ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1811         if (rt->rt6i_expires)
1812                 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1813         else
1814                 ci.rta_expires = 0;
1815         ci.rta_used = rt->u.dst.__use;
1816         ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1817         ci.rta_error = rt->u.dst.error;
1818         ci.rta_id = 0;
1819         ci.rta_ts = 0;
1820         ci.rta_tsage = 0;
1821         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1822         nlh->nlmsg_len = skb->tail - b;
1823         return skb->len;
1824
1825 nlmsg_failure:
1826 rtattr_failure:
1827         skb_trim(skb, b - skb->data);
1828         return -1;
1829 }
1830
1831 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1832 {
1833         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1834         int prefix;
1835
1836         if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1837                 struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1838                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1839         } else
1840                 prefix = 0;
1841
1842         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1843                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1844                      prefix, NLM_F_MULTI);
1845 }
1846
1847 static int fib6_dump_node(struct fib6_walker_t *w)
1848 {
1849         int res;
1850         struct rt6_info *rt;
1851
1852         for (rt = w->leaf; rt; rt = rt->u.next) {
1853                 res = rt6_dump_route(rt, w->args);
1854                 if (res < 0) {
1855                         /* Frame is full, suspend walking */
1856                         w->leaf = rt;
1857                         return 1;
1858                 }
1859                 BUG_TRAP(res!=0);
1860         }
1861         w->leaf = NULL;
1862         return 0;
1863 }
1864
1865 static void fib6_dump_end(struct netlink_callback *cb)
1866 {
1867         struct fib6_walker_t *w = (void*)cb->args[0];
1868
1869         if (w) {
1870                 cb->args[0] = 0;
1871                 fib6_walker_unlink(w);
1872                 kfree(w);
1873         }
1874         cb->done = (void*)cb->args[1];
1875         cb->args[1] = 0;
1876 }
1877
1878 static int fib6_dump_done(struct netlink_callback *cb)
1879 {
1880         fib6_dump_end(cb);
1881         return cb->done ? cb->done(cb) : 0;
1882 }
1883
1884 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1885 {
1886         struct rt6_rtnl_dump_arg arg;
1887         struct fib6_walker_t *w;
1888         int res;
1889
1890         arg.skb = skb;
1891         arg.cb = cb;
1892
1893         w = (void*)cb->args[0];
1894         if (w == NULL) {
1895                 /* New dump:
1896                  * 
1897                  * 1. hook callback destructor.
1898                  */
1899                 cb->args[1] = (long)cb->done;
1900                 cb->done = fib6_dump_done;
1901
1902                 /*
1903                  * 2. allocate and initialize walker.
1904                  */
1905                 w = kzalloc(sizeof(*w), GFP_ATOMIC);
1906                 if (w == NULL)
1907                         return -ENOMEM;
1908                 RT6_TRACE("dump<%p", w);
1909                 w->root = &ip6_routing_table;
1910                 w->func = fib6_dump_node;
1911                 w->args = &arg;
1912                 cb->args[0] = (long)w;
1913                 read_lock_bh(&rt6_lock);
1914                 res = fib6_walk(w);
1915                 read_unlock_bh(&rt6_lock);
1916         } else {
1917                 w->args = &arg;
1918                 read_lock_bh(&rt6_lock);
1919                 res = fib6_walk_continue(w);
1920                 read_unlock_bh(&rt6_lock);
1921         }
1922 #if RT6_DEBUG >= 3
1923         if (res <= 0 && skb->len == 0)
1924                 RT6_TRACE("%p>dump end\n", w);
1925 #endif
1926         res = res < 0 ? res : skb->len;
1927         /* res < 0 is an error. (really, impossible)
1928            res == 0 means that dump is complete, but skb still can contain data.
1929            res > 0 dump is not complete, but frame is full.
1930          */
1931         /* Destroy walker, if dump of this table is complete. */
1932         if (res <= 0)
1933                 fib6_dump_end(cb);
1934         return res;
1935 }
1936
1937 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1938 {
1939         struct rtattr **rta = arg;
1940         int iif = 0;
1941         int err = -ENOBUFS;
1942         struct sk_buff *skb;
1943         struct flowi fl;
1944         struct rt6_info *rt;
1945
1946         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1947         if (skb == NULL)
1948                 goto out;
1949
1950         /* Reserve room for dummy headers, this skb can pass
1951            through good chunk of routing engine.
1952          */
1953         skb->mac.raw = skb->data;
1954         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1955
1956         memset(&fl, 0, sizeof(fl));
1957         if (rta[RTA_SRC-1])
1958                 ipv6_addr_copy(&fl.fl6_src,
1959                                (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1960         if (rta[RTA_DST-1])
1961                 ipv6_addr_copy(&fl.fl6_dst,
1962                                (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1963
1964         if (rta[RTA_IIF-1])
1965                 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1966
1967         if (iif) {
1968                 struct net_device *dev;
1969                 dev = __dev_get_by_index(iif);
1970                 if (!dev) {
1971                         err = -ENODEV;
1972                         goto out_free;
1973                 }
1974         }
1975
1976         fl.oif = 0;
1977         if (rta[RTA_OIF-1])
1978                 memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1979
1980         rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1981
1982         skb->dst = &rt->u.dst;
1983
1984         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1985         err = rt6_fill_node(skb, rt, 
1986                             &fl.fl6_dst, &fl.fl6_src,
1987                             iif,
1988                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
1989                             nlh->nlmsg_seq, 0, 0);
1990         if (err < 0) {
1991                 err = -EMSGSIZE;
1992                 goto out_free;
1993         }
1994
1995         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1996         if (err > 0)
1997                 err = 0;
1998 out:
1999         return err;
2000 out_free:
2001         kfree_skb(skb);
2002         goto out;       
2003 }
2004
2005 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh, 
2006                         struct netlink_skb_parms *req)
2007 {
2008         struct sk_buff *skb;
2009         int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
2010         u32 pid = current->pid;
2011         u32 seq = 0;
2012
2013         if (req)
2014                 pid = req->pid;
2015         if (nlh)
2016                 seq = nlh->nlmsg_seq;
2017         
2018         skb = alloc_skb(size, gfp_any());
2019         if (!skb) {
2020                 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
2021                 return;
2022         }
2023         if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
2024                 kfree_skb(skb);
2025                 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
2026                 return;
2027         }
2028         NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
2029         netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
2030 }
2031
2032 /*
2033  *      /proc
2034  */
2035
2036 #ifdef CONFIG_PROC_FS
2037
/*
 *	Size in bytes that rt6_proc_info() assumes for one route record when
 *	converting a file offset into a number of records to skip.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/*
 *	State shared between rt6_proc_info() and rt6_info_route() while the
 *	routing table is rendered as text.
 */
struct rt6_proc_arg
{
	char *buffer;	/* output buffer */
	int offset;	/* requested read offset */
	int length;	/* requested read length */
	int skip;	/* records skipped so far */
	int len;	/* bytes written to buffer so far */
};
2048
2049 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2050 {
2051         struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2052         int i;
2053
2054         if (arg->skip < arg->offset / RT6_INFO_LEN) {
2055                 arg->skip++;
2056                 return 0;
2057         }
2058
2059         if (arg->len >= arg->length)
2060                 return 0;
2061
2062         for (i=0; i<16; i++) {
2063                 sprintf(arg->buffer + arg->len, "%02x",
2064                         rt->rt6i_dst.addr.s6_addr[i]);
2065                 arg->len += 2;
2066         }
2067         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2068                             rt->rt6i_dst.plen);
2069
2070 #ifdef CONFIG_IPV6_SUBTREES
2071         for (i=0; i<16; i++) {
2072                 sprintf(arg->buffer + arg->len, "%02x",
2073                         rt->rt6i_src.addr.s6_addr[i]);
2074                 arg->len += 2;
2075         }
2076         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2077                             rt->rt6i_src.plen);
2078 #else
2079         sprintf(arg->buffer + arg->len,
2080                 "00000000000000000000000000000000 00 ");
2081         arg->len += 36;
2082 #endif
2083
2084         if (rt->rt6i_nexthop) {
2085                 for (i=0; i<16; i++) {
2086                         sprintf(arg->buffer + arg->len, "%02x",
2087                                 rt->rt6i_nexthop->primary_key[i]);
2088                         arg->len += 2;
2089                 }
2090         } else {
2091                 sprintf(arg->buffer + arg->len,
2092                         "00000000000000000000000000000000");
2093                 arg->len += 32;
2094         }
2095         arg->len += sprintf(arg->buffer + arg->len,
2096                             " %08x %08x %08x %08x %8s\n",
2097                             rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2098                             rt->u.dst.__use, rt->rt6i_flags, 
2099                             rt->rt6i_dev ? rt->rt6i_dev->name : "");
2100         return 0;
2101 }
2102
2103 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2104 {
2105         struct rt6_proc_arg arg;
2106         arg.buffer = buffer;
2107         arg.offset = offset;
2108         arg.length = length;
2109         arg.skip = 0;
2110         arg.len = 0;
2111
2112         read_lock_bh(&rt6_lock);
2113         fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
2114         read_unlock_bh(&rt6_lock);
2115
2116         *start = buffer;
2117         if (offset)
2118                 *start += offset % RT6_INFO_LEN;
2119
2120         arg.len -= offset % RT6_INFO_LEN;
2121
2122         if (arg.len > length)
2123                 arg.len = length;
2124         if (arg.len < 0)
2125                 arg.len = 0;
2126
2127         return arg.len;
2128 }
2129
2130 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2131 {
2132         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2133                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2134                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2135                       rt6_stats.fib_rt_cache,
2136                       atomic_read(&ip6_dst_ops.entries),
2137                       rt6_stats.fib_discarded_routes);
2138
2139         return 0;
2140 }
2141
2142 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2143 {
2144         return single_open(file, rt6_stats_seq_show, NULL);
2145 }
2146
2147 static struct file_operations rt6_stats_seq_fops = {
2148         .owner   = THIS_MODULE,
2149         .open    = rt6_stats_seq_open,
2150         .read    = seq_read,
2151         .llseek  = seq_lseek,
2152         .release = single_release,
2153 };
2154 #endif  /* CONFIG_PROC_FS */
2155
2156 #ifdef CONFIG_SYSCTL
2157
2158 static int flush_delay;
2159
2160 static
2161 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2162                               void __user *buffer, size_t *lenp, loff_t *ppos)
2163 {
2164         if (write) {
2165                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2166                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2167                 return 0;
2168         } else
2169                 return -EINVAL;
2170 }
2171
2172 ctl_table ipv6_route_table[] = {
2173         {
2174                 .ctl_name       =       NET_IPV6_ROUTE_FLUSH, 
2175                 .procname       =       "flush",
2176                 .data           =       &flush_delay,
2177                 .maxlen         =       sizeof(int),
2178                 .mode           =       0200,
2179                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2180         },
2181         {
2182                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2183                 .procname       =       "gc_thresh",
2184                 .data           =       &ip6_dst_ops.gc_thresh,
2185                 .maxlen         =       sizeof(int),
2186                 .mode           =       0644,
2187                 .proc_handler   =       &proc_dointvec,
2188         },
2189         {
2190                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2191                 .procname       =       "max_size",
2192                 .data           =       &ip6_rt_max_size,
2193                 .maxlen         =       sizeof(int),
2194                 .mode           =       0644,
2195                 .proc_handler   =       &proc_dointvec,
2196         },
2197         {
2198                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2199                 .procname       =       "gc_min_interval",
2200                 .data           =       &ip6_rt_gc_min_interval,
2201                 .maxlen         =       sizeof(int),
2202                 .mode           =       0644,
2203                 .proc_handler   =       &proc_dointvec_jiffies,
2204                 .strategy       =       &sysctl_jiffies,
2205         },
2206         {
2207                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2208                 .procname       =       "gc_timeout",
2209                 .data           =       &ip6_rt_gc_timeout,
2210                 .maxlen         =       sizeof(int),
2211                 .mode           =       0644,
2212                 .proc_handler   =       &proc_dointvec_jiffies,
2213                 .strategy       =       &sysctl_jiffies,
2214         },
2215         {
2216                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2217                 .procname       =       "gc_interval",
2218                 .data           =       &ip6_rt_gc_interval,
2219                 .maxlen         =       sizeof(int),
2220                 .mode           =       0644,
2221                 .proc_handler   =       &proc_dointvec_jiffies,
2222                 .strategy       =       &sysctl_jiffies,
2223         },
2224         {
2225                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2226                 .procname       =       "gc_elasticity",
2227                 .data           =       &ip6_rt_gc_elasticity,
2228                 .maxlen         =       sizeof(int),
2229                 .mode           =       0644,
2230                 .proc_handler   =       &proc_dointvec_jiffies,
2231                 .strategy       =       &sysctl_jiffies,
2232         },
2233         {
2234                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2235                 .procname       =       "mtu_expires",
2236                 .data           =       &ip6_rt_mtu_expires,
2237                 .maxlen         =       sizeof(int),
2238                 .mode           =       0644,
2239                 .proc_handler   =       &proc_dointvec_jiffies,
2240                 .strategy       =       &sysctl_jiffies,
2241         },
2242         {
2243                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2244                 .procname       =       "min_adv_mss",
2245                 .data           =       &ip6_rt_min_advmss,
2246                 .maxlen         =       sizeof(int),
2247                 .mode           =       0644,
2248                 .proc_handler   =       &proc_dointvec_jiffies,
2249                 .strategy       =       &sysctl_jiffies,
2250         },
2251         {
2252                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2253                 .procname       =       "gc_min_interval_ms",
2254                 .data           =       &ip6_rt_gc_min_interval,
2255                 .maxlen         =       sizeof(int),
2256                 .mode           =       0644,
2257                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2258                 .strategy       =       &sysctl_ms_jiffies,
2259         },
2260         { .ctl_name = 0 }
2261 };
2262
2263 #endif
2264
2265 void __init ip6_route_init(void)
2266 {
2267         struct proc_dir_entry *p;
2268
2269         ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2270                                                      sizeof(struct rt6_info),
2271                                                      0, SLAB_HWCACHE_ALIGN,
2272                                                      NULL, NULL);
2273         if (!ip6_dst_ops.kmem_cachep)
2274                 panic("cannot create ip6_dst_cache");
2275
2276         fib6_init();
2277 #ifdef  CONFIG_PROC_FS
2278         p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2279         if (p)
2280                 p->owner = THIS_MODULE;
2281
2282         proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2283 #endif
2284 #ifdef CONFIG_XFRM
2285         xfrm6_init();
2286 #endif
2287 }
2288
2289 void ip6_route_cleanup(void)
2290 {
2291 #ifdef CONFIG_PROC_FS
2292         proc_net_remove("ipv6_route");
2293         proc_net_remove("rt6_stats");
2294 #endif
2295 #ifdef CONFIG_XFRM
2296         xfrm6_fini();
2297 #endif
2298         rt6_ifdown(NULL);
2299         fib6_gc_cleanup();
2300         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2301 }