]> bbs.cooldavid.org Git - net-next-2.6.git/blob - net/ipv6/route.c
[IPv6] route: Simplify ip6_ins_rt()
[net-next-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/init.h>
38 #include <linux/if_arp.h>
39
40 #ifdef  CONFIG_PROC_FS
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #endif
44
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
64 /* Set to 3 to get tracing. */
65 #define RT6_DEBUG 2
66
67 #if RT6_DEBUG >= 3
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else
71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif
74
75 #define CLONE_OFFLINK_ROUTE 0
76
77 #define RT6_SELECT_F_IFACE      0x1
78 #define RT6_SELECT_F_REACHABLE  0x2
79
80 static int ip6_rt_max_size = 4096;
81 static int ip6_rt_gc_min_interval = HZ / 2;
82 static int ip6_rt_gc_timeout = 60*HZ;
83 int ip6_rt_gc_interval = 30*HZ;
84 static int ip6_rt_gc_elasticity = 9;
85 static int ip6_rt_mtu_expires = 10*60*HZ;
86 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87
88 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(void);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct sk_buff *skb);
98 static void             ip6_link_failure(struct sk_buff *skb);
99 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103                                            struct in6_addr *gwaddr, int ifindex,
104                                            unsigned pref);
105 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106                                            struct in6_addr *gwaddr, int ifindex);
107 #endif
108
109 static struct dst_ops ip6_dst_ops = {
110         .family                 =       AF_INET6,
111         .protocol               =       __constant_htons(ETH_P_IPV6),
112         .gc                     =       ip6_dst_gc,
113         .gc_thresh              =       1024,
114         .check                  =       ip6_dst_check,
115         .destroy                =       ip6_dst_destroy,
116         .ifdown                 =       ip6_dst_ifdown,
117         .negative_advice        =       ip6_negative_advice,
118         .link_failure           =       ip6_link_failure,
119         .update_pmtu            =       ip6_rt_update_pmtu,
120         .entry_size             =       sizeof(struct rt6_info),
121 };
122
123 struct rt6_info ip6_null_entry = {
124         .u = {
125                 .dst = {
126                         .__refcnt       = ATOMIC_INIT(1),
127                         .__use          = 1,
128                         .dev            = &loopback_dev,
129                         .obsolete       = -1,
130                         .error          = -ENETUNREACH,
131                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
132                         .input          = ip6_pkt_discard,
133                         .output         = ip6_pkt_discard_out,
134                         .ops            = &ip6_dst_ops,
135                         .path           = (struct dst_entry*)&ip6_null_entry,
136                 }
137         },
138         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
139         .rt6i_metric    = ~(u32) 0,
140         .rt6i_ref       = ATOMIC_INIT(1),
141 };
142
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/*
 * Static reject route whose dst error is -EACCES.  Apart from the error
 * code and the self-referencing .path it is initialised exactly like
 * ip6_null_entry.
 */
struct rt6_info ip6_prohibit_entry = {
	.u = {
		.dst = {
			.ops		= &ip6_dst_ops,
			.dev		= &loopback_dev,
			.path		= (struct dst_entry*)&ip6_prohibit_entry,
			.input		= ip6_pkt_discard,
			.output		= ip6_pkt_discard_out,
			.error		= -EACCES,
			.obsolete	= -1,
			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
			.__refcnt	= ATOMIC_INIT(1),
			.__use		= 1,
		}
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

/*
 * Static reject route whose dst error is -EINVAL; otherwise identical in
 * layout to ip6_prohibit_entry.
 */
struct rt6_info ip6_blk_hole_entry = {
	.u = {
		.dst = {
			.ops		= &ip6_dst_ops,
			.dev		= &loopback_dev,
			.path		= (struct dst_entry*)&ip6_blk_hole_entry,
			.input		= ip6_pkt_discard,
			.output		= ip6_pkt_discard_out,
			.error		= -EINVAL,
			.obsolete	= -1,
			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
			.__refcnt	= ATOMIC_INIT(1),
			.__use		= 1,
		}
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif
186
187 /* allocate dst with ip6_dst_ops */
188 static __inline__ struct rt6_info *ip6_dst_alloc(void)
189 {
190         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
191 }
192
193 static void ip6_dst_destroy(struct dst_entry *dst)
194 {
195         struct rt6_info *rt = (struct rt6_info *)dst;
196         struct inet6_dev *idev = rt->rt6i_idev;
197
198         if (idev != NULL) {
199                 rt->rt6i_idev = NULL;
200                 in6_dev_put(idev);
201         }       
202 }
203
204 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
205                            int how)
206 {
207         struct rt6_info *rt = (struct rt6_info *)dst;
208         struct inet6_dev *idev = rt->rt6i_idev;
209
210         if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
211                 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
212                 if (loopback_idev != NULL) {
213                         rt->rt6i_idev = loopback_idev;
214                         in6_dev_put(idev);
215                 }
216         }
217 }
218
219 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
220 {
221         return (rt->rt6i_flags & RTF_EXPIRES &&
222                 time_after(jiffies, rt->rt6i_expires));
223 }
224
225 static inline int rt6_need_strict(struct in6_addr *daddr)
226 {
227         return (ipv6_addr_type(daddr) &
228                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
229 }
230
231 /*
232  *      Route lookup. Any table->tb6_lock is implied.
233  */
234
235 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
236                                                     int oif,
237                                                     int strict)
238 {
239         struct rt6_info *local = NULL;
240         struct rt6_info *sprt;
241
242         if (oif) {
243                 for (sprt = rt; sprt; sprt = sprt->u.next) {
244                         struct net_device *dev = sprt->rt6i_dev;
245                         if (dev->ifindex == oif)
246                                 return sprt;
247                         if (dev->flags & IFF_LOOPBACK) {
248                                 if (sprt->rt6i_idev == NULL ||
249                                     sprt->rt6i_idev->dev->ifindex != oif) {
250                                         if (strict && oif)
251                                                 continue;
252                                         if (local && (!oif || 
253                                                       local->rt6i_idev->dev->ifindex == oif))
254                                                 continue;
255                                 }
256                                 local = sprt;
257                         }
258                 }
259
260                 if (local)
261                         return local;
262
263                 if (strict)
264                         return &ip6_null_entry;
265         }
266         return rt;
267 }
268
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the next hop of @rt is not in a
 * NUD_VALID state and the last update is older than the interface's
 * rtr_probe_interval, send a neighbour solicitation to the solicited-node
 * multicast address of the next hop.  A NULL @rt, or a route without a
 * next-hop neighbour, is ignored.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one per
 * minute; neigh->updated is refreshed here to enforce the interval.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;
	int due;

	/* Cheap unlocked check first. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	read_lock_bh(&neigh->lock);
	due = !(neigh->nud_state & NUD_VALID) &&
	      time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
	if (due)
		neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	if (!due)
		return;

	/* The solicitation itself is sent without holding the lock. */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
304
305 /*
306  * Default Router Selection (RFC 2461 6.3.6)
307  */
308 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
309 {
310         struct net_device *dev = rt->rt6i_dev;
311         if (!oif || dev->ifindex == oif)
312                 return 2;
313         if ((dev->flags & IFF_LOOPBACK) &&
314             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
315                 return 1;
316         return 0;
317 }
318
319 static int inline rt6_check_neigh(struct rt6_info *rt)
320 {
321         struct neighbour *neigh = rt->rt6i_nexthop;
322         int m = 0;
323         if (rt->rt6i_flags & RTF_NONEXTHOP ||
324             !(rt->rt6i_flags & RTF_GATEWAY))
325                 m = 1;
326         else if (neigh) {
327                 read_lock_bh(&neigh->lock);
328                 if (neigh->nud_state & NUD_VALID)
329                         m = 2;
330                 read_unlock_bh(&neigh->lock);
331         }
332         return m;
333 }
334
335 static int rt6_score_route(struct rt6_info *rt, int oif,
336                            int strict)
337 {
338         int m, n;
339                 
340         m = rt6_check_dev(rt, oif);
341         if (!m && (strict & RT6_SELECT_F_IFACE))
342                 return -1;
343 #ifdef CONFIG_IPV6_ROUTER_PREF
344         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
345 #endif
346         n = rt6_check_neigh(rt);
347         if (n > 1)
348                 m |= 16;
349         else if (!n && strict & RT6_SELECT_F_REACHABLE)
350                 return -1;
351         return m;
352 }
353
354 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
355                                    int strict)
356 {
357         struct rt6_info *match = NULL, *last = NULL;
358         struct rt6_info *rt, *rt0 = *head;
359         u32 metric;
360         int mpri = -1;
361
362         RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
363                   __FUNCTION__, head, head ? *head : NULL, oif);
364
365         for (rt = rt0, metric = rt0->rt6i_metric;
366              rt && rt->rt6i_metric == metric && (!last || rt != rt0);
367              rt = rt->u.next) {
368                 int m;
369
370                 if (rt6_check_expired(rt))
371                         continue;
372
373                 last = rt;
374
375                 m = rt6_score_route(rt, oif, strict);
376                 if (m < 0)
377                         continue;
378
379                 if (m > mpri) {
380                         rt6_probe(match);
381                         match = rt;
382                         mpri = m;
383                 } else {
384                         rt6_probe(rt);
385                 }
386         }
387
388         if (!match &&
389             (strict & RT6_SELECT_F_REACHABLE) &&
390             last && last != rt0) {
391                 /* no entries matched; do round-robin */
392                 static DEFINE_SPINLOCK(lock);
393                 spin_lock(&lock);
394                 *head = rt0->u.next;
395                 rt0->u.next = last->u.next;
396                 last->u.next = rt0;
397                 spin_unlock(&lock);
398         }
399
400         RT6_TRACE("%s() => %p, score=%d\n",
401                   __FUNCTION__, match, mpri);
402
403         return (match ? match : &ip6_null_entry);
404 }
405
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process one Route Information option.
 *
 * @dev:    device the option arrived on (its ifindex keys the route)
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address to use for the route
 *
 * Depending on the advertised lifetime the matching route is deleted
 * (lifetime 0), created, or has its preference and expiry refreshed.
 *
 * Returns 0 on success and -EINVAL when the option fails the sanity
 * checks below.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	u32 lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/*
	 * Sanity check for prefix_len and length.
	 *
	 * NOTE(review): RFC 4191 section 2.3 appears to require Length 3
	 * for a Prefix Length above 64 and Length >= 2 for a non-zero
	 * Prefix Length; the checks here are laxer -- confirm whether they
	 * should be tightened.
	 */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		pref = ICMPV6_ROUTER_PREF_MEDIUM;

	/*
	 * The lifetime arrives in network byte order, so the conversion
	 * wanted here is network-to-host (ntohl), not host-to-network.
	 */
	lifetime = ntohl(rinfo->lifetime);
	if (lifetime == 0xffffffff) {
		/* infinity */
	} else if (lifetime > 0x7fffffff/HZ) {
		/* Avoid arithmetic overflow */
		lifetime = 0x7fffffff/HZ - 1;
	}

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

	/* A zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (lifetime == 0xffffffff) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		/* presumably drops the reference taken by get/add above */
		dst_release(&rt->u.dst);
	}
	return 0;
}
#endif
483
/*
 * BACKTRACK() - climb towards the root after a strict lookup found nothing.
 *
 * Expects the enclosing function to provide the variables "rt", "fn" and
 * "flags" and the labels "restart" and "out".  When rt is &ip6_null_entry
 * and RT6_F_STRICT is set, the parent chain of fn is walked:
 *   - at a node flagged RTN_TL_ROOT a reference on rt is taken and control
 *     jumps to "out" (so the code at "out" must not take that reference
 *     again on this path);
 *   - at a node flagged RTN_RTINFO control jumps to "restart" with fn
 *     pointing at that node.
 * If the chain ends without either, execution continues after the macro
 * with fn == NULL.
 *
 * Wrapped in do { } while (0) so that "BACKTRACK();" is a single statement
 * and cannot capture a following "else".
 */
#define BACKTRACK() \
do { \
	if (rt == &ip6_null_entry && (flags & RT6_F_STRICT)) { \
		while ((fn = fn->parent) != NULL) { \
			if (fn->fn_flags & RTN_TL_ROOT) { \
				dst_hold(&rt->u.dst); \
				goto out; \
			} \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
495
496 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
497                                              struct flowi *fl, int flags)
498 {
499         struct fib6_node *fn;
500         struct rt6_info *rt;
501
502         read_lock_bh(&table->tb6_lock);
503         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
504 restart:
505         rt = fn->leaf;
506         rt = rt6_device_match(rt, fl->oif, flags & RT6_F_STRICT);
507         BACKTRACK();
508         dst_hold(&rt->u.dst);
509 out:
510         read_unlock_bh(&table->tb6_lock);
511
512         rt->u.dst.lastuse = jiffies;
513         rt->u.dst.__use++;
514
515         return rt;
516
517 }
518
519 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
520                             int oif, int strict)
521 {
522         struct flowi fl = {
523                 .oif = oif,
524                 .nl_u = {
525                         .ip6_u = {
526                                 .daddr = *daddr,
527                                 /* TODO: saddr */
528                         },
529                 },
530         };
531         struct dst_entry *dst;
532         int flags = strict ? RT6_F_STRICT : 0;
533
534         dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
535         if (dst->error == 0)
536                 return (struct rt6_info *) dst;
537
538         dst_release(dst);
539
540         return NULL;
541 }
542
543 /* ip6_ins_rt is called with FREE table->tb6_lock.
544    It takes new route entry, the addition fails by any reason the
545    route is freed. In any case, if caller does not hold it, it may
546    be destroyed.
547  */
548
549 static int __ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
550                         void *_rtattr, struct netlink_skb_parms *req)
551 {
552         int err;
553         struct fib6_table *table;
554
555         table = rt->rt6i_table;
556         write_lock_bh(&table->tb6_lock);
557         err = fib6_add(&table->tb6_root, rt, nlh, _rtattr, req);
558         write_unlock_bh(&table->tb6_lock);
559
560         return err;
561 }
562
563 int ip6_ins_rt(struct rt6_info *rt)
564 {
565         return __ip6_ins_rt(rt, NULL, NULL, NULL);
566 }
567
568 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
569                                       struct in6_addr *saddr)
570 {
571         struct rt6_info *rt;
572
573         /*
574          *      Clone the route.
575          */
576
577         rt = ip6_rt_copy(ort);
578
579         if (rt) {
580                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
581                         if (rt->rt6i_dst.plen != 128 &&
582                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
583                                 rt->rt6i_flags |= RTF_ANYCAST;
584                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
585                 }
586
587                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
588                 rt->rt6i_dst.plen = 128;
589                 rt->rt6i_flags |= RTF_CACHE;
590                 rt->u.dst.flags |= DST_HOST;
591
592 #ifdef CONFIG_IPV6_SUBTREES
593                 if (rt->rt6i_src.plen && saddr) {
594                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
595                         rt->rt6i_src.plen = 128;
596                 }
597 #endif
598
599                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
600
601         }
602
603         return rt;
604 }
605
606 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
607 {
608         struct rt6_info *rt = ip6_rt_copy(ort);
609         if (rt) {
610                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
611                 rt->rt6i_dst.plen = 128;
612                 rt->rt6i_flags |= RTF_CACHE;
613                 if (rt->rt6i_flags & RTF_REJECT)
614                         rt->u.dst.error = ort->u.dst.error;
615                 rt->u.dst.flags |= DST_HOST;
616                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
617         }
618         return rt;
619 }
620
621 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
622                                             struct flowi *fl, int flags)
623 {
624         struct fib6_node *fn;
625         struct rt6_info *rt, *nrt;
626         int strict = 0;
627         int attempts = 3;
628         int err;
629         int reachable = RT6_SELECT_F_REACHABLE;
630
631         if (flags & RT6_F_STRICT)
632                 strict = RT6_SELECT_F_IFACE;
633
634 relookup:
635         read_lock_bh(&table->tb6_lock);
636
637 restart_2:
638         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
639
640 restart:
641         rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
642         BACKTRACK();
643         if (rt == &ip6_null_entry ||
644             rt->rt6i_flags & RTF_CACHE)
645                 goto out;
646
647         dst_hold(&rt->u.dst);
648         read_unlock_bh(&table->tb6_lock);
649
650         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
651                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
652         else {
653 #if CLONE_OFFLINK_ROUTE
654                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
655 #else
656                 goto out2;
657 #endif
658         }
659
660         dst_release(&rt->u.dst);
661         rt = nrt ? : &ip6_null_entry;
662
663         dst_hold(&rt->u.dst);
664         if (nrt) {
665                 err = ip6_ins_rt(nrt);
666                 if (!err)
667                         goto out2;
668         }
669
670         if (--attempts <= 0)
671                 goto out2;
672
673         /*
674          * Race condition! In the gap, when table->tb6_lock was
675          * released someone could insert this route.  Relookup.
676          */
677         dst_release(&rt->u.dst);
678         goto relookup;
679
680 out:
681         if (reachable) {
682                 reachable = 0;
683                 goto restart_2;
684         }
685         dst_hold(&rt->u.dst);
686         read_unlock_bh(&table->tb6_lock);
687 out2:
688         rt->u.dst.lastuse = jiffies;
689         rt->u.dst.__use++;
690
691         return rt;
692 }
693
694 void ip6_route_input(struct sk_buff *skb)
695 {
696         struct ipv6hdr *iph = skb->nh.ipv6h;
697         struct flowi fl = {
698                 .iif = skb->dev->ifindex,
699                 .nl_u = {
700                         .ip6_u = {
701                                 .daddr = iph->daddr,
702                                 .saddr = iph->saddr,
703                                 .flowlabel = (* (u32 *) iph)&IPV6_FLOWINFO_MASK,
704                         },
705                 },
706                 .proto = iph->nexthdr,
707         };
708         int flags = 0;
709
710         if (rt6_need_strict(&iph->daddr))
711                 flags |= RT6_F_STRICT;
712
713         skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
714 }
715
716 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
717                                              struct flowi *fl, int flags)
718 {
719         struct fib6_node *fn;
720         struct rt6_info *rt, *nrt;
721         int strict = 0;
722         int attempts = 3;
723         int err;
724         int reachable = RT6_SELECT_F_REACHABLE;
725
726         if (flags & RT6_F_STRICT)
727                 strict = RT6_SELECT_F_IFACE;
728
729 relookup:
730         read_lock_bh(&table->tb6_lock);
731
732 restart_2:
733         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
734
735 restart:
736         rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
737         BACKTRACK();
738         if (rt == &ip6_null_entry ||
739             rt->rt6i_flags & RTF_CACHE)
740                 goto out;
741
742         dst_hold(&rt->u.dst);
743         read_unlock_bh(&table->tb6_lock);
744
745         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
746                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
747         else {
748 #if CLONE_OFFLINK_ROUTE
749                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
750 #else
751                 goto out2;
752 #endif
753         }
754
755         dst_release(&rt->u.dst);
756         rt = nrt ? : &ip6_null_entry;
757
758         dst_hold(&rt->u.dst);
759         if (nrt) {
760                 err = ip6_ins_rt(nrt);
761                 if (!err)
762                         goto out2;
763         }
764
765         if (--attempts <= 0)
766                 goto out2;
767
768         /*
769          * Race condition! In the gap, when table->tb6_lock was
770          * released someone could insert this route.  Relookup.
771          */
772         dst_release(&rt->u.dst);
773         goto relookup;
774
775 out:
776         if (reachable) {
777                 reachable = 0;
778                 goto restart_2;
779         }
780         dst_hold(&rt->u.dst);
781         read_unlock_bh(&table->tb6_lock);
782 out2:
783         rt->u.dst.lastuse = jiffies;
784         rt->u.dst.__use++;
785         return rt;
786 }
787
788 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
789 {
790         int flags = 0;
791
792         if (rt6_need_strict(&fl->fl6_dst))
793                 flags |= RT6_F_STRICT;
794
795         return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
796 }
797
798
799 /*
800  *      Destination cache support functions
801  */
802
803 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
804 {
805         struct rt6_info *rt;
806
807         rt = (struct rt6_info *) dst;
808
809         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
810                 return dst;
811
812         return NULL;
813 }
814
815 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
816 {
817         struct rt6_info *rt = (struct rt6_info *) dst;
818
819         if (rt) {
820                 if (rt->rt6i_flags & RTF_CACHE)
821                         ip6_del_rt(rt);
822                 else
823                         dst_release(dst);
824         }
825         return NULL;
826 }
827
828 static void ip6_link_failure(struct sk_buff *skb)
829 {
830         struct rt6_info *rt;
831
832         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
833
834         rt = (struct rt6_info *) skb->dst;
835         if (rt) {
836                 if (rt->rt6i_flags&RTF_CACHE) {
837                         dst_set_expires(&rt->u.dst, 0);
838                         rt->rt6i_flags |= RTF_EXPIRES;
839                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
840                         rt->rt6i_node->fn_sernum = -1;
841         }
842 }
843
844 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
845 {
846         struct rt6_info *rt6 = (struct rt6_info*)dst;
847
848         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
849                 rt6->rt6i_flags |= RTF_MODIFIED;
850                 if (mtu < IPV6_MIN_MTU) {
851                         mtu = IPV6_MIN_MTU;
852                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
853                 }
854                 dst->metrics[RTAX_MTU-1] = mtu;
855                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
856         }
857 }
858
859 static int ipv6_get_mtu(struct net_device *dev);
860
861 static inline unsigned int ipv6_advmss(unsigned int mtu)
862 {
863         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
864
865         if (mtu < ip6_rt_min_advmss)
866                 mtu = ip6_rt_min_advmss;
867
868         /*
869          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 
870          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
871          * IPV6_MAXPLEN is also valid and means: "any MSS, 
872          * rely only on pmtu discovery"
873          */
874         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
875                 mtu = IPV6_MAXPLEN;
876         return mtu;
877 }
878
879 static struct dst_entry *ndisc_dst_gc_list;
880 static DEFINE_SPINLOCK(ndisc_lock);
881
882 struct dst_entry *ndisc_dst_alloc(struct net_device *dev, 
883                                   struct neighbour *neigh,
884                                   struct in6_addr *addr,
885                                   int (*output)(struct sk_buff *))
886 {
887         struct rt6_info *rt;
888         struct inet6_dev *idev = in6_dev_get(dev);
889
890         if (unlikely(idev == NULL))
891                 return NULL;
892
893         rt = ip6_dst_alloc();
894         if (unlikely(rt == NULL)) {
895                 in6_dev_put(idev);
896                 goto out;
897         }
898
899         dev_hold(dev);
900         if (neigh)
901                 neigh_hold(neigh);
902         else
903                 neigh = ndisc_get_neigh(dev, addr);
904
905         rt->rt6i_dev      = dev;
906         rt->rt6i_idev     = idev;
907         rt->rt6i_nexthop  = neigh;
908         atomic_set(&rt->u.dst.__refcnt, 1);
909         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
910         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
911         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
912         rt->u.dst.output  = output;
913
914 #if 0   /* there's no chance to use these for ndisc */
915         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST 
916                                 ? DST_HOST 
917                                 : 0;
918         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
919         rt->rt6i_dst.plen = 128;
920 #endif
921
922         spin_lock_bh(&ndisc_lock);
923         rt->u.dst.next = ndisc_dst_gc_list;
924         ndisc_dst_gc_list = &rt->u.dst;
925         spin_unlock_bh(&ndisc_lock);
926
927         fib6_force_start_gc();
928
929 out:
930         return (struct dst_entry *)rt;
931 }
932
933 int ndisc_dst_gc(int *more)
934 {
935         struct dst_entry *dst, *next, **pprev;
936         int freed;
937
938         next = NULL;
939         freed = 0;
940
941         spin_lock_bh(&ndisc_lock);
942         pprev = &ndisc_dst_gc_list;
943
944         while ((dst = *pprev) != NULL) {
945                 if (!atomic_read(&dst->__refcnt)) {
946                         *pprev = dst->next;
947                         dst_free(dst);
948                         freed++;
949                 } else {
950                         pprev = &dst->next;
951                         (*more)++;
952                 }
953         }
954
955         spin_unlock_bh(&ndisc_lock);
956
957         return freed;
958 }
959
960 static int ip6_dst_gc(void)
961 {
962         static unsigned expire = 30*HZ;
963         static unsigned long last_gc;
964         unsigned long now = jiffies;
965
966         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
967             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
968                 goto out;
969
970         expire++;
971         fib6_run_gc(expire);
972         last_gc = now;
973         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
974                 expire = ip6_rt_gc_timeout>>1;
975
976 out:
977         expire -= expire>>ip6_rt_gc_elasticity;
978         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
979 }
980
981 /* Clean host part of a prefix. Not necessary in radix tree,
982    but results in cleaner routing tables.
983
984    Remove it only when all the things will work!
985  */
986
987 static int ipv6_get_mtu(struct net_device *dev)
988 {
989         int mtu = IPV6_MIN_MTU;
990         struct inet6_dev *idev;
991
992         idev = in6_dev_get(dev);
993         if (idev) {
994                 mtu = idev->cnf.mtu6;
995                 in6_dev_put(idev);
996         }
997         return mtu;
998 }
999
1000 int ipv6_get_hoplimit(struct net_device *dev)
1001 {
1002         int hoplimit = ipv6_devconf.hop_limit;
1003         struct inet6_dev *idev;
1004
1005         idev = in6_dev_get(dev);
1006         if (idev) {
1007                 hoplimit = idev->cnf.hop_limit;
1008                 in6_dev_put(idev);
1009         }
1010         return hoplimit;
1011 }
1012
1013 /*
1014  *
1015  */
1016
1017 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, 
1018                   void *_rtattr, struct netlink_skb_parms *req,
1019                   u32 table_id)
1020 {
1021         int err;
1022         struct rtmsg *r;
1023         struct rtattr **rta;
1024         struct rt6_info *rt = NULL;
1025         struct net_device *dev = NULL;
1026         struct inet6_dev *idev = NULL;
1027         struct fib6_table *table;
1028         int addr_type;
1029
1030         rta = (struct rtattr **) _rtattr;
1031
1032         if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
1033                 return -EINVAL;
1034 #ifndef CONFIG_IPV6_SUBTREES
1035         if (rtmsg->rtmsg_src_len)
1036                 return -EINVAL;
1037 #endif
1038         if (rtmsg->rtmsg_ifindex) {
1039                 err = -ENODEV;
1040                 dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
1041                 if (!dev)
1042                         goto out;
1043                 idev = in6_dev_get(dev);
1044                 if (!idev)
1045                         goto out;
1046         }
1047
1048         if (rtmsg->rtmsg_metric == 0)
1049                 rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
1050
1051         table = fib6_new_table(table_id);
1052         if (table == NULL) {
1053                 err = -ENOBUFS;
1054                 goto out;
1055         }
1056
1057         rt = ip6_dst_alloc();
1058
1059         if (rt == NULL) {
1060                 err = -ENOMEM;
1061                 goto out;
1062         }
1063
1064         rt->u.dst.obsolete = -1;
1065         rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
1066         if (nlh && (r = NLMSG_DATA(nlh))) {
1067                 rt->rt6i_protocol = r->rtm_protocol;
1068         } else {
1069                 rt->rt6i_protocol = RTPROT_BOOT;
1070         }
1071
1072         addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
1073
1074         if (addr_type & IPV6_ADDR_MULTICAST)
1075                 rt->u.dst.input = ip6_mc_input;
1076         else
1077                 rt->u.dst.input = ip6_forward;
1078
1079         rt->u.dst.output = ip6_output;
1080
1081         ipv6_addr_prefix(&rt->rt6i_dst.addr, 
1082                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
1083         rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
1084         if (rt->rt6i_dst.plen == 128)
1085                rt->u.dst.flags = DST_HOST;
1086
1087 #ifdef CONFIG_IPV6_SUBTREES
1088         ipv6_addr_prefix(&rt->rt6i_src.addr, 
1089                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1090         rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
1091 #endif
1092
1093         rt->rt6i_metric = rtmsg->rtmsg_metric;
1094
1095         /* We cannot add true routes via loopback here,
1096            they would result in kernel looping; promote them to reject routes
1097          */
1098         if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
1099             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1100                 /* hold loopback dev/idev if we haven't done so. */
1101                 if (dev != &loopback_dev) {
1102                         if (dev) {
1103                                 dev_put(dev);
1104                                 in6_dev_put(idev);
1105                         }
1106                         dev = &loopback_dev;
1107                         dev_hold(dev);
1108                         idev = in6_dev_get(dev);
1109                         if (!idev) {
1110                                 err = -ENODEV;
1111                                 goto out;
1112                         }
1113                 }
1114                 rt->u.dst.output = ip6_pkt_discard_out;
1115                 rt->u.dst.input = ip6_pkt_discard;
1116                 rt->u.dst.error = -ENETUNREACH;
1117                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1118                 goto install_route;
1119         }
1120
1121         if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
1122                 struct in6_addr *gw_addr;
1123                 int gwa_type;
1124
1125                 gw_addr = &rtmsg->rtmsg_gateway;
1126                 ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
1127                 gwa_type = ipv6_addr_type(gw_addr);
1128
1129                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1130                         struct rt6_info *grt;
1131
1132                         /* IPv6 strictly inhibits using not link-local
1133                            addresses as nexthop address.
1134                            Otherwise, router will not able to send redirects.
1135                            It is very good, but in some (rare!) circumstances
1136                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1137                            some exceptions. --ANK
1138                          */
1139                         err = -EINVAL;
1140                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1141                                 goto out;
1142
1143                         grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
1144
1145                         err = -EHOSTUNREACH;
1146                         if (grt == NULL)
1147                                 goto out;
1148                         if (dev) {
1149                                 if (dev != grt->rt6i_dev) {
1150                                         dst_release(&grt->u.dst);
1151                                         goto out;
1152                                 }
1153                         } else {
1154                                 dev = grt->rt6i_dev;
1155                                 idev = grt->rt6i_idev;
1156                                 dev_hold(dev);
1157                                 in6_dev_hold(grt->rt6i_idev);
1158                         }
1159                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1160                                 err = 0;
1161                         dst_release(&grt->u.dst);
1162
1163                         if (err)
1164                                 goto out;
1165                 }
1166                 err = -EINVAL;
1167                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1168                         goto out;
1169         }
1170
1171         err = -ENODEV;
1172         if (dev == NULL)
1173                 goto out;
1174
1175         if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
1176                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1177                 if (IS_ERR(rt->rt6i_nexthop)) {
1178                         err = PTR_ERR(rt->rt6i_nexthop);
1179                         rt->rt6i_nexthop = NULL;
1180                         goto out;
1181                 }
1182         }
1183
1184         rt->rt6i_flags = rtmsg->rtmsg_flags;
1185
1186 install_route:
1187         if (rta && rta[RTA_METRICS-1]) {
1188                 int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
1189                 struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
1190
1191                 while (RTA_OK(attr, attrlen)) {
1192                         unsigned flavor = attr->rta_type;
1193                         if (flavor) {
1194                                 if (flavor > RTAX_MAX) {
1195                                         err = -EINVAL;
1196                                         goto out;
1197                                 }
1198                                 rt->u.dst.metrics[flavor-1] =
1199                                         *(u32 *)RTA_DATA(attr);
1200                         }
1201                         attr = RTA_NEXT(attr, attrlen);
1202                 }
1203         }
1204
1205         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1206                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1207         if (!rt->u.dst.metrics[RTAX_MTU-1])
1208                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1209         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1210                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1211         rt->u.dst.dev = dev;
1212         rt->rt6i_idev = idev;
1213         rt->rt6i_table = table;
1214         return __ip6_ins_rt(rt, nlh, _rtattr, req);
1215
1216 out:
1217         if (dev)
1218                 dev_put(dev);
1219         if (idev)
1220                 in6_dev_put(idev);
1221         if (rt)
1222                 dst_free((struct dst_entry *) rt);
1223         return err;
1224 }
1225
1226 static int __ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
1227                         void *_rtattr, struct netlink_skb_parms *req)
1228 {
1229         int err;
1230         struct fib6_table *table;
1231
1232         if (rt == &ip6_null_entry)
1233                 return -ENOENT;
1234
1235         table = rt->rt6i_table;
1236         write_lock_bh(&table->tb6_lock);
1237
1238         err = fib6_del(rt, nlh, _rtattr, req);
1239         dst_release(&rt->u.dst);
1240
1241         write_unlock_bh(&table->tb6_lock);
1242
1243         return err;
1244 }
1245
1246 int ip6_del_rt(struct rt6_info *rt)
1247 {
1248         return __ip6_del_rt(rt, NULL, NULL, NULL);
1249 }
1250
1251 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
1252                          void *_rtattr, struct netlink_skb_parms *req,
1253                          u32 table_id)
1254 {
1255         struct fib6_table *table;
1256         struct fib6_node *fn;
1257         struct rt6_info *rt;
1258         int err = -ESRCH;
1259
1260         table = fib6_get_table(table_id);
1261         if (table == NULL)
1262                 return err;
1263
1264         read_lock_bh(&table->tb6_lock);
1265
1266         fn = fib6_locate(&table->tb6_root,
1267                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1268                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1269         
1270         if (fn) {
1271                 for (rt = fn->leaf; rt; rt = rt->u.next) {
1272                         if (rtmsg->rtmsg_ifindex &&
1273                             (rt->rt6i_dev == NULL ||
1274                              rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1275                                 continue;
1276                         if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1277                             !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1278                                 continue;
1279                         if (rtmsg->rtmsg_metric &&
1280                             rtmsg->rtmsg_metric != rt->rt6i_metric)
1281                                 continue;
1282                         dst_hold(&rt->u.dst);
1283                         read_unlock_bh(&table->tb6_lock);
1284
1285                         return __ip6_del_rt(rt, nlh, _rtattr, req);
1286                 }
1287         }
1288         read_unlock_bh(&table->tb6_lock);
1289
1290         return err;
1291 }
1292
1293 /*
1294  *      Handle redirects
1295  */
1296 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1297                   struct neighbour *neigh, u8 *lladdr, int on_link)
1298 {
1299         struct rt6_info *rt, *nrt = NULL;
1300         struct fib6_node *fn;
1301         struct fib6_table *table;
1302         struct netevent_redirect netevent;
1303
1304         /* TODO: Very lazy, might need to check all tables */
1305         table = fib6_get_table(RT6_TABLE_MAIN);
1306         if (table == NULL)
1307                 return;
1308
1309         /*
1310          * Get the "current" route for this destination and
1311          * check if the redirect has come from approriate router.
1312          *
1313          * RFC 2461 specifies that redirects should only be
1314          * accepted if they come from the nexthop to the target.
1315          * Due to the way the routes are chosen, this notion
1316          * is a bit fuzzy and one might need to check all possible
1317          * routes.
1318          */
1319
1320         read_lock_bh(&table->tb6_lock);
1321         fn = fib6_lookup(&table->tb6_root, dest, NULL);
1322 restart:
1323         for (rt = fn->leaf; rt; rt = rt->u.next) {
1324                 /*
1325                  * Current route is on-link; redirect is always invalid.
1326                  *
1327                  * Seems, previous statement is not true. It could
1328                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1329                  * But then router serving it might decide, that we should
1330                  * know truth 8)8) --ANK (980726).
1331                  */
1332                 if (rt6_check_expired(rt))
1333                         continue;
1334                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1335                         continue;
1336                 if (neigh->dev != rt->rt6i_dev)
1337                         continue;
1338                 if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway))
1339                         continue;
1340                 break;
1341         }
1342         if (rt)
1343                 dst_hold(&rt->u.dst);
1344         else if (rt6_need_strict(dest)) {
1345                 while ((fn = fn->parent) != NULL) {
1346                         if (fn->fn_flags & RTN_ROOT)
1347                                 break;
1348                         if (fn->fn_flags & RTN_RTINFO)
1349                                 goto restart;
1350                 }
1351         }
1352         read_unlock_bh(&table->tb6_lock);
1353
1354         if (!rt) {
1355                 if (net_ratelimit())
1356                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1357                                "for redirect target\n");
1358                 return;
1359         }
1360
1361         /*
1362          *      We have finally decided to accept it.
1363          */
1364
1365         neigh_update(neigh, lladdr, NUD_STALE, 
1366                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1367                      NEIGH_UPDATE_F_OVERRIDE|
1368                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1369                                      NEIGH_UPDATE_F_ISROUTER))
1370                      );
1371
1372         /*
1373          * Redirect received -> path was valid.
1374          * Look, redirects are sent only in response to data packets,
1375          * so that this nexthop apparently is reachable. --ANK
1376          */
1377         dst_confirm(&rt->u.dst);
1378
1379         /* Duplicate redirect: silently ignore. */
1380         if (neigh == rt->u.dst.neighbour)
1381                 goto out;
1382
1383         nrt = ip6_rt_copy(rt);
1384         if (nrt == NULL)
1385                 goto out;
1386
1387         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1388         if (on_link)
1389                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1390
1391         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1392         nrt->rt6i_dst.plen = 128;
1393         nrt->u.dst.flags |= DST_HOST;
1394
1395         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1396         nrt->rt6i_nexthop = neigh_clone(neigh);
1397         /* Reset pmtu, it may be better */
1398         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1399         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1400
1401         if (ip6_ins_rt(nrt))
1402                 goto out;
1403
1404         netevent.old = &rt->u.dst;
1405         netevent.new = &nrt->u.dst;
1406         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1407
1408         if (rt->rt6i_flags&RTF_CACHE) {
1409                 ip6_del_rt(rt);
1410                 return;
1411         }
1412
1413 out:
1414         dst_release(&rt->u.dst);
1415         return;
1416 }
1417
1418 /*
1419  *      Handle ICMP "packet too big" messages
1420  *      i.e. Path MTU discovery
1421  */
1422
1423 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1424                         struct net_device *dev, u32 pmtu)
1425 {
1426         struct rt6_info *rt, *nrt;
1427         int allfrag = 0;
1428
1429         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1430         if (rt == NULL)
1431                 return;
1432
1433         if (pmtu >= dst_mtu(&rt->u.dst))
1434                 goto out;
1435
1436         if (pmtu < IPV6_MIN_MTU) {
1437                 /*
1438                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link 
1439                  * MTU (1280) and a fragment header should always be included
1440                  * after a node receiving Too Big message reporting PMTU is
1441                  * less than the IPv6 Minimum Link MTU.
1442                  */
1443                 pmtu = IPV6_MIN_MTU;
1444                 allfrag = 1;
1445         }
1446
1447         /* New mtu received -> path was valid.
1448            They are sent only in response to data packets,
1449            so that this nexthop apparently is reachable. --ANK
1450          */
1451         dst_confirm(&rt->u.dst);
1452
1453         /* Host route. If it is static, it would be better
1454            not to override it, but add new one, so that
1455            when cache entry will expire old pmtu
1456            would return automatically.
1457          */
1458         if (rt->rt6i_flags & RTF_CACHE) {
1459                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1460                 if (allfrag)
1461                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1462                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1463                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1464                 goto out;
1465         }
1466
1467         /* Network route.
1468            Two cases are possible:
1469            1. It is connected route. Action: COW
1470            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1471          */
1472         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1473                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1474         else
1475                 nrt = rt6_alloc_clone(rt, daddr);
1476
1477         if (nrt) {
1478                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1479                 if (allfrag)
1480                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1481
1482                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1483                  * happened within 5 mins, the recommended timer is 10 mins.
1484                  * Here this route expiration time is set to ip6_rt_mtu_expires
1485                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1486                  * and detecting PMTU increase will be automatically happened.
1487                  */
1488                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1489                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1490
1491                 ip6_ins_rt(nrt);
1492         }
1493 out:
1494         dst_release(&rt->u.dst);
1495 }
1496
1497 /*
1498  *      Misc support functions
1499  */
1500
1501 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1502 {
1503         struct rt6_info *rt = ip6_dst_alloc();
1504
1505         if (rt) {
1506                 rt->u.dst.input = ort->u.dst.input;
1507                 rt->u.dst.output = ort->u.dst.output;
1508
1509                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1510                 rt->u.dst.dev = ort->u.dst.dev;
1511                 if (rt->u.dst.dev)
1512                         dev_hold(rt->u.dst.dev);
1513                 rt->rt6i_idev = ort->rt6i_idev;
1514                 if (rt->rt6i_idev)
1515                         in6_dev_hold(rt->rt6i_idev);
1516                 rt->u.dst.lastuse = jiffies;
1517                 rt->rt6i_expires = 0;
1518
1519                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1520                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1521                 rt->rt6i_metric = 0;
1522
1523                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1524 #ifdef CONFIG_IPV6_SUBTREES
1525                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1526 #endif
1527                 rt->rt6i_table = ort->rt6i_table;
1528         }
1529         return rt;
1530 }
1531
1532 #ifdef CONFIG_IPV6_ROUTE_INFO
1533 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1534                                            struct in6_addr *gwaddr, int ifindex)
1535 {
1536         struct fib6_node *fn;
1537         struct rt6_info *rt = NULL;
1538         struct fib6_table *table;
1539
1540         table = fib6_get_table(RT6_TABLE_INFO);
1541         if (table == NULL)
1542                 return NULL;
1543
1544         write_lock_bh(&table->tb6_lock);
1545         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1546         if (!fn)
1547                 goto out;
1548
1549         for (rt = fn->leaf; rt; rt = rt->u.next) {
1550                 if (rt->rt6i_dev->ifindex != ifindex)
1551                         continue;
1552                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1553                         continue;
1554                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1555                         continue;
1556                 dst_hold(&rt->u.dst);
1557                 break;
1558         }
1559 out:
1560         write_unlock_bh(&table->tb6_lock);
1561         return rt;
1562 }
1563
1564 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1565                                            struct in6_addr *gwaddr, int ifindex,
1566                                            unsigned pref)
1567 {
1568         struct in6_rtmsg rtmsg;
1569
1570         memset(&rtmsg, 0, sizeof(rtmsg));
1571         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1572         ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1573         rtmsg.rtmsg_dst_len = prefixlen;
1574         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1575         rtmsg.rtmsg_metric = 1024;
1576         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
1577         /* We should treat it as a default route if prefix length is 0. */
1578         if (!prefixlen)
1579                 rtmsg.rtmsg_flags |= RTF_DEFAULT;
1580         rtmsg.rtmsg_ifindex = ifindex;
1581
1582         ip6_route_add(&rtmsg, NULL, NULL, NULL, RT6_TABLE_INFO);
1583
1584         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1585 }
1586 #endif
1587
1588 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1589 {       
1590         struct rt6_info *rt;
1591         struct fib6_table *table;
1592
1593         table = fib6_get_table(RT6_TABLE_DFLT);
1594         if (table == NULL)
1595                 return NULL;
1596
1597         write_lock_bh(&table->tb6_lock);
1598         for (rt = table->tb6_root.leaf; rt; rt=rt->u.next) {
1599                 if (dev == rt->rt6i_dev &&
1600                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1601                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1602                         break;
1603         }
1604         if (rt)
1605                 dst_hold(&rt->u.dst);
1606         write_unlock_bh(&table->tb6_lock);
1607         return rt;
1608 }
1609
1610 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1611                                      struct net_device *dev,
1612                                      unsigned int pref)
1613 {
1614         struct in6_rtmsg rtmsg;
1615
1616         memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1617         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1618         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1619         rtmsg.rtmsg_metric = 1024;
1620         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1621                             RTF_PREF(pref);
1622
1623         rtmsg.rtmsg_ifindex = dev->ifindex;
1624
1625         ip6_route_add(&rtmsg, NULL, NULL, NULL, RT6_TABLE_DFLT);
1626         return rt6_get_dflt_router(gwaddr, dev);
1627 }
1628
1629 void rt6_purge_dflt_routers(void)
1630 {
1631         struct rt6_info *rt;
1632         struct fib6_table *table;
1633
1634         /* NOTE: Keep consistent with rt6_get_dflt_router */
1635         table = fib6_get_table(RT6_TABLE_DFLT);
1636         if (table == NULL)
1637                 return;
1638
1639 restart:
1640         read_lock_bh(&table->tb6_lock);
1641         for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
1642                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1643                         dst_hold(&rt->u.dst);
1644                         read_unlock_bh(&table->tb6_lock);
1645                         ip6_del_rt(rt);
1646                         goto restart;
1647                 }
1648         }
1649         read_unlock_bh(&table->tb6_lock);
1650 }
1651
1652 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1653 {
1654         struct in6_rtmsg rtmsg;
1655         int err;
1656
1657         switch(cmd) {
1658         case SIOCADDRT:         /* Add a route */
1659         case SIOCDELRT:         /* Delete a route */
1660                 if (!capable(CAP_NET_ADMIN))
1661                         return -EPERM;
1662                 err = copy_from_user(&rtmsg, arg,
1663                                      sizeof(struct in6_rtmsg));
1664                 if (err)
1665                         return -EFAULT;
1666                         
1667                 rtnl_lock();
1668                 switch (cmd) {
1669                 case SIOCADDRT:
1670                         err = ip6_route_add(&rtmsg, NULL, NULL, NULL,
1671                                             RT6_TABLE_MAIN);
1672                         break;
1673                 case SIOCDELRT:
1674                         err = ip6_route_del(&rtmsg, NULL, NULL, NULL,
1675                                             RT6_TABLE_MAIN);
1676                         break;
1677                 default:
1678                         err = -EINVAL;
1679                 }
1680                 rtnl_unlock();
1681
1682                 return err;
1683         };
1684
1685         return -EINVAL;
1686 }
1687
1688 /*
1689  *      Drop the packet on the floor
1690  */
1691
1692 static int ip6_pkt_discard(struct sk_buff *skb)
1693 {
1694         int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1695         if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1696                 IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1697
1698         IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1699         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1700         kfree_skb(skb);
1701         return 0;
1702 }
1703
1704 static int ip6_pkt_discard_out(struct sk_buff *skb)
1705 {
1706         skb->dev = skb->dst->dev;
1707         return ip6_pkt_discard(skb);
1708 }
1709
1710 /*
1711  *      Allocate a dst for local (unicast / anycast) address.
1712  */
1713
1714 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1715                                     const struct in6_addr *addr,
1716                                     int anycast)
1717 {
1718         struct rt6_info *rt = ip6_dst_alloc();
1719
1720         if (rt == NULL)
1721                 return ERR_PTR(-ENOMEM);
1722
1723         dev_hold(&loopback_dev);
1724         in6_dev_hold(idev);
1725
1726         rt->u.dst.flags = DST_HOST;
1727         rt->u.dst.input = ip6_input;
1728         rt->u.dst.output = ip6_output;
1729         rt->rt6i_dev = &loopback_dev;
1730         rt->rt6i_idev = idev;
1731         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1732         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1733         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1734         rt->u.dst.obsolete = -1;
1735
1736         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1737         if (anycast)
1738                 rt->rt6i_flags |= RTF_ANYCAST;
1739         else
1740                 rt->rt6i_flags |= RTF_LOCAL;
1741         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1742         if (rt->rt6i_nexthop == NULL) {
1743                 dst_free((struct dst_entry *) rt);
1744                 return ERR_PTR(-ENOMEM);
1745         }
1746
1747         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1748         rt->rt6i_dst.plen = 128;
1749         rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1750
1751         atomic_set(&rt->u.dst.__refcnt, 1);
1752
1753         return rt;
1754 }
1755
1756 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1757 {
1758         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1759             rt != &ip6_null_entry) {
1760                 RT6_TRACE("deleted by ifdown %p\n", rt);
1761                 return -1;
1762         }
1763         return 0;
1764 }
1765
1766 void rt6_ifdown(struct net_device *dev)
1767 {
1768         fib6_clean_all(fib6_ifdown, 0, dev);
1769 }
1770
/* Argument bundle passed to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* its new MTU */
};
1776
1777 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1778 {
1779         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1780         struct inet6_dev *idev;
1781
1782         /* In IPv6 pmtu discovery is not optional,
1783            so that RTAX_MTU lock cannot disable it.
1784            We still use this lock to block changes
1785            caused by addrconf/ndisc.
1786         */
1787
1788         idev = __in6_dev_get(arg->dev);
1789         if (idev == NULL)
1790                 return 0;
1791
1792         /* For administrative MTU increase, there is no way to discover
1793            IPv6 PMTU increase, so PMTU increase should be updated here.
1794            Since RFC 1981 doesn't include administrative MTU increase
1795            update PMTU increase is a MUST. (i.e. jumbo frame)
1796          */
1797         /*
1798            If new MTU is less than route PMTU, this new MTU will be the
1799            lowest MTU in the path, update the route PMTU to reflect PMTU
1800            decreases; if new MTU is greater than route PMTU, and the
1801            old MTU is the lowest MTU in the path, update the route PMTU
1802            to reflect the increase. In this case if the other nodes' MTU
1803            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1804            PMTU discouvery.
1805          */
1806         if (rt->rt6i_dev == arg->dev &&
1807             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1808             (dst_mtu(&rt->u.dst) > arg->mtu ||
1809              (dst_mtu(&rt->u.dst) < arg->mtu &&
1810               dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1811                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1812         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1813         return 0;
1814 }
1815
1816 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1817 {
1818         struct rt6_mtu_change_arg arg = {
1819                 .dev = dev,
1820                 .mtu = mtu,
1821         };
1822
1823         fib6_clean_all(rt6_mtu_change_route, 0, &arg);
1824 }
1825
1826 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1827                               struct in6_rtmsg *rtmsg)
1828 {
1829         memset(rtmsg, 0, sizeof(*rtmsg));
1830
1831         rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1832         rtmsg->rtmsg_src_len = r->rtm_src_len;
1833         rtmsg->rtmsg_flags = RTF_UP;
1834         if (r->rtm_type == RTN_UNREACHABLE)
1835                 rtmsg->rtmsg_flags |= RTF_REJECT;
1836
1837         if (rta[RTA_GATEWAY-1]) {
1838                 if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1839                         return -EINVAL;
1840                 memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1841                 rtmsg->rtmsg_flags |= RTF_GATEWAY;
1842         }
1843         if (rta[RTA_DST-1]) {
1844                 if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1845                         return -EINVAL;
1846                 memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1847         }
1848         if (rta[RTA_SRC-1]) {
1849                 if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1850                         return -EINVAL;
1851                 memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1852         }
1853         if (rta[RTA_OIF-1]) {
1854                 if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1855                         return -EINVAL;
1856                 memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1857         }
1858         if (rta[RTA_PRIORITY-1]) {
1859                 if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1860                         return -EINVAL;
1861                 memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1862         }
1863         return 0;
1864 }
1865
1866 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1867 {
1868         struct rtmsg *r = NLMSG_DATA(nlh);
1869         struct in6_rtmsg rtmsg;
1870
1871         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1872                 return -EINVAL;
1873         return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb),
1874                              rtm_get_table(arg, r->rtm_table));
1875 }
1876
1877 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1878 {
1879         struct rtmsg *r = NLMSG_DATA(nlh);
1880         struct in6_rtmsg rtmsg;
1881
1882         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1883                 return -EINVAL;
1884         return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb),
1885                              rtm_get_table(arg, r->rtm_table));
1886 }
1887
1888 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1889                          struct in6_addr *dst, struct in6_addr *src,
1890                          int iif, int type, u32 pid, u32 seq,
1891                          int prefix, unsigned int flags)
1892 {
1893         struct rtmsg *rtm;
1894         struct nlmsghdr  *nlh;
1895         unsigned char    *b = skb->tail;
1896         struct rta_cacheinfo ci;
1897         u32 table;
1898
1899         if (prefix) {   /* user wants prefix routes only */
1900                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1901                         /* success since this is not a prefix route */
1902                         return 1;
1903                 }
1904         }
1905
1906         nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1907         rtm = NLMSG_DATA(nlh);
1908         rtm->rtm_family = AF_INET6;
1909         rtm->rtm_dst_len = rt->rt6i_dst.plen;
1910         rtm->rtm_src_len = rt->rt6i_src.plen;
1911         rtm->rtm_tos = 0;
1912         if (rt->rt6i_table)
1913                 table = rt->rt6i_table->tb6_id;
1914         else
1915                 table = RT6_TABLE_UNSPEC;
1916         rtm->rtm_table = table;
1917         RTA_PUT_U32(skb, RTA_TABLE, table);
1918         if (rt->rt6i_flags&RTF_REJECT)
1919                 rtm->rtm_type = RTN_UNREACHABLE;
1920         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1921                 rtm->rtm_type = RTN_LOCAL;
1922         else
1923                 rtm->rtm_type = RTN_UNICAST;
1924         rtm->rtm_flags = 0;
1925         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1926         rtm->rtm_protocol = rt->rt6i_protocol;
1927         if (rt->rt6i_flags&RTF_DYNAMIC)
1928                 rtm->rtm_protocol = RTPROT_REDIRECT;
1929         else if (rt->rt6i_flags & RTF_ADDRCONF)
1930                 rtm->rtm_protocol = RTPROT_KERNEL;
1931         else if (rt->rt6i_flags&RTF_DEFAULT)
1932                 rtm->rtm_protocol = RTPROT_RA;
1933
1934         if (rt->rt6i_flags&RTF_CACHE)
1935                 rtm->rtm_flags |= RTM_F_CLONED;
1936
1937         if (dst) {
1938                 RTA_PUT(skb, RTA_DST, 16, dst);
1939                 rtm->rtm_dst_len = 128;
1940         } else if (rtm->rtm_dst_len)
1941                 RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1942 #ifdef CONFIG_IPV6_SUBTREES
1943         if (src) {
1944                 RTA_PUT(skb, RTA_SRC, 16, src);
1945                 rtm->rtm_src_len = 128;
1946         } else if (rtm->rtm_src_len)
1947                 RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1948 #endif
1949         if (iif)
1950                 RTA_PUT(skb, RTA_IIF, 4, &iif);
1951         else if (dst) {
1952                 struct in6_addr saddr_buf;
1953                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1954                         RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1955         }
1956         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1957                 goto rtattr_failure;
1958         if (rt->u.dst.neighbour)
1959                 RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1960         if (rt->u.dst.dev)
1961                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1962         RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1963         ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1964         if (rt->rt6i_expires)
1965                 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1966         else
1967                 ci.rta_expires = 0;
1968         ci.rta_used = rt->u.dst.__use;
1969         ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1970         ci.rta_error = rt->u.dst.error;
1971         ci.rta_id = 0;
1972         ci.rta_ts = 0;
1973         ci.rta_tsage = 0;
1974         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1975         nlh->nlmsg_len = skb->tail - b;
1976         return skb->len;
1977
1978 nlmsg_failure:
1979 rtattr_failure:
1980         skb_trim(skb, b - skb->data);
1981         return -1;
1982 }
1983
1984 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1985 {
1986         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1987         int prefix;
1988
1989         if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1990                 struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1991                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1992         } else
1993                 prefix = 0;
1994
1995         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1996                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1997                      prefix, NLM_F_MULTI);
1998 }
1999
2000 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2001 {
2002         struct rtattr **rta = arg;
2003         int iif = 0;
2004         int err = -ENOBUFS;
2005         struct sk_buff *skb;
2006         struct flowi fl;
2007         struct rt6_info *rt;
2008
2009         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2010         if (skb == NULL)
2011                 goto out;
2012
2013         /* Reserve room for dummy headers, this skb can pass
2014            through good chunk of routing engine.
2015          */
2016         skb->mac.raw = skb->data;
2017         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2018
2019         memset(&fl, 0, sizeof(fl));
2020         if (rta[RTA_SRC-1])
2021                 ipv6_addr_copy(&fl.fl6_src,
2022                                (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
2023         if (rta[RTA_DST-1])
2024                 ipv6_addr_copy(&fl.fl6_dst,
2025                                (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
2026
2027         if (rta[RTA_IIF-1])
2028                 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
2029
2030         if (iif) {
2031                 struct net_device *dev;
2032                 dev = __dev_get_by_index(iif);
2033                 if (!dev) {
2034                         err = -ENODEV;
2035                         goto out_free;
2036                 }
2037         }
2038
2039         fl.oif = 0;
2040         if (rta[RTA_OIF-1])
2041                 memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
2042
2043         rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
2044
2045         skb->dst = &rt->u.dst;
2046
2047         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2048         err = rt6_fill_node(skb, rt, 
2049                             &fl.fl6_dst, &fl.fl6_src,
2050                             iif,
2051                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2052                             nlh->nlmsg_seq, 0, 0);
2053         if (err < 0) {
2054                 err = -EMSGSIZE;
2055                 goto out_free;
2056         }
2057
2058         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2059 out:
2060         return err;
2061 out_free:
2062         kfree_skb(skb);
2063         goto out;       
2064 }
2065
2066 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh, 
2067                         struct netlink_skb_parms *req)
2068 {
2069         struct sk_buff *skb;
2070         u32 pid = req ? req->pid : 0;
2071         u32 seq = nlh ? nlh->nlmsg_seq : 0;
2072         int payload = sizeof(struct rtmsg) + 256;
2073         int err = -ENOBUFS;
2074
2075         skb = nlmsg_new(nlmsg_total_size(payload), gfp_any());
2076         if (skb == NULL)
2077                 goto errout;
2078
2079         err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2080         if (err < 0) {
2081                 kfree_skb(skb);
2082                 goto errout;
2083         }
2084
2085         err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2086 errout:
2087         if (err < 0)
2088                 rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2089 }
2090
2091 /*
2092  *      /proc
2093  */
2094
2095 #ifdef CONFIG_PROC_FS
2096
/*
 * Nominal length of one /proc/net/ipv6_route line, used by the
 * readers below to translate a file offset into a number of records.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* State shared between rt6_proc_info() and rt6_info_route(). */
struct rt6_proc_arg
{
	char *buffer;	/* output buffer */
	int offset;	/* requested file offset */
	int length;	/* requested number of bytes */
	int skip;	/* records skipped so far */
	int len;	/* bytes written to buffer so far */
};
2107
2108 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2109 {
2110         struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2111         int i;
2112
2113         if (arg->skip < arg->offset / RT6_INFO_LEN) {
2114                 arg->skip++;
2115                 return 0;
2116         }
2117
2118         if (arg->len >= arg->length)
2119                 return 0;
2120
2121         for (i=0; i<16; i++) {
2122                 sprintf(arg->buffer + arg->len, "%02x",
2123                         rt->rt6i_dst.addr.s6_addr[i]);
2124                 arg->len += 2;
2125         }
2126         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2127                             rt->rt6i_dst.plen);
2128
2129 #ifdef CONFIG_IPV6_SUBTREES
2130         for (i=0; i<16; i++) {
2131                 sprintf(arg->buffer + arg->len, "%02x",
2132                         rt->rt6i_src.addr.s6_addr[i]);
2133                 arg->len += 2;
2134         }
2135         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2136                             rt->rt6i_src.plen);
2137 #else
2138         sprintf(arg->buffer + arg->len,
2139                 "00000000000000000000000000000000 00 ");
2140         arg->len += 36;
2141 #endif
2142
2143         if (rt->rt6i_nexthop) {
2144                 for (i=0; i<16; i++) {
2145                         sprintf(arg->buffer + arg->len, "%02x",
2146                                 rt->rt6i_nexthop->primary_key[i]);
2147                         arg->len += 2;
2148                 }
2149         } else {
2150                 sprintf(arg->buffer + arg->len,
2151                         "00000000000000000000000000000000");
2152                 arg->len += 32;
2153         }
2154         arg->len += sprintf(arg->buffer + arg->len,
2155                             " %08x %08x %08x %08x %8s\n",
2156                             rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2157                             rt->u.dst.__use, rt->rt6i_flags, 
2158                             rt->rt6i_dev ? rt->rt6i_dev->name : "");
2159         return 0;
2160 }
2161
2162 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2163 {
2164         struct rt6_proc_arg arg = {
2165                 .buffer = buffer,
2166                 .offset = offset,
2167                 .length = length,
2168         };
2169
2170         fib6_clean_all(rt6_info_route, 0, &arg);
2171
2172         *start = buffer;
2173         if (offset)
2174                 *start += offset % RT6_INFO_LEN;
2175
2176         arg.len -= offset % RT6_INFO_LEN;
2177
2178         if (arg.len > length)
2179                 arg.len = length;
2180         if (arg.len < 0)
2181                 arg.len = 0;
2182
2183         return arg.len;
2184 }
2185
2186 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2187 {
2188         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2189                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2190                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2191                       rt6_stats.fib_rt_cache,
2192                       atomic_read(&ip6_dst_ops.entries),
2193                       rt6_stats.fib_discarded_routes);
2194
2195         return 0;
2196 }
2197
2198 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2199 {
2200         return single_open(file, rt6_stats_seq_show, NULL);
2201 }
2202
2203 static struct file_operations rt6_stats_seq_fops = {
2204         .owner   = THIS_MODULE,
2205         .open    = rt6_stats_seq_open,
2206         .read    = seq_read,
2207         .llseek  = seq_lseek,
2208         .release = single_release,
2209 };
2210 #endif  /* CONFIG_PROC_FS */
2211
2212 #ifdef CONFIG_SYSCTL
2213
2214 static int flush_delay;
2215
2216 static
2217 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2218                               void __user *buffer, size_t *lenp, loff_t *ppos)
2219 {
2220         if (write) {
2221                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2222                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2223                 return 0;
2224         } else
2225                 return -EINVAL;
2226 }
2227
2228 ctl_table ipv6_route_table[] = {
2229         {
2230                 .ctl_name       =       NET_IPV6_ROUTE_FLUSH, 
2231                 .procname       =       "flush",
2232                 .data           =       &flush_delay,
2233                 .maxlen         =       sizeof(int),
2234                 .mode           =       0200,
2235                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2236         },
2237         {
2238                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2239                 .procname       =       "gc_thresh",
2240                 .data           =       &ip6_dst_ops.gc_thresh,
2241                 .maxlen         =       sizeof(int),
2242                 .mode           =       0644,
2243                 .proc_handler   =       &proc_dointvec,
2244         },
2245         {
2246                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2247                 .procname       =       "max_size",
2248                 .data           =       &ip6_rt_max_size,
2249                 .maxlen         =       sizeof(int),
2250                 .mode           =       0644,
2251                 .proc_handler   =       &proc_dointvec,
2252         },
2253         {
2254                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2255                 .procname       =       "gc_min_interval",
2256                 .data           =       &ip6_rt_gc_min_interval,
2257                 .maxlen         =       sizeof(int),
2258                 .mode           =       0644,
2259                 .proc_handler   =       &proc_dointvec_jiffies,
2260                 .strategy       =       &sysctl_jiffies,
2261         },
2262         {
2263                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2264                 .procname       =       "gc_timeout",
2265                 .data           =       &ip6_rt_gc_timeout,
2266                 .maxlen         =       sizeof(int),
2267                 .mode           =       0644,
2268                 .proc_handler   =       &proc_dointvec_jiffies,
2269                 .strategy       =       &sysctl_jiffies,
2270         },
2271         {
2272                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2273                 .procname       =       "gc_interval",
2274                 .data           =       &ip6_rt_gc_interval,
2275                 .maxlen         =       sizeof(int),
2276                 .mode           =       0644,
2277                 .proc_handler   =       &proc_dointvec_jiffies,
2278                 .strategy       =       &sysctl_jiffies,
2279         },
2280         {
2281                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2282                 .procname       =       "gc_elasticity",
2283                 .data           =       &ip6_rt_gc_elasticity,
2284                 .maxlen         =       sizeof(int),
2285                 .mode           =       0644,
2286                 .proc_handler   =       &proc_dointvec_jiffies,
2287                 .strategy       =       &sysctl_jiffies,
2288         },
2289         {
2290                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2291                 .procname       =       "mtu_expires",
2292                 .data           =       &ip6_rt_mtu_expires,
2293                 .maxlen         =       sizeof(int),
2294                 .mode           =       0644,
2295                 .proc_handler   =       &proc_dointvec_jiffies,
2296                 .strategy       =       &sysctl_jiffies,
2297         },
2298         {
2299                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2300                 .procname       =       "min_adv_mss",
2301                 .data           =       &ip6_rt_min_advmss,
2302                 .maxlen         =       sizeof(int),
2303                 .mode           =       0644,
2304                 .proc_handler   =       &proc_dointvec_jiffies,
2305                 .strategy       =       &sysctl_jiffies,
2306         },
2307         {
2308                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2309                 .procname       =       "gc_min_interval_ms",
2310                 .data           =       &ip6_rt_gc_min_interval,
2311                 .maxlen         =       sizeof(int),
2312                 .mode           =       0644,
2313                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2314                 .strategy       =       &sysctl_ms_jiffies,
2315         },
2316         { .ctl_name = 0 }
2317 };
2318
2319 #endif
2320
2321 void __init ip6_route_init(void)
2322 {
2323         struct proc_dir_entry *p;
2324
2325         ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2326                                                      sizeof(struct rt6_info),
2327                                                      0, SLAB_HWCACHE_ALIGN,
2328                                                      NULL, NULL);
2329         if (!ip6_dst_ops.kmem_cachep)
2330                 panic("cannot create ip6_dst_cache");
2331
2332         fib6_init();
2333 #ifdef  CONFIG_PROC_FS
2334         p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2335         if (p)
2336                 p->owner = THIS_MODULE;
2337
2338         proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2339 #endif
2340 #ifdef CONFIG_XFRM
2341         xfrm6_init();
2342 #endif
2343 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2344         fib6_rules_init();
2345 #endif
2346 }
2347
2348 void ip6_route_cleanup(void)
2349 {
2350 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2351         fib6_rules_cleanup();
2352 #endif
2353 #ifdef CONFIG_PROC_FS
2354         proc_net_remove("ipv6_route");
2355         proc_net_remove("rt6_stats");
2356 #endif
2357 #ifdef CONFIG_XFRM
2358         xfrm6_fini();
2359 #endif
2360         rt6_ifdown(NULL);
2361         fib6_gc_cleanup();
2362         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2363 }