]> bbs.cooldavid.org Git - net-next-2.6.git/blob - net/ipv6/route.c
[IPV6] ROUTE: Add credits about subtree fixes.
[net-next-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  *      Ville Nuorvala
26  *              Fixed routing subtrees.
27  */
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41
42 #ifdef  CONFIG_PROC_FS
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #endif
46
47 #include <net/snmp.h>
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
53 #include <net/tcp.h>
54 #include <linux/rtnetlink.h>
55 #include <net/dst.h>
56 #include <net/xfrm.h>
57 #include <net/netevent.h>
58 #include <net/netlink.h>
59
60 #include <asm/uaccess.h>
61
62 #ifdef CONFIG_SYSCTL
63 #include <linux/sysctl.h>
64 #endif
65
/* Set to 3 to get tracing. */
#define RT6_DEBUG 2

/*
 * RDBG() and RT6_TRACE() expand to printk() calls only when
 * RT6_DEBUG >= 3; at lower levels they expand to nothing / an empty
 * statement.
 */
#if RT6_DEBUG >= 3
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#endif

/*
 * When non-zero, ip6_pol_route_input()/ip6_pol_route_output() clone a
 * matched route that already has a nexthop (or is RTF_NONEXTHOP) with
 * rt6_alloc_clone(); when zero they return the matched route itself.
 */
#define CLONE_OFFLINK_ROUTE 0

/*
 * Bits for the "strict" argument of rt6_select()/rt6_score_route():
 * IFACE rejects routes whose device does not match the requested
 * interface, REACHABLE rejects routes for which rt6_check_neigh()
 * returns 0.
 */
#define RT6_SELECT_F_IFACE      0x1
#define RT6_SELECT_F_REACHABLE  0x2
81
/*
 * Routing cache tunables.  Values built from HZ are in jiffies.
 * NOTE(review): ip6_rt_gc_interval and ip6_rt_mtu_expires are not used in
 * the part of the file reviewed here; their consumers should be confirmed.
 */
/* ip6_dst_gc() compares the dst entry count against this limit. */
static int ip6_rt_max_size = 4096;
/* ip6_dst_gc() skips a run if the previous one was less than this long ago
 * and the entry count does not exceed ip6_rt_max_size. */
static int ip6_rt_gc_min_interval = HZ / 2;
/* ip6_dst_gc() resets its expiry to half of this once below gc_thresh. */
static int ip6_rt_gc_timeout = 60*HZ;
int ip6_rt_gc_interval = 30*HZ;
/* Shift used by ip6_dst_gc() to decay its expiry on every call. */
static int ip6_rt_gc_elasticity = 9;
static int ip6_rt_mtu_expires = 10*60*HZ;
/* Lower bound applied by ipv6_advmss(). */
static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
89
/*
 * Forward declarations.  Most of these are needed because the static
 * initializers below (ip6_dst_ops, ip6_null_entry, ...) take the address
 * of functions defined later in the file.
 */
static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void             ip6_dst_destroy(struct dst_entry *);
static void             ip6_dst_ifdown(struct dst_entry *,
                                       struct net_device *dev, int how);
static int               ip6_dst_gc(void);

static int              ip6_pkt_discard(struct sk_buff *skb);
static int              ip6_pkt_discard_out(struct sk_buff *skb);
static void             ip6_link_failure(struct sk_buff *skb);
static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);

/* Helpers used by rt6_route_rcv(); only built with CONFIG_IPV6_ROUTE_INFO. */
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
                                           struct in6_addr *gwaddr, int ifindex,
                                           unsigned pref);
static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
                                           struct in6_addr *gwaddr, int ifindex);
#endif
110
/*
 * Operations table handed to dst_alloc() by ip6_dst_alloc().  It wires the
 * IPv6 callbacks defined in this file into struct dst_ops.
 */
static struct dst_ops ip6_dst_ops = {
        .family                 =       AF_INET6,
        .protocol               =       __constant_htons(ETH_P_IPV6),
        .gc                     =       ip6_dst_gc,
        /* ip6_dst_gc() relaxes its expiry once the entry count is below this */
        .gc_thresh              =       1024,
        .check                  =       ip6_dst_check,
        .destroy                =       ip6_dst_destroy,
        .ifdown                 =       ip6_dst_ifdown,
        .negative_advice        =       ip6_negative_advice,
        .link_failure           =       ip6_link_failure,
        .update_pmtu            =       ip6_rt_update_pmtu,
        /* entries are allocated as full struct rt6_info objects */
        .entry_size             =       sizeof(struct rt6_info),
};
124
/*
 * Statically allocated reject route carrying -ENETUNREACH.  The lookup
 * helpers in this file (rt6_device_match(), rt6_select()) return it when
 * no usable route is found, and BACKTRACK() tests for it by address.
 */
struct rt6_info ip6_null_entry = {
        .u = {
                .dst = {
                        .__refcnt       = ATOMIC_INIT(1),
                        .__use          = 1,
                        .dev            = &loopback_dev,
                        .obsolete       = -1,
                        .error          = -ENETUNREACH,
                        .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
                        /* both directions go to the discard handlers */
                        .input          = ip6_pkt_discard,
                        .output         = ip6_pkt_discard_out,
                        .ops            = &ip6_dst_ops,
                        /* the entry is its own path */
                        .path           = (struct dst_entry*)&ip6_null_entry,
                }
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        /* all-ones metric */
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};
144
145 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
146
/*
 * Static reject route carrying -EACCES.  Initialised like ip6_null_entry
 * apart from the error code; only built with CONFIG_IPV6_MULTIPLE_TABLES.
 */
struct rt6_info ip6_prohibit_entry = {
        .u = {
                .dst = {
                        .__refcnt       = ATOMIC_INIT(1),
                        .__use          = 1,
                        .dev            = &loopback_dev,
                        .obsolete       = -1,
                        .error          = -EACCES,
                        .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
                        .input          = ip6_pkt_discard,
                        .output         = ip6_pkt_discard_out,
                        .ops            = &ip6_dst_ops,
                        /* the entry is its own path */
                        .path           = (struct dst_entry*)&ip6_prohibit_entry,
                }
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};
166
/*
 * Static reject route carrying -EINVAL.  Initialised like ip6_null_entry
 * apart from the error code; only built with CONFIG_IPV6_MULTIPLE_TABLES.
 */
struct rt6_info ip6_blk_hole_entry = {
        .u = {
                .dst = {
                        .__refcnt       = ATOMIC_INIT(1),
                        .__use          = 1,
                        .dev            = &loopback_dev,
                        .obsolete       = -1,
                        .error          = -EINVAL,
                        .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
                        .input          = ip6_pkt_discard,
                        .output         = ip6_pkt_discard_out,
                        .ops            = &ip6_dst_ops,
                        /* the entry is its own path */
                        .path           = (struct dst_entry*)&ip6_blk_hole_entry,
                }
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};
186
187 #endif
188
189 /* allocate dst with ip6_dst_ops */
190 static __inline__ struct rt6_info *ip6_dst_alloc(void)
191 {
192         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
193 }
194
195 static void ip6_dst_destroy(struct dst_entry *dst)
196 {
197         struct rt6_info *rt = (struct rt6_info *)dst;
198         struct inet6_dev *idev = rt->rt6i_idev;
199
200         if (idev != NULL) {
201                 rt->rt6i_idev = NULL;
202                 in6_dev_put(idev);
203         }       
204 }
205
206 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
207                            int how)
208 {
209         struct rt6_info *rt = (struct rt6_info *)dst;
210         struct inet6_dev *idev = rt->rt6i_idev;
211
212         if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
213                 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
214                 if (loopback_idev != NULL) {
215                         rt->rt6i_idev = loopback_idev;
216                         in6_dev_put(idev);
217                 }
218         }
219 }
220
221 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
222 {
223         return (rt->rt6i_flags & RTF_EXPIRES &&
224                 time_after(jiffies, rt->rt6i_expires));
225 }
226
227 static inline int rt6_need_strict(struct in6_addr *daddr)
228 {
229         return (ipv6_addr_type(daddr) &
230                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
231 }
232
233 /*
234  *      Route lookup. Any table->tb6_lock is implied.
235  */
236
237 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
238                                                     int oif,
239                                                     int strict)
240 {
241         struct rt6_info *local = NULL;
242         struct rt6_info *sprt;
243
244         if (oif) {
245                 for (sprt = rt; sprt; sprt = sprt->u.next) {
246                         struct net_device *dev = sprt->rt6i_dev;
247                         if (dev->ifindex == oif)
248                                 return sprt;
249                         if (dev->flags & IFF_LOOPBACK) {
250                                 if (sprt->rt6i_idev == NULL ||
251                                     sprt->rt6i_idev->dev->ifindex != oif) {
252                                         if (strict && oif)
253                                                 continue;
254                                         if (local && (!oif || 
255                                                       local->rt6i_idev->dev->ifindex == oif))
256                                                 continue;
257                                 }
258                                 local = sprt;
259                         }
260                 }
261
262                 if (local)
263                         return local;
264
265                 if (strict)
266                         return &ip6_null_entry;
267         }
268         return rt;
269 }
270
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * If the route's nexthop neighbour is not in a NUD_VALID state and the
 * device's rtr_probe_interval has elapsed since the neighbour entry was
 * last updated, send a Neighbour Solicitation to the solicited-node
 * multicast address of the neighbour.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one per
 * minute.  The interval check and the write to neigh->updated therefore
 * happen together under the neighbour's write lock; with only the read
 * lock held, concurrent callers could all pass the check and each send
 * a probe.
 *
 * @rt may be NULL, in which case nothing is done.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
        struct in6_addr mcaddr;
        struct in6_addr *target;
        int send_probe = 0;

        /* Unlocked fast path: nothing to probe. */
        if (!neigh || (neigh->nud_state & NUD_VALID))
                return;

        write_lock_bh(&neigh->lock);
        if (!(neigh->nud_state & NUD_VALID) &&
            time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                /* claim this probe slot before dropping the lock */
                neigh->updated = jiffies;
                send_probe = 1;
        }
        write_unlock_bh(&neigh->lock);

        if (!send_probe)
                return;

        /* The solicitation is sent without holding the neighbour lock. */
        target = (struct in6_addr *)&neigh->primary_key;
        addrconf_addr_solict_mult(target, &mcaddr);
        ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
        return;
}
#endif
306
307 /*
308  * Default Router Selection (RFC 2461 6.3.6)
309  */
310 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
311 {
312         struct net_device *dev = rt->rt6i_dev;
313         if (!oif || dev->ifindex == oif)
314                 return 2;
315         if ((dev->flags & IFF_LOOPBACK) &&
316             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
317                 return 1;
318         return 0;
319 }
320
321 static int inline rt6_check_neigh(struct rt6_info *rt)
322 {
323         struct neighbour *neigh = rt->rt6i_nexthop;
324         int m = 0;
325         if (rt->rt6i_flags & RTF_NONEXTHOP ||
326             !(rt->rt6i_flags & RTF_GATEWAY))
327                 m = 1;
328         else if (neigh) {
329                 read_lock_bh(&neigh->lock);
330                 if (neigh->nud_state & NUD_VALID)
331                         m = 2;
332                 read_unlock_bh(&neigh->lock);
333         }
334         return m;
335 }
336
337 static int rt6_score_route(struct rt6_info *rt, int oif,
338                            int strict)
339 {
340         int m, n;
341                 
342         m = rt6_check_dev(rt, oif);
343         if (!m && (strict & RT6_SELECT_F_IFACE))
344                 return -1;
345 #ifdef CONFIG_IPV6_ROUTER_PREF
346         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
347 #endif
348         n = rt6_check_neigh(rt);
349         if (n > 1)
350                 m |= 16;
351         else if (!n && strict & RT6_SELECT_F_REACHABLE)
352                 return -1;
353         return m;
354 }
355
356 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
357                                    int strict)
358 {
359         struct rt6_info *match = NULL, *last = NULL;
360         struct rt6_info *rt, *rt0 = *head;
361         u32 metric;
362         int mpri = -1;
363
364         RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
365                   __FUNCTION__, head, head ? *head : NULL, oif);
366
367         for (rt = rt0, metric = rt0->rt6i_metric;
368              rt && rt->rt6i_metric == metric && (!last || rt != rt0);
369              rt = rt->u.next) {
370                 int m;
371
372                 if (rt6_check_expired(rt))
373                         continue;
374
375                 last = rt;
376
377                 m = rt6_score_route(rt, oif, strict);
378                 if (m < 0)
379                         continue;
380
381                 if (m > mpri) {
382                         rt6_probe(match);
383                         match = rt;
384                         mpri = m;
385                 } else {
386                         rt6_probe(rt);
387                 }
388         }
389
390         if (!match &&
391             (strict & RT6_SELECT_F_REACHABLE) &&
392             last && last != rt0) {
393                 /* no entries matched; do round-robin */
394                 static DEFINE_SPINLOCK(lock);
395                 spin_lock(&lock);
396                 *head = rt0->u.next;
397                 rt0->u.next = last->u.next;
398                 last->u.next = rt0;
399                 spin_unlock(&lock);
400         }
401
402         RT6_TRACE("%s() => %p, score=%d\n",
403                   __FUNCTION__, match, mpri);
404
405         return (match ? match : &ip6_null_entry);
406 }
407
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process one Route Information option received from router @gwaddr on
 * device @dev.
 *
 * @opt points at the option and is interpreted as struct route_info;
 * @len is the number of bytes available at @opt.
 *
 * The option is validated first.  The matching route (looked up by prefix,
 * gateway and interface) is then deleted when the advertised lifetime is
 * zero, created when it does not exist yet, or has its preference
 * refreshed.  A lifetime of 0xffffffff means "never expires"; any other
 * value sets RTF_EXPIRES with an expiry of jiffies + HZ * lifetime.
 *
 * Returns 0 once the option has been processed, or -EINVAL when it is
 * malformed or carries the reserved preference value (RFC 4191 requires
 * such an option to be ignored).
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  struct in6_addr *gwaddr)
{
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr prefix_buf, *prefix;
        unsigned int pref;
        u32 lifetime;
        struct rt6_info *rt;

        /* A negative len must not slip through the unsigned comparison. */
        if (len < 0 || (size_t)len < sizeof(struct route_info)) {
                return -EINVAL;
        }

        /* Sanity check for prefix_len and length */
        if (rinfo->length > 3) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 128) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 64) {
                if (rinfo->length < 2) {
                        return -EINVAL;
                }
        } else if (rinfo->prefix_len > 0) {
                if (rinfo->length < 1) {
                        return -EINVAL;
                }
        }

        /* The reserved preference value invalidates the whole option. */
        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                return -EINVAL;

        /* The lifetime arrives in network byte order. */
        lifetime = ntohl(rinfo->lifetime);
        if (lifetime == 0xffffffff) {
                /* infinity */
        } else if (lifetime > 0x7fffffff/HZ) {
                /* Avoid arithmetic overflow */
                lifetime = 0x7fffffff/HZ - 1;
        }

        if (rinfo->length == 3)
                prefix = (struct in6_addr *)rinfo->prefix;
        else {
                /* this function is safe */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

        /* Zero lifetime withdraws an existing route. */
        if (rt && !lifetime) {
                ip6_del_rt(rt);
                rt = NULL;
        }

        if (!rt && lifetime)
                rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
                                        pref);
        else if (rt)
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

        if (rt) {
                if (lifetime == 0xffffffff) {
                        rt->rt6i_flags &= ~RTF_EXPIRES;
                } else {
                        rt->rt6i_expires = jiffies + HZ * lifetime;
                        rt->rt6i_flags |= RTF_EXPIRES;
                }
                /* NOTE(review): presumably drops the reference returned by
                 * rt6_get_route_info()/rt6_add_route_info() - confirm. */
                dst_release(&rt->u.dst);
        }
        return 0;
}
#endif
485
/*
 * BACKTRACK(saddr) - walk back up the FIB tree after a failed selection.
 *
 * Must be expanded inside a function that has locals "rt" (the selection
 * result) and "fn" (the current fib6_node) and labels "restart" and "out".
 * It does nothing unless rt == &ip6_null_entry.  Otherwise it climbs
 * towards the root:
 *   - at a node flagged RTN_TL_ROOT it jumps to "out" (rt stays
 *     &ip6_null_entry);
 *   - if the parent has a subtree other than the current node, that
 *     subtree is searched for @saddr, else the parent becomes current;
 *   - as soon as the current node carries RTN_RTINFO it jumps to
 *     "restart" so the caller retries selection there.
 *
 * The subtree is always reached through FIB6_SUBTREE() so that the test
 * and the lookup use the same accessor.
 */
#define BACKTRACK(saddr) \
do { \
        if (rt == &ip6_null_entry) { \
                struct fib6_node *pn; \
                while (fn) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        pn = fn->parent; \
                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
                        else \
                                fn = pn; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while(0)
503
504 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
505                                              struct flowi *fl, int flags)
506 {
507         struct fib6_node *fn;
508         struct rt6_info *rt;
509
510         read_lock_bh(&table->tb6_lock);
511         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
512 restart:
513         rt = fn->leaf;
514         rt = rt6_device_match(rt, fl->oif, flags & RT6_F_STRICT);
515         BACKTRACK(&fl->fl6_src);
516         dst_hold(&rt->u.dst);
517 out:
518         read_unlock_bh(&table->tb6_lock);
519
520         rt->u.dst.lastuse = jiffies;
521         rt->u.dst.__use++;
522
523         return rt;
524
525 }
526
527 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
528                             int oif, int strict)
529 {
530         struct flowi fl = {
531                 .oif = oif,
532                 .nl_u = {
533                         .ip6_u = {
534                                 .daddr = *daddr,
535                                 /* TODO: saddr */
536                         },
537                 },
538         };
539         struct dst_entry *dst;
540         int flags = strict ? RT6_F_STRICT : 0;
541
542         dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
543         if (dst->error == 0)
544                 return (struct rt6_info *) dst;
545
546         dst_release(dst);
547
548         return NULL;
549 }
550
/* ip6_ins_rt is called with table->tb6_lock NOT held.
   It takes a new route entry; if the addition fails for any reason,
   the route is freed. In any case, if the caller does not hold a
   reference to it, it may be destroyed.
 */
556
557 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
558 {
559         int err;
560         struct fib6_table *table;
561
562         table = rt->rt6i_table;
563         write_lock_bh(&table->tb6_lock);
564         err = fib6_add(&table->tb6_root, rt, info);
565         write_unlock_bh(&table->tb6_lock);
566
567         return err;
568 }
569
570 int ip6_ins_rt(struct rt6_info *rt)
571 {
572         return __ip6_ins_rt(rt, NULL);
573 }
574
575 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
576                                       struct in6_addr *saddr)
577 {
578         struct rt6_info *rt;
579
580         /*
581          *      Clone the route.
582          */
583
584         rt = ip6_rt_copy(ort);
585
586         if (rt) {
587                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
588                         if (rt->rt6i_dst.plen != 128 &&
589                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
590                                 rt->rt6i_flags |= RTF_ANYCAST;
591                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
592                 }
593
594                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
595                 rt->rt6i_dst.plen = 128;
596                 rt->rt6i_flags |= RTF_CACHE;
597                 rt->u.dst.flags |= DST_HOST;
598
599 #ifdef CONFIG_IPV6_SUBTREES
600                 if (rt->rt6i_src.plen && saddr) {
601                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
602                         rt->rt6i_src.plen = 128;
603                 }
604 #endif
605
606                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
607
608         }
609
610         return rt;
611 }
612
613 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
614 {
615         struct rt6_info *rt = ip6_rt_copy(ort);
616         if (rt) {
617                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
618                 rt->rt6i_dst.plen = 128;
619                 rt->rt6i_flags |= RTF_CACHE;
620                 if (rt->rt6i_flags & RTF_REJECT)
621                         rt->u.dst.error = ort->u.dst.error;
622                 rt->u.dst.flags |= DST_HOST;
623                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
624         }
625         return rt;
626 }
627
628 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
629                                             struct flowi *fl, int flags)
630 {
631         struct fib6_node *fn;
632         struct rt6_info *rt, *nrt;
633         int strict = 0;
634         int attempts = 3;
635         int err;
636         int reachable = RT6_SELECT_F_REACHABLE;
637
638         if (flags & RT6_F_STRICT)
639                 strict = RT6_SELECT_F_IFACE;
640
641 relookup:
642         read_lock_bh(&table->tb6_lock);
643
644 restart_2:
645         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
646
647 restart:
648         rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
649         BACKTRACK(&fl->fl6_src);
650         if (rt == &ip6_null_entry ||
651             rt->rt6i_flags & RTF_CACHE)
652                 goto out;
653
654         dst_hold(&rt->u.dst);
655         read_unlock_bh(&table->tb6_lock);
656
657         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
658                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
659         else {
660 #if CLONE_OFFLINK_ROUTE
661                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
662 #else
663                 goto out2;
664 #endif
665         }
666
667         dst_release(&rt->u.dst);
668         rt = nrt ? : &ip6_null_entry;
669
670         dst_hold(&rt->u.dst);
671         if (nrt) {
672                 err = ip6_ins_rt(nrt);
673                 if (!err)
674                         goto out2;
675         }
676
677         if (--attempts <= 0)
678                 goto out2;
679
680         /*
681          * Race condition! In the gap, when table->tb6_lock was
682          * released someone could insert this route.  Relookup.
683          */
684         dst_release(&rt->u.dst);
685         goto relookup;
686
687 out:
688         if (reachable) {
689                 reachable = 0;
690                 goto restart_2;
691         }
692         dst_hold(&rt->u.dst);
693         read_unlock_bh(&table->tb6_lock);
694 out2:
695         rt->u.dst.lastuse = jiffies;
696         rt->u.dst.__use++;
697
698         return rt;
699 }
700
701 void ip6_route_input(struct sk_buff *skb)
702 {
703         struct ipv6hdr *iph = skb->nh.ipv6h;
704         struct flowi fl = {
705                 .iif = skb->dev->ifindex,
706                 .nl_u = {
707                         .ip6_u = {
708                                 .daddr = iph->daddr,
709                                 .saddr = iph->saddr,
710                                 .flowlabel = (* (u32 *) iph)&IPV6_FLOWINFO_MASK,
711                         },
712                 },
713                 .proto = iph->nexthdr,
714         };
715         int flags = 0;
716
717         if (rt6_need_strict(&iph->daddr))
718                 flags |= RT6_F_STRICT;
719
720         skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
721 }
722
723 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
724                                              struct flowi *fl, int flags)
725 {
726         struct fib6_node *fn;
727         struct rt6_info *rt, *nrt;
728         int strict = 0;
729         int attempts = 3;
730         int err;
731         int reachable = RT6_SELECT_F_REACHABLE;
732
733         if (flags & RT6_F_STRICT)
734                 strict = RT6_SELECT_F_IFACE;
735
736 relookup:
737         read_lock_bh(&table->tb6_lock);
738
739 restart_2:
740         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
741
742 restart:
743         rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
744         BACKTRACK(&fl->fl6_src);
745         if (rt == &ip6_null_entry ||
746             rt->rt6i_flags & RTF_CACHE)
747                 goto out;
748
749         dst_hold(&rt->u.dst);
750         read_unlock_bh(&table->tb6_lock);
751
752         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
753                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
754         else {
755 #if CLONE_OFFLINK_ROUTE
756                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
757 #else
758                 goto out2;
759 #endif
760         }
761
762         dst_release(&rt->u.dst);
763         rt = nrt ? : &ip6_null_entry;
764
765         dst_hold(&rt->u.dst);
766         if (nrt) {
767                 err = ip6_ins_rt(nrt);
768                 if (!err)
769                         goto out2;
770         }
771
772         if (--attempts <= 0)
773                 goto out2;
774
775         /*
776          * Race condition! In the gap, when table->tb6_lock was
777          * released someone could insert this route.  Relookup.
778          */
779         dst_release(&rt->u.dst);
780         goto relookup;
781
782 out:
783         if (reachable) {
784                 reachable = 0;
785                 goto restart_2;
786         }
787         dst_hold(&rt->u.dst);
788         read_unlock_bh(&table->tb6_lock);
789 out2:
790         rt->u.dst.lastuse = jiffies;
791         rt->u.dst.__use++;
792         return rt;
793 }
794
795 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
796 {
797         int flags = 0;
798
799         if (rt6_need_strict(&fl->fl6_dst))
800                 flags |= RT6_F_STRICT;
801
802         return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
803 }
804
805
806 /*
807  *      Destination cache support functions
808  */
809
810 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
811 {
812         struct rt6_info *rt;
813
814         rt = (struct rt6_info *) dst;
815
816         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
817                 return dst;
818
819         return NULL;
820 }
821
822 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
823 {
824         struct rt6_info *rt = (struct rt6_info *) dst;
825
826         if (rt) {
827                 if (rt->rt6i_flags & RTF_CACHE)
828                         ip6_del_rt(rt);
829                 else
830                         dst_release(dst);
831         }
832         return NULL;
833 }
834
835 static void ip6_link_failure(struct sk_buff *skb)
836 {
837         struct rt6_info *rt;
838
839         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
840
841         rt = (struct rt6_info *) skb->dst;
842         if (rt) {
843                 if (rt->rt6i_flags&RTF_CACHE) {
844                         dst_set_expires(&rt->u.dst, 0);
845                         rt->rt6i_flags |= RTF_EXPIRES;
846                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
847                         rt->rt6i_node->fn_sernum = -1;
848         }
849 }
850
851 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
852 {
853         struct rt6_info *rt6 = (struct rt6_info*)dst;
854
855         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
856                 rt6->rt6i_flags |= RTF_MODIFIED;
857                 if (mtu < IPV6_MIN_MTU) {
858                         mtu = IPV6_MIN_MTU;
859                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
860                 }
861                 dst->metrics[RTAX_MTU-1] = mtu;
862                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
863         }
864 }
865
866 static int ipv6_get_mtu(struct net_device *dev);
867
868 static inline unsigned int ipv6_advmss(unsigned int mtu)
869 {
870         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
871
872         if (mtu < ip6_rt_min_advmss)
873                 mtu = ip6_rt_min_advmss;
874
875         /*
876          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 
877          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
878          * IPV6_MAXPLEN is also valid and means: "any MSS, 
879          * rely only on pmtu discovery"
880          */
881         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
882                 mtu = IPV6_MAXPLEN;
883         return mtu;
884 }
885
/*
 * Singly linked list (through dst->next) of the entries created by
 * ndisc_dst_alloc().  ndisc_dst_gc() walks it and frees unreferenced
 * entries.  All accesses are made under ndisc_lock.
 */
static struct dst_entry *ndisc_dst_gc_list;
static DEFINE_SPINLOCK(ndisc_lock);
888
889 struct dst_entry *ndisc_dst_alloc(struct net_device *dev, 
890                                   struct neighbour *neigh,
891                                   struct in6_addr *addr,
892                                   int (*output)(struct sk_buff *))
893 {
894         struct rt6_info *rt;
895         struct inet6_dev *idev = in6_dev_get(dev);
896
897         if (unlikely(idev == NULL))
898                 return NULL;
899
900         rt = ip6_dst_alloc();
901         if (unlikely(rt == NULL)) {
902                 in6_dev_put(idev);
903                 goto out;
904         }
905
906         dev_hold(dev);
907         if (neigh)
908                 neigh_hold(neigh);
909         else
910                 neigh = ndisc_get_neigh(dev, addr);
911
912         rt->rt6i_dev      = dev;
913         rt->rt6i_idev     = idev;
914         rt->rt6i_nexthop  = neigh;
915         atomic_set(&rt->u.dst.__refcnt, 1);
916         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
917         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
918         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
919         rt->u.dst.output  = output;
920
921 #if 0   /* there's no chance to use these for ndisc */
922         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST 
923                                 ? DST_HOST 
924                                 : 0;
925         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
926         rt->rt6i_dst.plen = 128;
927 #endif
928
929         spin_lock_bh(&ndisc_lock);
930         rt->u.dst.next = ndisc_dst_gc_list;
931         ndisc_dst_gc_list = &rt->u.dst;
932         spin_unlock_bh(&ndisc_lock);
933
934         fib6_force_start_gc();
935
936 out:
937         return (struct dst_entry *)rt;
938 }
939
940 int ndisc_dst_gc(int *more)
941 {
942         struct dst_entry *dst, *next, **pprev;
943         int freed;
944
945         next = NULL;
946         freed = 0;
947
948         spin_lock_bh(&ndisc_lock);
949         pprev = &ndisc_dst_gc_list;
950
951         while ((dst = *pprev) != NULL) {
952                 if (!atomic_read(&dst->__refcnt)) {
953                         *pprev = dst->next;
954                         dst_free(dst);
955                         freed++;
956                 } else {
957                         pprev = &dst->next;
958                         (*more)++;
959                 }
960         }
961
962         spin_unlock_bh(&ndisc_lock);
963
964         return freed;
965 }
966
967 static int ip6_dst_gc(void)
968 {
969         static unsigned expire = 30*HZ;
970         static unsigned long last_gc;
971         unsigned long now = jiffies;
972
973         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
974             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
975                 goto out;
976
977         expire++;
978         fib6_run_gc(expire);
979         last_gc = now;
980         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
981                 expire = ip6_rt_gc_timeout>>1;
982
983 out:
984         expire -= expire>>ip6_rt_gc_elasticity;
985         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
986 }
987
988 /* Clean host part of a prefix. Not necessary in radix tree,
989    but results in cleaner routing tables.
990
991    Remove it only when all the things will work!
992  */
993
994 static int ipv6_get_mtu(struct net_device *dev)
995 {
996         int mtu = IPV6_MIN_MTU;
997         struct inet6_dev *idev;
998
999         idev = in6_dev_get(dev);
1000         if (idev) {
1001                 mtu = idev->cnf.mtu6;
1002                 in6_dev_put(idev);
1003         }
1004         return mtu;
1005 }
1006
1007 int ipv6_get_hoplimit(struct net_device *dev)
1008 {
1009         int hoplimit = ipv6_devconf.hop_limit;
1010         struct inet6_dev *idev;
1011
1012         idev = in6_dev_get(dev);
1013         if (idev) {
1014                 hoplimit = idev->cnf.hop_limit;
1015                 in6_dev_put(idev);
1016         }
1017         return hoplimit;
1018 }
1019
1020 /*
1021  *
1022  */
1023
1024 int ip6_route_add(struct fib6_config *cfg)
1025 {
1026         int err;
1027         struct rt6_info *rt = NULL;
1028         struct net_device *dev = NULL;
1029         struct inet6_dev *idev = NULL;
1030         struct fib6_table *table;
1031         int addr_type;
1032
1033         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1034                 return -EINVAL;
1035 #ifndef CONFIG_IPV6_SUBTREES
1036         if (cfg->fc_src_len)
1037                 return -EINVAL;
1038 #endif
1039         if (cfg->fc_ifindex) {
1040                 err = -ENODEV;
1041                 dev = dev_get_by_index(cfg->fc_ifindex);
1042                 if (!dev)
1043                         goto out;
1044                 idev = in6_dev_get(dev);
1045                 if (!idev)
1046                         goto out;
1047         }
1048
1049         if (cfg->fc_metric == 0)
1050                 cfg->fc_metric = IP6_RT_PRIO_USER;
1051
1052         table = fib6_new_table(cfg->fc_table);
1053         if (table == NULL) {
1054                 err = -ENOBUFS;
1055                 goto out;
1056         }
1057
1058         rt = ip6_dst_alloc();
1059
1060         if (rt == NULL) {
1061                 err = -ENOMEM;
1062                 goto out;
1063         }
1064
1065         rt->u.dst.obsolete = -1;
1066         rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1067
1068         if (cfg->fc_protocol == RTPROT_UNSPEC)
1069                 cfg->fc_protocol = RTPROT_BOOT;
1070         rt->rt6i_protocol = cfg->fc_protocol;
1071
1072         addr_type = ipv6_addr_type(&cfg->fc_dst);
1073
1074         if (addr_type & IPV6_ADDR_MULTICAST)
1075                 rt->u.dst.input = ip6_mc_input;
1076         else
1077                 rt->u.dst.input = ip6_forward;
1078
1079         rt->u.dst.output = ip6_output;
1080
1081         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1082         rt->rt6i_dst.plen = cfg->fc_dst_len;
1083         if (rt->rt6i_dst.plen == 128)
1084                rt->u.dst.flags = DST_HOST;
1085
1086 #ifdef CONFIG_IPV6_SUBTREES
1087         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1088         rt->rt6i_src.plen = cfg->fc_src_len;
1089 #endif
1090
1091         rt->rt6i_metric = cfg->fc_metric;
1092
1093         /* We cannot add true routes via loopback here,
1094            they would result in kernel looping; promote them to reject routes
1095          */
1096         if ((cfg->fc_flags & RTF_REJECT) ||
1097             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1098                 /* hold loopback dev/idev if we haven't done so. */
1099                 if (dev != &loopback_dev) {
1100                         if (dev) {
1101                                 dev_put(dev);
1102                                 in6_dev_put(idev);
1103                         }
1104                         dev = &loopback_dev;
1105                         dev_hold(dev);
1106                         idev = in6_dev_get(dev);
1107                         if (!idev) {
1108                                 err = -ENODEV;
1109                                 goto out;
1110                         }
1111                 }
1112                 rt->u.dst.output = ip6_pkt_discard_out;
1113                 rt->u.dst.input = ip6_pkt_discard;
1114                 rt->u.dst.error = -ENETUNREACH;
1115                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1116                 goto install_route;
1117         }
1118
1119         if (cfg->fc_flags & RTF_GATEWAY) {
1120                 struct in6_addr *gw_addr;
1121                 int gwa_type;
1122
1123                 gw_addr = &cfg->fc_gateway;
1124                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1125                 gwa_type = ipv6_addr_type(gw_addr);
1126
1127                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1128                         struct rt6_info *grt;
1129
1130                         /* IPv6 strictly inhibits using not link-local
1131                            addresses as nexthop address.
1132                            Otherwise, router will not able to send redirects.
1133                            It is very good, but in some (rare!) circumstances
1134                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1135                            some exceptions. --ANK
1136                          */
1137                         err = -EINVAL;
1138                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1139                                 goto out;
1140
1141                         grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1142
1143                         err = -EHOSTUNREACH;
1144                         if (grt == NULL)
1145                                 goto out;
1146                         if (dev) {
1147                                 if (dev != grt->rt6i_dev) {
1148                                         dst_release(&grt->u.dst);
1149                                         goto out;
1150                                 }
1151                         } else {
1152                                 dev = grt->rt6i_dev;
1153                                 idev = grt->rt6i_idev;
1154                                 dev_hold(dev);
1155                                 in6_dev_hold(grt->rt6i_idev);
1156                         }
1157                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1158                                 err = 0;
1159                         dst_release(&grt->u.dst);
1160
1161                         if (err)
1162                                 goto out;
1163                 }
1164                 err = -EINVAL;
1165                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1166                         goto out;
1167         }
1168
1169         err = -ENODEV;
1170         if (dev == NULL)
1171                 goto out;
1172
1173         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1174                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1175                 if (IS_ERR(rt->rt6i_nexthop)) {
1176                         err = PTR_ERR(rt->rt6i_nexthop);
1177                         rt->rt6i_nexthop = NULL;
1178                         goto out;
1179                 }
1180         }
1181
1182         rt->rt6i_flags = cfg->fc_flags;
1183
1184 install_route:
1185         if (cfg->fc_mx) {
1186                 struct nlattr *nla;
1187                 int remaining;
1188
1189                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1190                         int type = nla->nla_type;
1191
1192                         if (type) {
1193                                 if (type > RTAX_MAX) {
1194                                         err = -EINVAL;
1195                                         goto out;
1196                                 }
1197
1198                                 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1199                         }
1200                 }
1201         }
1202
1203         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1204                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1205         if (!rt->u.dst.metrics[RTAX_MTU-1])
1206                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1207         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1208                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1209         rt->u.dst.dev = dev;
1210         rt->rt6i_idev = idev;
1211         rt->rt6i_table = table;
1212         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1213
1214 out:
1215         if (dev)
1216                 dev_put(dev);
1217         if (idev)
1218                 in6_dev_put(idev);
1219         if (rt)
1220                 dst_free((struct dst_entry *) rt);
1221         return err;
1222 }
1223
1224 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1225 {
1226         int err;
1227         struct fib6_table *table;
1228
1229         if (rt == &ip6_null_entry)
1230                 return -ENOENT;
1231
1232         table = rt->rt6i_table;
1233         write_lock_bh(&table->tb6_lock);
1234
1235         err = fib6_del(rt, info);
1236         dst_release(&rt->u.dst);
1237
1238         write_unlock_bh(&table->tb6_lock);
1239
1240         return err;
1241 }
1242
1243 int ip6_del_rt(struct rt6_info *rt)
1244 {
1245         return __ip6_del_rt(rt, NULL);
1246 }
1247
1248 static int ip6_route_del(struct fib6_config *cfg)
1249 {
1250         struct fib6_table *table;
1251         struct fib6_node *fn;
1252         struct rt6_info *rt;
1253         int err = -ESRCH;
1254
1255         table = fib6_get_table(cfg->fc_table);
1256         if (table == NULL)
1257                 return err;
1258
1259         read_lock_bh(&table->tb6_lock);
1260
1261         fn = fib6_locate(&table->tb6_root,
1262                          &cfg->fc_dst, cfg->fc_dst_len,
1263                          &cfg->fc_src, cfg->fc_src_len);
1264         
1265         if (fn) {
1266                 for (rt = fn->leaf; rt; rt = rt->u.next) {
1267                         if (cfg->fc_ifindex &&
1268                             (rt->rt6i_dev == NULL ||
1269                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1270                                 continue;
1271                         if (cfg->fc_flags & RTF_GATEWAY &&
1272                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1273                                 continue;
1274                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1275                                 continue;
1276                         dst_hold(&rt->u.dst);
1277                         read_unlock_bh(&table->tb6_lock);
1278
1279                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1280                 }
1281         }
1282         read_unlock_bh(&table->tb6_lock);
1283
1284         return err;
1285 }
1286
1287 /*
1288  *      Handle redirects
1289  */
/*
 * Flow key used for redirect lookups: a regular flowi extended with the
 * address of the router the redirect is checked against.
 *
 * 'fl' must remain the first member: ip6_route_redirect() passes a pointer
 * to this structure cast to struct flowi *, and __ip6_route_redirect()
 * casts it back.
 */
struct ip6rd_flowi {
	struct flowi fl;		/* ordinary flow description */
	struct in6_addr gateway;	/* compared with each route's rt6i_gateway */
};
1294
1295 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1296                                              struct flowi *fl,
1297                                              int flags)
1298 {
1299         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1300         struct rt6_info *rt;
1301         struct fib6_node *fn;
1302
1303         /*
1304          * Get the "current" route for this destination and
1305          * check if the redirect has come from approriate router.
1306          *
1307          * RFC 2461 specifies that redirects should only be
1308          * accepted if they come from the nexthop to the target.
1309          * Due to the way the routes are chosen, this notion
1310          * is a bit fuzzy and one might need to check all possible
1311          * routes.
1312          */
1313
1314         read_lock_bh(&table->tb6_lock);
1315         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1316 restart:
1317         for (rt = fn->leaf; rt; rt = rt->u.next) {
1318                 /*
1319                  * Current route is on-link; redirect is always invalid.
1320                  *
1321                  * Seems, previous statement is not true. It could
1322                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1323                  * But then router serving it might decide, that we should
1324                  * know truth 8)8) --ANK (980726).
1325                  */
1326                 if (rt6_check_expired(rt))
1327                         continue;
1328                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1329                         continue;
1330                 if (fl->oif != rt->rt6i_dev->ifindex)
1331                         continue;
1332                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1333                         continue;
1334                 break;
1335         }
1336
1337         if (!rt)
1338                 rt = &ip6_null_entry;
1339         BACKTRACK(&fl->fl6_src);
1340 out:
1341         dst_hold(&rt->u.dst);
1342
1343         read_unlock_bh(&table->tb6_lock);
1344
1345         return rt;
1346 };
1347
1348 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1349                                            struct in6_addr *src,
1350                                            struct in6_addr *gateway,
1351                                            struct net_device *dev)
1352 {
1353         struct ip6rd_flowi rdfl = {
1354                 .fl = {
1355                         .oif = dev->ifindex,
1356                         .nl_u = {
1357                                 .ip6_u = {
1358                                         .daddr = *dest,
1359                                         .saddr = *src,
1360                                 },
1361                         },
1362                 },
1363                 .gateway = *gateway,
1364         };
1365         int flags = rt6_need_strict(dest) ? RT6_F_STRICT : 0;
1366
1367         return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
1368 }
1369
1370 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1371                   struct in6_addr *saddr,
1372                   struct neighbour *neigh, u8 *lladdr, int on_link)
1373 {
1374         struct rt6_info *rt, *nrt = NULL;
1375         struct netevent_redirect netevent;
1376
1377         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1378
1379         if (rt == &ip6_null_entry) {
1380                 if (net_ratelimit())
1381                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1382                                "for redirect target\n");
1383                 goto out;
1384         }
1385
1386         /*
1387          *      We have finally decided to accept it.
1388          */
1389
1390         neigh_update(neigh, lladdr, NUD_STALE, 
1391                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1392                      NEIGH_UPDATE_F_OVERRIDE|
1393                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1394                                      NEIGH_UPDATE_F_ISROUTER))
1395                      );
1396
1397         /*
1398          * Redirect received -> path was valid.
1399          * Look, redirects are sent only in response to data packets,
1400          * so that this nexthop apparently is reachable. --ANK
1401          */
1402         dst_confirm(&rt->u.dst);
1403
1404         /* Duplicate redirect: silently ignore. */
1405         if (neigh == rt->u.dst.neighbour)
1406                 goto out;
1407
1408         nrt = ip6_rt_copy(rt);
1409         if (nrt == NULL)
1410                 goto out;
1411
1412         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1413         if (on_link)
1414                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1415
1416         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1417         nrt->rt6i_dst.plen = 128;
1418         nrt->u.dst.flags |= DST_HOST;
1419
1420         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1421         nrt->rt6i_nexthop = neigh_clone(neigh);
1422         /* Reset pmtu, it may be better */
1423         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1424         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1425
1426         if (ip6_ins_rt(nrt))
1427                 goto out;
1428
1429         netevent.old = &rt->u.dst;
1430         netevent.new = &nrt->u.dst;
1431         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1432
1433         if (rt->rt6i_flags&RTF_CACHE) {
1434                 ip6_del_rt(rt);
1435                 return;
1436         }
1437
1438 out:
1439         dst_release(&rt->u.dst);
1440         return;
1441 }
1442
1443 /*
1444  *      Handle ICMP "packet too big" messages
1445  *      i.e. Path MTU discovery
1446  */
1447
1448 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1449                         struct net_device *dev, u32 pmtu)
1450 {
1451         struct rt6_info *rt, *nrt;
1452         int allfrag = 0;
1453
1454         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1455         if (rt == NULL)
1456                 return;
1457
1458         if (pmtu >= dst_mtu(&rt->u.dst))
1459                 goto out;
1460
1461         if (pmtu < IPV6_MIN_MTU) {
1462                 /*
1463                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link 
1464                  * MTU (1280) and a fragment header should always be included
1465                  * after a node receiving Too Big message reporting PMTU is
1466                  * less than the IPv6 Minimum Link MTU.
1467                  */
1468                 pmtu = IPV6_MIN_MTU;
1469                 allfrag = 1;
1470         }
1471
1472         /* New mtu received -> path was valid.
1473            They are sent only in response to data packets,
1474            so that this nexthop apparently is reachable. --ANK
1475          */
1476         dst_confirm(&rt->u.dst);
1477
1478         /* Host route. If it is static, it would be better
1479            not to override it, but add new one, so that
1480            when cache entry will expire old pmtu
1481            would return automatically.
1482          */
1483         if (rt->rt6i_flags & RTF_CACHE) {
1484                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1485                 if (allfrag)
1486                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1487                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1488                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1489                 goto out;
1490         }
1491
1492         /* Network route.
1493            Two cases are possible:
1494            1. It is connected route. Action: COW
1495            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1496          */
1497         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1498                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1499         else
1500                 nrt = rt6_alloc_clone(rt, daddr);
1501
1502         if (nrt) {
1503                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1504                 if (allfrag)
1505                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1506
1507                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1508                  * happened within 5 mins, the recommended timer is 10 mins.
1509                  * Here this route expiration time is set to ip6_rt_mtu_expires
1510                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1511                  * and detecting PMTU increase will be automatically happened.
1512                  */
1513                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1514                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1515
1516                 ip6_ins_rt(nrt);
1517         }
1518 out:
1519         dst_release(&rt->u.dst);
1520 }
1521
1522 /*
1523  *      Misc support functions
1524  */
1525
1526 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1527 {
1528         struct rt6_info *rt = ip6_dst_alloc();
1529
1530         if (rt) {
1531                 rt->u.dst.input = ort->u.dst.input;
1532                 rt->u.dst.output = ort->u.dst.output;
1533
1534                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1535                 rt->u.dst.dev = ort->u.dst.dev;
1536                 if (rt->u.dst.dev)
1537                         dev_hold(rt->u.dst.dev);
1538                 rt->rt6i_idev = ort->rt6i_idev;
1539                 if (rt->rt6i_idev)
1540                         in6_dev_hold(rt->rt6i_idev);
1541                 rt->u.dst.lastuse = jiffies;
1542                 rt->rt6i_expires = 0;
1543
1544                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1545                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1546                 rt->rt6i_metric = 0;
1547
1548                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1549 #ifdef CONFIG_IPV6_SUBTREES
1550                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1551 #endif
1552                 rt->rt6i_table = ort->rt6i_table;
1553         }
1554         return rt;
1555 }
1556
1557 #ifdef CONFIG_IPV6_ROUTE_INFO
1558 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1559                                            struct in6_addr *gwaddr, int ifindex)
1560 {
1561         struct fib6_node *fn;
1562         struct rt6_info *rt = NULL;
1563         struct fib6_table *table;
1564
1565         table = fib6_get_table(RT6_TABLE_INFO);
1566         if (table == NULL)
1567                 return NULL;
1568
1569         write_lock_bh(&table->tb6_lock);
1570         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1571         if (!fn)
1572                 goto out;
1573
1574         for (rt = fn->leaf; rt; rt = rt->u.next) {
1575                 if (rt->rt6i_dev->ifindex != ifindex)
1576                         continue;
1577                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1578                         continue;
1579                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1580                         continue;
1581                 dst_hold(&rt->u.dst);
1582                 break;
1583         }
1584 out:
1585         write_unlock_bh(&table->tb6_lock);
1586         return rt;
1587 }
1588
1589 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1590                                            struct in6_addr *gwaddr, int ifindex,
1591                                            unsigned pref)
1592 {
1593         struct fib6_config cfg = {
1594                 .fc_table       = RT6_TABLE_INFO,
1595                 .fc_metric      = 1024,
1596                 .fc_ifindex     = ifindex,
1597                 .fc_dst_len     = prefixlen,
1598                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1599                                   RTF_UP | RTF_PREF(pref),
1600         };
1601
1602         ipv6_addr_copy(&cfg.fc_dst, prefix);
1603         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1604
1605         /* We should treat it as a default route if prefix length is 0. */
1606         if (!prefixlen)
1607                 cfg.fc_flags |= RTF_DEFAULT;
1608
1609         ip6_route_add(&cfg);
1610
1611         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1612 }
1613 #endif
1614
1615 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1616 {       
1617         struct rt6_info *rt;
1618         struct fib6_table *table;
1619
1620         table = fib6_get_table(RT6_TABLE_DFLT);
1621         if (table == NULL)
1622                 return NULL;
1623
1624         write_lock_bh(&table->tb6_lock);
1625         for (rt = table->tb6_root.leaf; rt; rt=rt->u.next) {
1626                 if (dev == rt->rt6i_dev &&
1627                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1628                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1629                         break;
1630         }
1631         if (rt)
1632                 dst_hold(&rt->u.dst);
1633         write_unlock_bh(&table->tb6_lock);
1634         return rt;
1635 }
1636
1637 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1638                                      struct net_device *dev,
1639                                      unsigned int pref)
1640 {
1641         struct fib6_config cfg = {
1642                 .fc_table       = RT6_TABLE_DFLT,
1643                 .fc_metric      = 1024,
1644                 .fc_ifindex     = dev->ifindex,
1645                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1646                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1647         };
1648
1649         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1650
1651         ip6_route_add(&cfg);
1652
1653         return rt6_get_dflt_router(gwaddr, dev);
1654 }
1655
1656 void rt6_purge_dflt_routers(void)
1657 {
1658         struct rt6_info *rt;
1659         struct fib6_table *table;
1660
1661         /* NOTE: Keep consistent with rt6_get_dflt_router */
1662         table = fib6_get_table(RT6_TABLE_DFLT);
1663         if (table == NULL)
1664                 return;
1665
1666 restart:
1667         read_lock_bh(&table->tb6_lock);
1668         for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
1669                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1670                         dst_hold(&rt->u.dst);
1671                         read_unlock_bh(&table->tb6_lock);
1672                         ip6_del_rt(rt);
1673                         goto restart;
1674                 }
1675         }
1676         read_unlock_bh(&table->tb6_lock);
1677 }
1678
1679 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1680                                  struct fib6_config *cfg)
1681 {
1682         memset(cfg, 0, sizeof(*cfg));
1683
1684         cfg->fc_table = RT6_TABLE_MAIN;
1685         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1686         cfg->fc_metric = rtmsg->rtmsg_metric;
1687         cfg->fc_expires = rtmsg->rtmsg_info;
1688         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1689         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1690         cfg->fc_flags = rtmsg->rtmsg_flags;
1691
1692         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1693         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1694         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1695 }
1696
1697 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1698 {
1699         struct fib6_config cfg;
1700         struct in6_rtmsg rtmsg;
1701         int err;
1702
1703         switch(cmd) {
1704         case SIOCADDRT:         /* Add a route */
1705         case SIOCDELRT:         /* Delete a route */
1706                 if (!capable(CAP_NET_ADMIN))
1707                         return -EPERM;
1708                 err = copy_from_user(&rtmsg, arg,
1709                                      sizeof(struct in6_rtmsg));
1710                 if (err)
1711                         return -EFAULT;
1712
1713                 rtmsg_to_fib6_config(&rtmsg, &cfg);
1714
1715                 rtnl_lock();
1716                 switch (cmd) {
1717                 case SIOCADDRT:
1718                         err = ip6_route_add(&cfg);
1719                         break;
1720                 case SIOCDELRT:
1721                         err = ip6_route_del(&cfg);
1722                         break;
1723                 default:
1724                         err = -EINVAL;
1725                 }
1726                 rtnl_unlock();
1727
1728                 return err;
1729         };
1730
1731         return -EINVAL;
1732 }
1733
1734 /*
1735  *      Drop the packet on the floor
1736  */
1737
1738 static int ip6_pkt_discard(struct sk_buff *skb)
1739 {
1740         int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1741         if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1742                 IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1743
1744         IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1745         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1746         kfree_skb(skb);
1747         return 0;
1748 }
1749
1750 static int ip6_pkt_discard_out(struct sk_buff *skb)
1751 {
1752         skb->dev = skb->dst->dev;
1753         return ip6_pkt_discard(skb);
1754 }
1755
1756 /*
1757  *      Allocate a dst for local (unicast / anycast) address.
1758  */
1759
1760 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1761                                     const struct in6_addr *addr,
1762                                     int anycast)
1763 {
1764         struct rt6_info *rt = ip6_dst_alloc();
1765
1766         if (rt == NULL)
1767                 return ERR_PTR(-ENOMEM);
1768
1769         dev_hold(&loopback_dev);
1770         in6_dev_hold(idev);
1771
1772         rt->u.dst.flags = DST_HOST;
1773         rt->u.dst.input = ip6_input;
1774         rt->u.dst.output = ip6_output;
1775         rt->rt6i_dev = &loopback_dev;
1776         rt->rt6i_idev = idev;
1777         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1778         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1779         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1780         rt->u.dst.obsolete = -1;
1781
1782         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1783         if (anycast)
1784                 rt->rt6i_flags |= RTF_ANYCAST;
1785         else
1786                 rt->rt6i_flags |= RTF_LOCAL;
1787         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1788         if (rt->rt6i_nexthop == NULL) {
1789                 dst_free((struct dst_entry *) rt);
1790                 return ERR_PTR(-ENOMEM);
1791         }
1792
1793         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1794         rt->rt6i_dst.plen = 128;
1795         rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1796
1797         atomic_set(&rt->u.dst.__refcnt, 1);
1798
1799         return rt;
1800 }
1801
1802 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1803 {
1804         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1805             rt != &ip6_null_entry) {
1806                 RT6_TRACE("deleted by ifdown %p\n", rt);
1807                 return -1;
1808         }
1809         return 0;
1810 }
1811
1812 void rt6_ifdown(struct net_device *dev)
1813 {
1814         fib6_clean_all(fib6_ifdown, 0, dev);
1815 }
1816
/* Argument bundle handed from rt6_mtu_change() to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg
{
        struct net_device *dev;         /* device whose MTU changed */
        unsigned int mtu;               /* the new MTU */
};
1822
1823 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1824 {
1825         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1826         struct inet6_dev *idev;
1827
1828         /* In IPv6 pmtu discovery is not optional,
1829            so that RTAX_MTU lock cannot disable it.
1830            We still use this lock to block changes
1831            caused by addrconf/ndisc.
1832         */
1833
1834         idev = __in6_dev_get(arg->dev);
1835         if (idev == NULL)
1836                 return 0;
1837
1838         /* For administrative MTU increase, there is no way to discover
1839            IPv6 PMTU increase, so PMTU increase should be updated here.
1840            Since RFC 1981 doesn't include administrative MTU increase
1841            update PMTU increase is a MUST. (i.e. jumbo frame)
1842          */
1843         /*
1844            If new MTU is less than route PMTU, this new MTU will be the
1845            lowest MTU in the path, update the route PMTU to reflect PMTU
1846            decreases; if new MTU is greater than route PMTU, and the
1847            old MTU is the lowest MTU in the path, update the route PMTU
1848            to reflect the increase. In this case if the other nodes' MTU
1849            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1850            PMTU discouvery.
1851          */
1852         if (rt->rt6i_dev == arg->dev &&
1853             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1854             (dst_mtu(&rt->u.dst) > arg->mtu ||
1855              (dst_mtu(&rt->u.dst) < arg->mtu &&
1856               dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1857                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1858         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1859         return 0;
1860 }
1861
1862 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1863 {
1864         struct rt6_mtu_change_arg arg = {
1865                 .dev = dev,
1866                 .mtu = mtu,
1867         };
1868
1869         fib6_clean_all(rt6_mtu_change_route, 0, &arg);
1870 }
1871
1872 static struct nla_policy rtm_ipv6_policy[RTA_MAX+1] __read_mostly = {
1873         [RTA_GATEWAY]           = { .minlen = sizeof(struct in6_addr) },
1874         [RTA_OIF]               = { .type = NLA_U32 },
1875         [RTA_IIF]               = { .type = NLA_U32 },
1876         [RTA_PRIORITY]          = { .type = NLA_U32 },
1877         [RTA_METRICS]           = { .type = NLA_NESTED },
1878 };
1879
1880 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1881                               struct fib6_config *cfg)
1882 {
1883         struct rtmsg *rtm;
1884         struct nlattr *tb[RTA_MAX+1];
1885         int err;
1886
1887         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1888         if (err < 0)
1889                 goto errout;
1890
1891         err = -EINVAL;
1892         rtm = nlmsg_data(nlh);
1893         memset(cfg, 0, sizeof(*cfg));
1894
1895         cfg->fc_table = rtm->rtm_table;
1896         cfg->fc_dst_len = rtm->rtm_dst_len;
1897         cfg->fc_src_len = rtm->rtm_src_len;
1898         cfg->fc_flags = RTF_UP;
1899         cfg->fc_protocol = rtm->rtm_protocol;
1900
1901         if (rtm->rtm_type == RTN_UNREACHABLE)
1902                 cfg->fc_flags |= RTF_REJECT;
1903
1904         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1905         cfg->fc_nlinfo.nlh = nlh;
1906
1907         if (tb[RTA_GATEWAY]) {
1908                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1909                 cfg->fc_flags |= RTF_GATEWAY;
1910         }
1911
1912         if (tb[RTA_DST]) {
1913                 int plen = (rtm->rtm_dst_len + 7) >> 3;
1914
1915                 if (nla_len(tb[RTA_DST]) < plen)
1916                         goto errout;
1917
1918                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1919         }
1920
1921         if (tb[RTA_SRC]) {
1922                 int plen = (rtm->rtm_src_len + 7) >> 3;
1923
1924                 if (nla_len(tb[RTA_SRC]) < plen)
1925                         goto errout;
1926
1927                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1928         }
1929
1930         if (tb[RTA_OIF])
1931                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1932
1933         if (tb[RTA_PRIORITY])
1934                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
1935
1936         if (tb[RTA_METRICS]) {
1937                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
1938                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
1939         }
1940
1941         if (tb[RTA_TABLE])
1942                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
1943
1944         err = 0;
1945 errout:
1946         return err;
1947 }
1948
1949 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1950 {
1951         struct fib6_config cfg;
1952         int err;
1953
1954         err = rtm_to_fib6_config(skb, nlh, &cfg);
1955         if (err < 0)
1956                 return err;
1957
1958         return ip6_route_del(&cfg);
1959 }
1960
1961 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1962 {
1963         struct fib6_config cfg;
1964         int err;
1965
1966         err = rtm_to_fib6_config(skb, nlh, &cfg);
1967         if (err < 0)
1968                 return err;
1969
1970         return ip6_route_add(&cfg);
1971 }
1972
/*
 * Append one route message describing @rt to @skb.
 *
 * @dst, @src: when non-NULL they are reported as /128 RTA_DST/RTA_SRC
 *             instead of the route's own prefixes (RTA_SRC only with
 *             CONFIG_IPV6_SUBTREES)
 * @iif:       when non-zero, reported as RTA_IIF
 * @type, @pid, @seq, @flags: passed through to nlmsg_put()
 * @prefix:    when non-zero, only routes flagged RTF_PREFIX_RT are written
 *
 * Returns 1 without touching @skb when @prefix filtering skips the
 * route, -ENOBUFS when nlmsg_put() fails, the result of nlmsg_end() on
 * success, and the result of nlmsg_cancel() when an attribute cannot be
 * added.
 *
 * NOTE(review): the NLA_PUT*() macros are expected to jump to
 * nla_put_failure on lack of space -- confirm against their definition.
 */
static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
                         struct in6_addr *dst, struct in6_addr *src,
                         int iif, int type, u32 pid, u32 seq,
                         int prefix, unsigned int flags)
{
        struct rtmsg *rtm;
        struct nlmsghdr *nlh;
        struct rta_cacheinfo ci;
        u32 table;

        if (prefix) {   /* user wants prefix routes only */
                if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
                        /* success since this is not a prefix route */
                        return 1;
                }
        }

        nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
        if (nlh == NULL)
                return -ENOBUFS;

        /* Fixed rtmsg header. */
        rtm = nlmsg_data(nlh);
        rtm->rtm_family = AF_INET6;
        rtm->rtm_dst_len = rt->rt6i_dst.plen;
        rtm->rtm_src_len = rt->rt6i_src.plen;
        rtm->rtm_tos = 0;
        if (rt->rt6i_table)
                table = rt->rt6i_table->tb6_id;
        else
                table = RT6_TABLE_UNSPEC;
        /* The table id goes both in the header and in an RTA_TABLE attribute. */
        rtm->rtm_table = table;
        NLA_PUT_U32(skb, RTA_TABLE, table);
        if (rt->rt6i_flags&RTF_REJECT)
                rtm->rtm_type = RTN_UNREACHABLE;
        else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
                rtm->rtm_type = RTN_LOCAL;
        else
                rtm->rtm_type = RTN_UNICAST;
        rtm->rtm_flags = 0;
        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
        /* Route flags, when set, override the stored protocol value. */
        rtm->rtm_protocol = rt->rt6i_protocol;
        if (rt->rt6i_flags&RTF_DYNAMIC)
                rtm->rtm_protocol = RTPROT_REDIRECT;
        else if (rt->rt6i_flags & RTF_ADDRCONF)
                rtm->rtm_protocol = RTPROT_KERNEL;
        else if (rt->rt6i_flags&RTF_DEFAULT)
                rtm->rtm_protocol = RTPROT_RA;

        if (rt->rt6i_flags&RTF_CACHE)
                rtm->rtm_flags |= RTM_F_CLONED;

        /* An explicit dst/src is reported as a full /128 address. */
        if (dst) {
                NLA_PUT(skb, RTA_DST, 16, dst);
                rtm->rtm_dst_len = 128;
        } else if (rtm->rtm_dst_len)
                NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
#ifdef CONFIG_IPV6_SUBTREES
        if (src) {
                NLA_PUT(skb, RTA_SRC, 16, src);
                rtm->rtm_src_len = 128;
        } else if (rtm->rtm_src_len)
                NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
#endif
        if (iif)
                NLA_PUT_U32(skb, RTA_IIF, iif);
        else if (dst) {
                /* No input interface: report the source address chosen for dst. */
                struct in6_addr saddr_buf;
                if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
                        NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
        }

        if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
                goto nla_put_failure;

        if (rt->u.dst.neighbour)
                NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);

        if (rt->u.dst.dev)
                NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);

        NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
        /* Cache statistics; times are converted from jiffies to clock_t. */
        ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
        if (rt->rt6i_expires)
                ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
        else
                ci.rta_expires = 0;
        ci.rta_used = rt->u.dst.__use;
        ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
        ci.rta_error = rt->u.dst.error;
        ci.rta_id = 0;
        ci.rta_ts = 0;
        ci.rta_tsage = 0;
        NLA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);

        return nlmsg_end(skb, nlh);

nla_put_failure:
        return nlmsg_cancel(skb, nlh);
}
2072
2073 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2074 {
2075         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2076         int prefix;
2077
2078         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2079                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2080                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2081         } else
2082                 prefix = 0;
2083
2084         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2085                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2086                      prefix, NLM_F_MULTI);
2087 }
2088
2089 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2090 {
2091         struct nlattr *tb[RTA_MAX+1];
2092         struct rt6_info *rt;
2093         struct sk_buff *skb;
2094         struct rtmsg *rtm;
2095         struct flowi fl;
2096         int err, iif = 0;
2097
2098         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2099         if (err < 0)
2100                 goto errout;
2101
2102         err = -EINVAL;
2103         memset(&fl, 0, sizeof(fl));
2104
2105         if (tb[RTA_SRC]) {
2106                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2107                         goto errout;
2108
2109                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2110         }
2111
2112         if (tb[RTA_DST]) {
2113                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2114                         goto errout;
2115
2116                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2117         }
2118
2119         if (tb[RTA_IIF])
2120                 iif = nla_get_u32(tb[RTA_IIF]);
2121
2122         if (tb[RTA_OIF])
2123                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2124
2125         if (iif) {
2126                 struct net_device *dev;
2127                 dev = __dev_get_by_index(iif);
2128                 if (!dev) {
2129                         err = -ENODEV;
2130                         goto errout;
2131                 }
2132         }
2133
2134         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2135         if (skb == NULL) {
2136                 err = -ENOBUFS;
2137                 goto errout;
2138         }
2139
2140         /* Reserve room for dummy headers, this skb can pass
2141            through good chunk of routing engine.
2142          */
2143         skb->mac.raw = skb->data;
2144         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2145
2146         rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2147         skb->dst = &rt->u.dst;
2148
2149         err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2150                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2151                             nlh->nlmsg_seq, 0, 0);
2152         if (err < 0) {
2153                 kfree_skb(skb);
2154                 goto errout;
2155         }
2156
2157         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2158 errout:
2159         return err;
2160 }
2161
2162 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2163 {
2164         struct sk_buff *skb;
2165         u32 pid = 0, seq = 0;
2166         struct nlmsghdr *nlh = NULL;
2167         int payload = sizeof(struct rtmsg) + 256;
2168         int err = -ENOBUFS;
2169
2170         if (info) {
2171                 pid = info->pid;
2172                 nlh = info->nlh;
2173                 if (nlh)
2174                         seq = nlh->nlmsg_seq;
2175         }
2176
2177         skb = nlmsg_new(nlmsg_total_size(payload), gfp_any());
2178         if (skb == NULL)
2179                 goto errout;
2180
2181         err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2182         if (err < 0) {
2183                 kfree_skb(skb);
2184                 goto errout;
2185         }
2186
2187         err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2188 errout:
2189         if (err < 0)
2190                 rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2191 }
2192
2193 /*
2194  *      /proc
2195  */
2196
2197 #ifdef CONFIG_PROC_FS
2198
/*
 * Size used to convert a byte offset into a number of route lines and
 * back (see rt6_info_route() / rt6_proc_info()).
 * NOTE(review): presumably the length of one formatted route line; this
 * only holds while the device name fits the "%8s" field -- confirm.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* State shared between rt6_proc_info() and its per-route callback. */
struct rt6_proc_arg
{
        char *buffer;   /* output buffer */
        int offset;     /* byte offset requested by the reader */
        int length;     /* number of bytes requested by the reader */
        int skip;       /* routes skipped so far to reach the offset */
        int len;        /* bytes written to buffer so far */
};
2209
2210 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2211 {
2212         struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2213         int i;
2214
2215         if (arg->skip < arg->offset / RT6_INFO_LEN) {
2216                 arg->skip++;
2217                 return 0;
2218         }
2219
2220         if (arg->len >= arg->length)
2221                 return 0;
2222
2223         for (i=0; i<16; i++) {
2224                 sprintf(arg->buffer + arg->len, "%02x",
2225                         rt->rt6i_dst.addr.s6_addr[i]);
2226                 arg->len += 2;
2227         }
2228         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2229                             rt->rt6i_dst.plen);
2230
2231 #ifdef CONFIG_IPV6_SUBTREES
2232         for (i=0; i<16; i++) {
2233                 sprintf(arg->buffer + arg->len, "%02x",
2234                         rt->rt6i_src.addr.s6_addr[i]);
2235                 arg->len += 2;
2236         }
2237         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2238                             rt->rt6i_src.plen);
2239 #else
2240         sprintf(arg->buffer + arg->len,
2241                 "00000000000000000000000000000000 00 ");
2242         arg->len += 36;
2243 #endif
2244
2245         if (rt->rt6i_nexthop) {
2246                 for (i=0; i<16; i++) {
2247                         sprintf(arg->buffer + arg->len, "%02x",
2248                                 rt->rt6i_nexthop->primary_key[i]);
2249                         arg->len += 2;
2250                 }
2251         } else {
2252                 sprintf(arg->buffer + arg->len,
2253                         "00000000000000000000000000000000");
2254                 arg->len += 32;
2255         }
2256         arg->len += sprintf(arg->buffer + arg->len,
2257                             " %08x %08x %08x %08x %8s\n",
2258                             rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2259                             rt->u.dst.__use, rt->rt6i_flags, 
2260                             rt->rt6i_dev ? rt->rt6i_dev->name : "");
2261         return 0;
2262 }
2263
2264 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2265 {
2266         struct rt6_proc_arg arg = {
2267                 .buffer = buffer,
2268                 .offset = offset,
2269                 .length = length,
2270         };
2271
2272         fib6_clean_all(rt6_info_route, 0, &arg);
2273
2274         *start = buffer;
2275         if (offset)
2276                 *start += offset % RT6_INFO_LEN;
2277
2278         arg.len -= offset % RT6_INFO_LEN;
2279
2280         if (arg.len > length)
2281                 arg.len = length;
2282         if (arg.len < 0)
2283                 arg.len = 0;
2284
2285         return arg.len;
2286 }
2287
2288 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2289 {
2290         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2291                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2292                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2293                       rt6_stats.fib_rt_cache,
2294                       atomic_read(&ip6_dst_ops.entries),
2295                       rt6_stats.fib_discarded_routes);
2296
2297         return 0;
2298 }
2299
2300 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2301 {
2302         return single_open(file, rt6_stats_seq_show, NULL);
2303 }
2304
/*
 * File operations for the "rt6_stats" proc entry created in
 * ip6_route_init(); opened via single_open(), so released with
 * single_release().
 */
static struct file_operations rt6_stats_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt6_stats_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release,
};
2312 #endif  /* CONFIG_PROC_FS */
2313
2314 #ifdef CONFIG_SYSCTL
2315
2316 static int flush_delay;
2317
2318 static
2319 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2320                               void __user *buffer, size_t *lenp, loff_t *ppos)
2321 {
2322         if (write) {
2323                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2324                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2325                 return 0;
2326         } else
2327                 return -EINVAL;
2328 }
2329
2330 ctl_table ipv6_route_table[] = {
2331         {
2332                 .ctl_name       =       NET_IPV6_ROUTE_FLUSH, 
2333                 .procname       =       "flush",
2334                 .data           =       &flush_delay,
2335                 .maxlen         =       sizeof(int),
2336                 .mode           =       0200,
2337                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2338         },
2339         {
2340                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2341                 .procname       =       "gc_thresh",
2342                 .data           =       &ip6_dst_ops.gc_thresh,
2343                 .maxlen         =       sizeof(int),
2344                 .mode           =       0644,
2345                 .proc_handler   =       &proc_dointvec,
2346         },
2347         {
2348                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2349                 .procname       =       "max_size",
2350                 .data           =       &ip6_rt_max_size,
2351                 .maxlen         =       sizeof(int),
2352                 .mode           =       0644,
2353                 .proc_handler   =       &proc_dointvec,
2354         },
2355         {
2356                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2357                 .procname       =       "gc_min_interval",
2358                 .data           =       &ip6_rt_gc_min_interval,
2359                 .maxlen         =       sizeof(int),
2360                 .mode           =       0644,
2361                 .proc_handler   =       &proc_dointvec_jiffies,
2362                 .strategy       =       &sysctl_jiffies,
2363         },
2364         {
2365                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2366                 .procname       =       "gc_timeout",
2367                 .data           =       &ip6_rt_gc_timeout,
2368                 .maxlen         =       sizeof(int),
2369                 .mode           =       0644,
2370                 .proc_handler   =       &proc_dointvec_jiffies,
2371                 .strategy       =       &sysctl_jiffies,
2372         },
2373         {
2374                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2375                 .procname       =       "gc_interval",
2376                 .data           =       &ip6_rt_gc_interval,
2377                 .maxlen         =       sizeof(int),
2378                 .mode           =       0644,
2379                 .proc_handler   =       &proc_dointvec_jiffies,
2380                 .strategy       =       &sysctl_jiffies,
2381         },
2382         {
2383                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2384                 .procname       =       "gc_elasticity",
2385                 .data           =       &ip6_rt_gc_elasticity,
2386                 .maxlen         =       sizeof(int),
2387                 .mode           =       0644,
2388                 .proc_handler   =       &proc_dointvec_jiffies,
2389                 .strategy       =       &sysctl_jiffies,
2390         },
2391         {
2392                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2393                 .procname       =       "mtu_expires",
2394                 .data           =       &ip6_rt_mtu_expires,
2395                 .maxlen         =       sizeof(int),
2396                 .mode           =       0644,
2397                 .proc_handler   =       &proc_dointvec_jiffies,
2398                 .strategy       =       &sysctl_jiffies,
2399         },
2400         {
2401                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2402                 .procname       =       "min_adv_mss",
2403                 .data           =       &ip6_rt_min_advmss,
2404                 .maxlen         =       sizeof(int),
2405                 .mode           =       0644,
2406                 .proc_handler   =       &proc_dointvec_jiffies,
2407                 .strategy       =       &sysctl_jiffies,
2408         },
2409         {
2410                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2411                 .procname       =       "gc_min_interval_ms",
2412                 .data           =       &ip6_rt_gc_min_interval,
2413                 .maxlen         =       sizeof(int),
2414                 .mode           =       0644,
2415                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2416                 .strategy       =       &sysctl_ms_jiffies,
2417         },
2418         { .ctl_name = 0 }
2419 };
2420
2421 #endif
2422
2423 void __init ip6_route_init(void)
2424 {
2425         struct proc_dir_entry *p;
2426
2427         ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2428                                                      sizeof(struct rt6_info),
2429                                                      0, SLAB_HWCACHE_ALIGN,
2430                                                      NULL, NULL);
2431         if (!ip6_dst_ops.kmem_cachep)
2432                 panic("cannot create ip6_dst_cache");
2433
2434         fib6_init();
2435 #ifdef  CONFIG_PROC_FS
2436         p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2437         if (p)
2438                 p->owner = THIS_MODULE;
2439
2440         proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2441 #endif
2442 #ifdef CONFIG_XFRM
2443         xfrm6_init();
2444 #endif
2445 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2446         fib6_rules_init();
2447 #endif
2448 }
2449
/*
 * Tear down what ip6_route_init() set up: policy rules, proc entries
 * and xfrm support first, then all routes, the FIB garbage collector
 * and finally the dst slab cache.
 */
void ip6_route_cleanup(void)
{
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        fib6_rules_cleanup();
#endif
#ifdef CONFIG_PROC_FS
        proc_net_remove("ipv6_route");
        proc_net_remove("rt6_stats");
#endif
#ifdef CONFIG_XFRM
        xfrm6_fini();
#endif
        /* NULL matches routes on every device (see fib6_ifdown()). */
        rt6_ifdown(NULL);
        fib6_gc_cleanup();
        /* The cache is destroyed last, after the route walk above. */
        kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
}