]> bbs.cooldavid.org Git - net-next-2.6.git/blob - net/ipv6/route.c
[NETLINK]: Make use of NLA_STRING/NLA_NUL_STRING attribute validation
[net-next-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  *      Ville Nuorvala
26  *              Fixed routing subtrees.
27  */
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41
42 #ifdef  CONFIG_PROC_FS
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #endif
46
47 #include <net/snmp.h>
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
53 #include <net/tcp.h>
54 #include <linux/rtnetlink.h>
55 #include <net/dst.h>
56 #include <net/xfrm.h>
57 #include <net/netevent.h>
58 #include <net/netlink.h>
59
60 #include <asm/uaccess.h>
61
62 #ifdef CONFIG_SYSCTL
63 #include <linux/sysctl.h>
64 #endif
65
/*
 * Compile-time debug switches for this file.
 *
 * RT6_DEBUG selects whether RDBG() and RT6_TRACE() expand to printk calls
 * (value 3 or higher) or to nothing (any lower value).
 */
66 /* Set to 3 to get tracing. */
67 #define RT6_DEBUG 2
68
69 #if RT6_DEBUG >= 3
/* RDBG takes a fully parenthesised printk argument list: RDBG(("fmt", a)). */
70 #define RDBG(x) printk x
71 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
72 #else
73 #define RDBG(x)
74 #define RT6_TRACE(x...) do { ; } while (0)
75 #endif
76
/*
 * When non-zero, the policy route input/output lookups in this file call
 * rt6_alloc_clone() for routes that are not copied by rt6_alloc_cow();
 * when zero those routes are returned as they are.
 */
77 #define CLONE_OFFLINK_ROUTE 0
78
/*
 * Tunables for the IPv6 routing cache.  Time values are expressed in
 * jiffies (multiples of HZ).
 *
 * ip6_rt_max_size, ip6_rt_gc_min_interval, ip6_rt_gc_timeout and
 * ip6_rt_gc_elasticity are read by ip6_dst_gc(); ip6_rt_min_advmss is the
 * lower bound applied by ipv6_advmss().
 * NOTE(review): ip6_rt_gc_interval and ip6_rt_mtu_expires have no user in
 * this part of the file - confirm their consumers elsewhere.
 */
79 static int ip6_rt_max_size = 4096;
80 static int ip6_rt_gc_min_interval = HZ / 2;
81 static int ip6_rt_gc_timeout = 60*HZ;
82 int ip6_rt_gc_interval = 30*HZ;
83 static int ip6_rt_gc_elasticity = 9;
84 static int ip6_rt_mtu_expires = 10*60*HZ;
85 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
86
87 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
88 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
89 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
90 static void             ip6_dst_destroy(struct dst_entry *);
91 static void             ip6_dst_ifdown(struct dst_entry *,
92                                        struct net_device *dev, int how);
93 static int               ip6_dst_gc(void);
94
95 static int              ip6_pkt_discard(struct sk_buff *skb);
96 static int              ip6_pkt_discard_out(struct sk_buff *skb);
97 static void             ip6_link_failure(struct sk_buff *skb);
98 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
99
100 #ifdef CONFIG_IPV6_ROUTE_INFO
101 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
102                                            struct in6_addr *gwaddr, int ifindex,
103                                            unsigned pref);
104 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
105                                            struct in6_addr *gwaddr, int ifindex);
106 #endif
107
/*
 * Destination-cache operations for IPv6 routes.  This table is what
 * ip6_dst_alloc() passes to dst_alloc(), and it wires the generic dst
 * callbacks to the ip6_* handlers declared above.  entry_size is the size
 * of a struct rt6_info; gc_thresh and the entries counter of this table are
 * consulted by ip6_dst_gc().
 */
108 static struct dst_ops ip6_dst_ops = {
109         .family                 =       AF_INET6,
110         .protocol               =       __constant_htons(ETH_P_IPV6),
111         .gc                     =       ip6_dst_gc,
112         .gc_thresh              =       1024,
113         .check                  =       ip6_dst_check,
114         .destroy                =       ip6_dst_destroy,
115         .ifdown                 =       ip6_dst_ifdown,
116         .negative_advice        =       ip6_negative_advice,
117         .link_failure           =       ip6_link_failure,
118         .update_pmtu            =       ip6_rt_update_pmtu,
119         .entry_size             =       sizeof(struct rt6_info),
120 };
121
/*
 * Statically allocated "no route" entry.  rt6_device_match() and
 * rt6_select() return a pointer to it when nothing matches, and lookup code
 * compares against its address to detect a failed match.
 *
 * It carries error -ENETUNREACH, discards packets in both directions, is
 * flagged RTF_REJECT | RTF_NONEXTHOP, has the largest possible metric, and
 * starts with both reference counters at 1.  Its dst path points back at
 * itself.
 */
122 struct rt6_info ip6_null_entry = {
123         .u = {
124                 .dst = {
125                         .__refcnt       = ATOMIC_INIT(1),
126                         .__use          = 1,
127                         .dev            = &loopback_dev,
128                         .obsolete       = -1,
129                         .error          = -ENETUNREACH,
130                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
131                         .input          = ip6_pkt_discard,
132                         .output         = ip6_pkt_discard_out,
133                         .ops            = &ip6_dst_ops,
134                         .path           = (struct dst_entry*)&ip6_null_entry,
135                 }
136         },
137         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
138         .rt6i_metric    = ~(u32) 0,
139         .rt6i_ref       = ATOMIC_INIT(1),
140 };
141
142 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
143
/*
 * Statically allocated reject entry, only built with
 * CONFIG_IPV6_MULTIPLE_TABLES.  Identical in shape to ip6_null_entry except
 * that its error is -EACCES and its dst path points at itself.
 * NOTE(review): presumably returned for "prohibit" policy rules - its users
 * are not in this part of the file; confirm.
 */
144 struct rt6_info ip6_prohibit_entry = {
145         .u = {
146                 .dst = {
147                         .__refcnt       = ATOMIC_INIT(1),
148                         .__use          = 1,
149                         .dev            = &loopback_dev,
150                         .obsolete       = -1,
151                         .error          = -EACCES,
152                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
153                         .input          = ip6_pkt_discard,
154                         .output         = ip6_pkt_discard_out,
155                         .ops            = &ip6_dst_ops,
156                         .path           = (struct dst_entry*)&ip6_prohibit_entry,
157                 }
158         },
159         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
160         .rt6i_metric    = ~(u32) 0,
161         .rt6i_ref       = ATOMIC_INIT(1),
162 };
163
/*
 * Statically allocated reject entry, only built with
 * CONFIG_IPV6_MULTIPLE_TABLES.  Identical in shape to ip6_null_entry except
 * that its error is -EINVAL and its dst path points at itself.
 * NOTE(review): presumably returned for "blackhole" policy rules - its users
 * are not in this part of the file; confirm.
 */
164 struct rt6_info ip6_blk_hole_entry = {
165         .u = {
166                 .dst = {
167                         .__refcnt       = ATOMIC_INIT(1),
168                         .__use          = 1,
169                         .dev            = &loopback_dev,
170                         .obsolete       = -1,
171                         .error          = -EINVAL,
172                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
173                         .input          = ip6_pkt_discard,
174                         .output         = ip6_pkt_discard_out,
175                         .ops            = &ip6_dst_ops,
176                         .path           = (struct dst_entry*)&ip6_blk_hole_entry,
177                 }
178         },
179         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
180         .rt6i_metric    = ~(u32) 0,
181         .rt6i_ref       = ATOMIC_INIT(1),
182 };
183
184 #endif
185
186 /* allocate dst with ip6_dst_ops */
187 static __inline__ struct rt6_info *ip6_dst_alloc(void)
188 {
189         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
190 }
191
192 static void ip6_dst_destroy(struct dst_entry *dst)
193 {
194         struct rt6_info *rt = (struct rt6_info *)dst;
195         struct inet6_dev *idev = rt->rt6i_idev;
196
197         if (idev != NULL) {
198                 rt->rt6i_idev = NULL;
199                 in6_dev_put(idev);
200         }       
201 }
202
203 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
204                            int how)
205 {
206         struct rt6_info *rt = (struct rt6_info *)dst;
207         struct inet6_dev *idev = rt->rt6i_idev;
208
209         if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
210                 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
211                 if (loopback_idev != NULL) {
212                         rt->rt6i_idev = loopback_idev;
213                         in6_dev_put(idev);
214                 }
215         }
216 }
217
218 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
219 {
220         return (rt->rt6i_flags & RTF_EXPIRES &&
221                 time_after(jiffies, rt->rt6i_expires));
222 }
223
224 static inline int rt6_need_strict(struct in6_addr *daddr)
225 {
226         return (ipv6_addr_type(daddr) &
227                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
228 }
229
230 /*
231  *      Route lookup. Any table->tb6_lock is implied.
232  */
233
234 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
235                                                     int oif,
236                                                     int strict)
237 {
238         struct rt6_info *local = NULL;
239         struct rt6_info *sprt;
240
241         if (oif) {
242                 for (sprt = rt; sprt; sprt = sprt->u.next) {
243                         struct net_device *dev = sprt->rt6i_dev;
244                         if (dev->ifindex == oif)
245                                 return sprt;
246                         if (dev->flags & IFF_LOOPBACK) {
247                                 if (sprt->rt6i_idev == NULL ||
248                                     sprt->rt6i_idev->dev->ifindex != oif) {
249                                         if (strict && oif)
250                                                 continue;
251                                         if (local && (!oif || 
252                                                       local->rt6i_idev->dev->ifindex == oif))
253                                                 continue;
254                                 }
255                                 local = sprt;
256                         }
257                 }
258
259                 if (local)
260                         return local;
261
262                 if (strict)
263                         return &ip6_null_entry;
264         }
265         return rt;
266 }
267
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a neighbour solicitation towards the
 * next hop of @rt when that neighbour is not in a valid NUD state.
 *
 * Nothing happens when @rt is NULL, has no next hop, or the neighbour is
 * already valid.  Probes are rate-limited: one is sent only once more than
 * the inet6 device's rtr_probe_interval has passed since neigh->updated,
 * which is refreshed before the solicitation goes out.  (The original note
 * here requires probing to be limited to no more than one per minute.)
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
        struct in6_addr mcaddr;
        struct in6_addr *target;

        /* Cheap unlocked pre-check; repeated under the lock below. */
        if (!neigh || (neigh->nud_state & NUD_VALID))
                return;

        read_lock_bh(&neigh->lock);
        if ((neigh->nud_state & NUD_VALID) ||
            !time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                read_unlock_bh(&neigh->lock);
                return;
        }

        neigh->updated = jiffies;
        read_unlock_bh(&neigh->lock);

        /* Solicit the neighbour on its solicited-node multicast address. */
        target = (struct in6_addr *)&neigh->primary_key;
        addrconf_addr_solict_mult(target, &mcaddr);
        ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
303
304 /*
305  * Default Router Selection (RFC 2461 6.3.6)
306  */
307 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
308 {
309         struct net_device *dev = rt->rt6i_dev;
310         if (!oif || dev->ifindex == oif)
311                 return 2;
312         if ((dev->flags & IFF_LOOPBACK) &&
313             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
314                 return 1;
315         return 0;
316 }
317
318 static int inline rt6_check_neigh(struct rt6_info *rt)
319 {
320         struct neighbour *neigh = rt->rt6i_nexthop;
321         int m = 0;
322         if (rt->rt6i_flags & RTF_NONEXTHOP ||
323             !(rt->rt6i_flags & RTF_GATEWAY))
324                 m = 1;
325         else if (neigh) {
326                 read_lock_bh(&neigh->lock);
327                 if (neigh->nud_state & NUD_VALID)
328                         m = 2;
329                 read_unlock_bh(&neigh->lock);
330         }
331         return m;
332 }
333
334 static int rt6_score_route(struct rt6_info *rt, int oif,
335                            int strict)
336 {
337         int m, n;
338                 
339         m = rt6_check_dev(rt, oif);
340         if (!m && (strict & RT6_LOOKUP_F_IFACE))
341                 return -1;
342 #ifdef CONFIG_IPV6_ROUTER_PREF
343         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
344 #endif
345         n = rt6_check_neigh(rt);
346         if (n > 1)
347                 m |= 16;
348         else if (!n && strict & RT6_LOOKUP_F_REACHABLE)
349                 return -1;
350         return m;
351 }
352
353 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
354                                    int strict)
355 {
356         struct rt6_info *match = NULL, *last = NULL;
357         struct rt6_info *rt, *rt0 = *head;
358         u32 metric;
359         int mpri = -1;
360
361         RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
362                   __FUNCTION__, head, head ? *head : NULL, oif);
363
364         for (rt = rt0, metric = rt0->rt6i_metric;
365              rt && rt->rt6i_metric == metric && (!last || rt != rt0);
366              rt = rt->u.next) {
367                 int m;
368
369                 if (rt6_check_expired(rt))
370                         continue;
371
372                 last = rt;
373
374                 m = rt6_score_route(rt, oif, strict);
375                 if (m < 0)
376                         continue;
377
378                 if (m > mpri) {
379                         rt6_probe(match);
380                         match = rt;
381                         mpri = m;
382                 } else {
383                         rt6_probe(rt);
384                 }
385         }
386
387         if (!match &&
388             (strict & RT6_LOOKUP_F_REACHABLE) &&
389             last && last != rt0) {
390                 /* no entries matched; do round-robin */
391                 static DEFINE_SPINLOCK(lock);
392                 spin_lock(&lock);
393                 *head = rt0->u.next;
394                 rt0->u.next = last->u.next;
395                 last->u.next = rt0;
396                 spin_unlock(&lock);
397         }
398
399         RT6_TRACE("%s() => %p, score=%d\n",
400                   __FUNCTION__, match, mpri);
401
402         return (match ? match : &ip6_null_entry);
403 }
404
405 #ifdef CONFIG_IPV6_ROUTE_INFO
406 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
407                   struct in6_addr *gwaddr)
408 {
409         struct route_info *rinfo = (struct route_info *) opt;
410         struct in6_addr prefix_buf, *prefix;
411         unsigned int pref;
412         u32 lifetime;
413         struct rt6_info *rt;
414
415         if (len < sizeof(struct route_info)) {
416                 return -EINVAL;
417         }
418
419         /* Sanity check for prefix_len and length */
420         if (rinfo->length > 3) {
421                 return -EINVAL;
422         } else if (rinfo->prefix_len > 128) {
423                 return -EINVAL;
424         } else if (rinfo->prefix_len > 64) {
425                 if (rinfo->length < 2) {
426                         return -EINVAL;
427                 }
428         } else if (rinfo->prefix_len > 0) {
429                 if (rinfo->length < 1) {
430                         return -EINVAL;
431                 }
432         }
433
434         pref = rinfo->route_pref;
435         if (pref == ICMPV6_ROUTER_PREF_INVALID)
436                 pref = ICMPV6_ROUTER_PREF_MEDIUM;
437
438         lifetime = htonl(rinfo->lifetime);
439         if (lifetime == 0xffffffff) {
440                 /* infinity */
441         } else if (lifetime > 0x7fffffff/HZ) {
442                 /* Avoid arithmetic overflow */
443                 lifetime = 0x7fffffff/HZ - 1;
444         }
445
446         if (rinfo->length == 3)
447                 prefix = (struct in6_addr *)rinfo->prefix;
448         else {
449                 /* this function is safe */
450                 ipv6_addr_prefix(&prefix_buf,
451                                  (struct in6_addr *)rinfo->prefix,
452                                  rinfo->prefix_len);
453                 prefix = &prefix_buf;
454         }
455
456         rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);
457
458         if (rt && !lifetime) {
459                 ip6_del_rt(rt);
460                 rt = NULL;
461         }
462
463         if (!rt && lifetime)
464                 rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
465                                         pref);
466         else if (rt)
467                 rt->rt6i_flags = RTF_ROUTEINFO |
468                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
469
470         if (rt) {
471                 if (lifetime == 0xffffffff) {
472                         rt->rt6i_flags &= ~RTF_EXPIRES;
473                 } else {
474                         rt->rt6i_expires = jiffies + HZ * lifetime;
475                         rt->rt6i_flags |= RTF_EXPIRES;
476                 }
477                 dst_release(&rt->u.dst);
478         }
479         return 0;
480 }
481 #endif
482
/*
 * BACKTRACK(saddr) - climb the FIB tree after a failed route match.
 *
 * Must be expanded in a function that provides the locals "rt" and "fn"
 * and the labels "restart" and "out".  It does nothing unless rt is
 * &ip6_null_entry.  Otherwise it moves fn upwards one step at a time:
 *   - at a node flagged RTN_TL_ROOT it gives up with "goto out";
 *   - if the parent has a subtree other than the current node, that subtree
 *     is searched with @saddr, otherwise fn becomes the parent;
 *   - as soon as fn carries RTN_RTINFO it retries with "goto restart".
 *
 * The subtree is read only through FIB6_SUBTREE() so that the test and the
 * lookup always agree on where the subtree pointer comes from.
 */
#define BACKTRACK(saddr) \
do { \
        if (rt == &ip6_null_entry) { \
                struct fib6_node *pn; \
                while (fn) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        pn = fn->parent; \
                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
                        else \
                                fn = pn; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while(0)
500
501 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
502                                              struct flowi *fl, int flags)
503 {
504         struct fib6_node *fn;
505         struct rt6_info *rt;
506
507         read_lock_bh(&table->tb6_lock);
508         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
509 restart:
510         rt = fn->leaf;
511         rt = rt6_device_match(rt, fl->oif, flags);
512         BACKTRACK(&fl->fl6_src);
513         dst_hold(&rt->u.dst);
514 out:
515         read_unlock_bh(&table->tb6_lock);
516
517         rt->u.dst.lastuse = jiffies;
518         rt->u.dst.__use++;
519
520         return rt;
521
522 }
523
524 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
525                             int oif, int strict)
526 {
527         struct flowi fl = {
528                 .oif = oif,
529                 .nl_u = {
530                         .ip6_u = {
531                                 .daddr = *daddr,
532                                 /* TODO: saddr */
533                         },
534                 },
535         };
536         struct dst_entry *dst;
537         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
538
539         dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
540         if (dst->error == 0)
541                 return (struct rt6_info *) dst;
542
543         dst_release(dst);
544
545         return NULL;
546 }
547
548 /* ip6_ins_rt is called with FREE table->tb6_lock.
549    It takes new route entry, the addition fails by any reason the
550    route is freed. In any case, if caller does not hold it, it may
551    be destroyed.
552  */
553
554 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
555 {
556         int err;
557         struct fib6_table *table;
558
559         table = rt->rt6i_table;
560         write_lock_bh(&table->tb6_lock);
561         err = fib6_add(&table->tb6_root, rt, info);
562         write_unlock_bh(&table->tb6_lock);
563
564         return err;
565 }
566
567 int ip6_ins_rt(struct rt6_info *rt)
568 {
569         return __ip6_ins_rt(rt, NULL);
570 }
571
572 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
573                                       struct in6_addr *saddr)
574 {
575         struct rt6_info *rt;
576
577         /*
578          *      Clone the route.
579          */
580
581         rt = ip6_rt_copy(ort);
582
583         if (rt) {
584                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
585                         if (rt->rt6i_dst.plen != 128 &&
586                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
587                                 rt->rt6i_flags |= RTF_ANYCAST;
588                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
589                 }
590
591                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
592                 rt->rt6i_dst.plen = 128;
593                 rt->rt6i_flags |= RTF_CACHE;
594                 rt->u.dst.flags |= DST_HOST;
595
596 #ifdef CONFIG_IPV6_SUBTREES
597                 if (rt->rt6i_src.plen && saddr) {
598                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
599                         rt->rt6i_src.plen = 128;
600                 }
601 #endif
602
603                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
604
605         }
606
607         return rt;
608 }
609
610 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
611 {
612         struct rt6_info *rt = ip6_rt_copy(ort);
613         if (rt) {
614                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
615                 rt->rt6i_dst.plen = 128;
616                 rt->rt6i_flags |= RTF_CACHE;
617                 if (rt->rt6i_flags & RTF_REJECT)
618                         rt->u.dst.error = ort->u.dst.error;
619                 rt->u.dst.flags |= DST_HOST;
620                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
621         }
622         return rt;
623 }
624
625 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
626                                             struct flowi *fl, int flags)
627 {
628         struct fib6_node *fn;
629         struct rt6_info *rt, *nrt;
630         int strict = 0;
631         int attempts = 3;
632         int err;
633         int reachable = RT6_LOOKUP_F_REACHABLE;
634
635         strict |= flags & RT6_LOOKUP_F_IFACE;
636
637 relookup:
638         read_lock_bh(&table->tb6_lock);
639
640 restart_2:
641         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
642
643 restart:
644         rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
645         BACKTRACK(&fl->fl6_src);
646         if (rt == &ip6_null_entry ||
647             rt->rt6i_flags & RTF_CACHE)
648                 goto out;
649
650         dst_hold(&rt->u.dst);
651         read_unlock_bh(&table->tb6_lock);
652
653         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
654                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
655         else {
656 #if CLONE_OFFLINK_ROUTE
657                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
658 #else
659                 goto out2;
660 #endif
661         }
662
663         dst_release(&rt->u.dst);
664         rt = nrt ? : &ip6_null_entry;
665
666         dst_hold(&rt->u.dst);
667         if (nrt) {
668                 err = ip6_ins_rt(nrt);
669                 if (!err)
670                         goto out2;
671         }
672
673         if (--attempts <= 0)
674                 goto out2;
675
676         /*
677          * Race condition! In the gap, when table->tb6_lock was
678          * released someone could insert this route.  Relookup.
679          */
680         dst_release(&rt->u.dst);
681         goto relookup;
682
683 out:
684         if (reachable) {
685                 reachable = 0;
686                 goto restart_2;
687         }
688         dst_hold(&rt->u.dst);
689         read_unlock_bh(&table->tb6_lock);
690 out2:
691         rt->u.dst.lastuse = jiffies;
692         rt->u.dst.__use++;
693
694         return rt;
695 }
696
697 void ip6_route_input(struct sk_buff *skb)
698 {
699         struct ipv6hdr *iph = skb->nh.ipv6h;
700         struct flowi fl = {
701                 .iif = skb->dev->ifindex,
702                 .nl_u = {
703                         .ip6_u = {
704                                 .daddr = iph->daddr,
705                                 .saddr = iph->saddr,
706 #ifdef CONFIG_IPV6_ROUTE_FWMARK
707                                 .fwmark = skb->nfmark,
708 #endif
709                                 .flowlabel = (* (u32 *) iph)&IPV6_FLOWINFO_MASK,
710                         },
711                 },
712                 .proto = iph->nexthdr,
713         };
714         int flags = rt6_need_strict(&iph->daddr) ? RT6_LOOKUP_F_IFACE : 0;
715
716         skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
717 }
718
719 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
720                                              struct flowi *fl, int flags)
721 {
722         struct fib6_node *fn;
723         struct rt6_info *rt, *nrt;
724         int strict = 0;
725         int attempts = 3;
726         int err;
727         int reachable = RT6_LOOKUP_F_REACHABLE;
728
729         strict |= flags & RT6_LOOKUP_F_IFACE;
730
731 relookup:
732         read_lock_bh(&table->tb6_lock);
733
734 restart_2:
735         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
736
737 restart:
738         rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
739         BACKTRACK(&fl->fl6_src);
740         if (rt == &ip6_null_entry ||
741             rt->rt6i_flags & RTF_CACHE)
742                 goto out;
743
744         dst_hold(&rt->u.dst);
745         read_unlock_bh(&table->tb6_lock);
746
747         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
748                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
749         else {
750 #if CLONE_OFFLINK_ROUTE
751                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
752 #else
753                 goto out2;
754 #endif
755         }
756
757         dst_release(&rt->u.dst);
758         rt = nrt ? : &ip6_null_entry;
759
760         dst_hold(&rt->u.dst);
761         if (nrt) {
762                 err = ip6_ins_rt(nrt);
763                 if (!err)
764                         goto out2;
765         }
766
767         if (--attempts <= 0)
768                 goto out2;
769
770         /*
771          * Race condition! In the gap, when table->tb6_lock was
772          * released someone could insert this route.  Relookup.
773          */
774         dst_release(&rt->u.dst);
775         goto relookup;
776
777 out:
778         if (reachable) {
779                 reachable = 0;
780                 goto restart_2;
781         }
782         dst_hold(&rt->u.dst);
783         read_unlock_bh(&table->tb6_lock);
784 out2:
785         rt->u.dst.lastuse = jiffies;
786         rt->u.dst.__use++;
787         return rt;
788 }
789
790 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
791 {
792         int flags = 0;
793
794         if (rt6_need_strict(&fl->fl6_dst))
795                 flags |= RT6_LOOKUP_F_IFACE;
796
797         return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
798 }
799
800
801 /*
802  *      Destination cache support functions
803  */
804
805 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
806 {
807         struct rt6_info *rt;
808
809         rt = (struct rt6_info *) dst;
810
811         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
812                 return dst;
813
814         return NULL;
815 }
816
817 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
818 {
819         struct rt6_info *rt = (struct rt6_info *) dst;
820
821         if (rt) {
822                 if (rt->rt6i_flags & RTF_CACHE)
823                         ip6_del_rt(rt);
824                 else
825                         dst_release(dst);
826         }
827         return NULL;
828 }
829
830 static void ip6_link_failure(struct sk_buff *skb)
831 {
832         struct rt6_info *rt;
833
834         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
835
836         rt = (struct rt6_info *) skb->dst;
837         if (rt) {
838                 if (rt->rt6i_flags&RTF_CACHE) {
839                         dst_set_expires(&rt->u.dst, 0);
840                         rt->rt6i_flags |= RTF_EXPIRES;
841                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
842                         rt->rt6i_node->fn_sernum = -1;
843         }
844 }
845
846 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
847 {
848         struct rt6_info *rt6 = (struct rt6_info*)dst;
849
850         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
851                 rt6->rt6i_flags |= RTF_MODIFIED;
852                 if (mtu < IPV6_MIN_MTU) {
853                         mtu = IPV6_MIN_MTU;
854                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
855                 }
856                 dst->metrics[RTAX_MTU-1] = mtu;
857                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
858         }
859 }
860
861 static int ipv6_get_mtu(struct net_device *dev);
862
863 static inline unsigned int ipv6_advmss(unsigned int mtu)
864 {
865         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
866
867         if (mtu < ip6_rt_min_advmss)
868                 mtu = ip6_rt_min_advmss;
869
870         /*
871          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 
872          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
873          * IPV6_MAXPLEN is also valid and means: "any MSS, 
874          * rely only on pmtu discovery"
875          */
876         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
877                 mtu = IPV6_MAXPLEN;
878         return mtu;
879 }
880
/*
 * Singly linked list (chained through dst->next) of the dst entries created
 * by ndisc_dst_alloc(); ndisc_dst_gc() walks it and frees entries whose
 * reference count has dropped to zero.  ndisc_lock protects the list.
 */
881 static struct dst_entry *ndisc_dst_gc_list;
882 static DEFINE_SPINLOCK(ndisc_lock);
883
884 struct dst_entry *ndisc_dst_alloc(struct net_device *dev, 
885                                   struct neighbour *neigh,
886                                   struct in6_addr *addr,
887                                   int (*output)(struct sk_buff *))
888 {
889         struct rt6_info *rt;
890         struct inet6_dev *idev = in6_dev_get(dev);
891
892         if (unlikely(idev == NULL))
893                 return NULL;
894
895         rt = ip6_dst_alloc();
896         if (unlikely(rt == NULL)) {
897                 in6_dev_put(idev);
898                 goto out;
899         }
900
901         dev_hold(dev);
902         if (neigh)
903                 neigh_hold(neigh);
904         else
905                 neigh = ndisc_get_neigh(dev, addr);
906
907         rt->rt6i_dev      = dev;
908         rt->rt6i_idev     = idev;
909         rt->rt6i_nexthop  = neigh;
910         atomic_set(&rt->u.dst.__refcnt, 1);
911         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
912         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
913         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
914         rt->u.dst.output  = output;
915
916 #if 0   /* there's no chance to use these for ndisc */
917         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST 
918                                 ? DST_HOST 
919                                 : 0;
920         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
921         rt->rt6i_dst.plen = 128;
922 #endif
923
924         spin_lock_bh(&ndisc_lock);
925         rt->u.dst.next = ndisc_dst_gc_list;
926         ndisc_dst_gc_list = &rt->u.dst;
927         spin_unlock_bh(&ndisc_lock);
928
929         fib6_force_start_gc();
930
931 out:
932         return (struct dst_entry *)rt;
933 }
934
935 int ndisc_dst_gc(int *more)
936 {
937         struct dst_entry *dst, *next, **pprev;
938         int freed;
939
940         next = NULL;
941         freed = 0;
942
943         spin_lock_bh(&ndisc_lock);
944         pprev = &ndisc_dst_gc_list;
945
946         while ((dst = *pprev) != NULL) {
947                 if (!atomic_read(&dst->__refcnt)) {
948                         *pprev = dst->next;
949                         dst_free(dst);
950                         freed++;
951                 } else {
952                         pprev = &dst->next;
953                         (*more)++;
954                 }
955         }
956
957         spin_unlock_bh(&ndisc_lock);
958
959         return freed;
960 }
961
962 static int ip6_dst_gc(void)
963 {
964         static unsigned expire = 30*HZ;
965         static unsigned long last_gc;
966         unsigned long now = jiffies;
967
968         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
969             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
970                 goto out;
971
972         expire++;
973         fib6_run_gc(expire);
974         last_gc = now;
975         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
976                 expire = ip6_rt_gc_timeout>>1;
977
978 out:
979         expire -= expire>>ip6_rt_gc_elasticity;
980         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
981 }
982
983 /* Clean host part of a prefix. Not necessary in radix tree,
984    but results in cleaner routing tables.
985
986    Remove it only when all the things will work!
987  */
988
989 static int ipv6_get_mtu(struct net_device *dev)
990 {
991         int mtu = IPV6_MIN_MTU;
992         struct inet6_dev *idev;
993
994         idev = in6_dev_get(dev);
995         if (idev) {
996                 mtu = idev->cnf.mtu6;
997                 in6_dev_put(idev);
998         }
999         return mtu;
1000 }
1001
1002 int ipv6_get_hoplimit(struct net_device *dev)
1003 {
1004         int hoplimit = ipv6_devconf.hop_limit;
1005         struct inet6_dev *idev;
1006
1007         idev = in6_dev_get(dev);
1008         if (idev) {
1009                 hoplimit = idev->cnf.hop_limit;
1010                 in6_dev_put(idev);
1011         }
1012         return hoplimit;
1013 }
1014
1015 /*
1016  *
1017  */
1018
1019 int ip6_route_add(struct fib6_config *cfg)
1020 {
1021         int err;
1022         struct rt6_info *rt = NULL;
1023         struct net_device *dev = NULL;
1024         struct inet6_dev *idev = NULL;
1025         struct fib6_table *table;
1026         int addr_type;
1027
1028         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1029                 return -EINVAL;
1030 #ifndef CONFIG_IPV6_SUBTREES
1031         if (cfg->fc_src_len)
1032                 return -EINVAL;
1033 #endif
1034         if (cfg->fc_ifindex) {
1035                 err = -ENODEV;
1036                 dev = dev_get_by_index(cfg->fc_ifindex);
1037                 if (!dev)
1038                         goto out;
1039                 idev = in6_dev_get(dev);
1040                 if (!idev)
1041                         goto out;
1042         }
1043
1044         if (cfg->fc_metric == 0)
1045                 cfg->fc_metric = IP6_RT_PRIO_USER;
1046
1047         table = fib6_new_table(cfg->fc_table);
1048         if (table == NULL) {
1049                 err = -ENOBUFS;
1050                 goto out;
1051         }
1052
1053         rt = ip6_dst_alloc();
1054
1055         if (rt == NULL) {
1056                 err = -ENOMEM;
1057                 goto out;
1058         }
1059
1060         rt->u.dst.obsolete = -1;
1061         rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1062
1063         if (cfg->fc_protocol == RTPROT_UNSPEC)
1064                 cfg->fc_protocol = RTPROT_BOOT;
1065         rt->rt6i_protocol = cfg->fc_protocol;
1066
1067         addr_type = ipv6_addr_type(&cfg->fc_dst);
1068
1069         if (addr_type & IPV6_ADDR_MULTICAST)
1070                 rt->u.dst.input = ip6_mc_input;
1071         else
1072                 rt->u.dst.input = ip6_forward;
1073
1074         rt->u.dst.output = ip6_output;
1075
1076         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1077         rt->rt6i_dst.plen = cfg->fc_dst_len;
1078         if (rt->rt6i_dst.plen == 128)
1079                rt->u.dst.flags = DST_HOST;
1080
1081 #ifdef CONFIG_IPV6_SUBTREES
1082         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1083         rt->rt6i_src.plen = cfg->fc_src_len;
1084 #endif
1085
1086         rt->rt6i_metric = cfg->fc_metric;
1087
1088         /* We cannot add true routes via loopback here,
1089            they would result in kernel looping; promote them to reject routes
1090          */
1091         if ((cfg->fc_flags & RTF_REJECT) ||
1092             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1093                 /* hold loopback dev/idev if we haven't done so. */
1094                 if (dev != &loopback_dev) {
1095                         if (dev) {
1096                                 dev_put(dev);
1097                                 in6_dev_put(idev);
1098                         }
1099                         dev = &loopback_dev;
1100                         dev_hold(dev);
1101                         idev = in6_dev_get(dev);
1102                         if (!idev) {
1103                                 err = -ENODEV;
1104                                 goto out;
1105                         }
1106                 }
1107                 rt->u.dst.output = ip6_pkt_discard_out;
1108                 rt->u.dst.input = ip6_pkt_discard;
1109                 rt->u.dst.error = -ENETUNREACH;
1110                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1111                 goto install_route;
1112         }
1113
1114         if (cfg->fc_flags & RTF_GATEWAY) {
1115                 struct in6_addr *gw_addr;
1116                 int gwa_type;
1117
1118                 gw_addr = &cfg->fc_gateway;
1119                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1120                 gwa_type = ipv6_addr_type(gw_addr);
1121
1122                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1123                         struct rt6_info *grt;
1124
1125                         /* IPv6 strictly inhibits using not link-local
1126                            addresses as nexthop address.
1127                            Otherwise, router will not able to send redirects.
1128                            It is very good, but in some (rare!) circumstances
1129                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1130                            some exceptions. --ANK
1131                          */
1132                         err = -EINVAL;
1133                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1134                                 goto out;
1135
1136                         grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1137
1138                         err = -EHOSTUNREACH;
1139                         if (grt == NULL)
1140                                 goto out;
1141                         if (dev) {
1142                                 if (dev != grt->rt6i_dev) {
1143                                         dst_release(&grt->u.dst);
1144                                         goto out;
1145                                 }
1146                         } else {
1147                                 dev = grt->rt6i_dev;
1148                                 idev = grt->rt6i_idev;
1149                                 dev_hold(dev);
1150                                 in6_dev_hold(grt->rt6i_idev);
1151                         }
1152                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1153                                 err = 0;
1154                         dst_release(&grt->u.dst);
1155
1156                         if (err)
1157                                 goto out;
1158                 }
1159                 err = -EINVAL;
1160                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1161                         goto out;
1162         }
1163
1164         err = -ENODEV;
1165         if (dev == NULL)
1166                 goto out;
1167
1168         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1169                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1170                 if (IS_ERR(rt->rt6i_nexthop)) {
1171                         err = PTR_ERR(rt->rt6i_nexthop);
1172                         rt->rt6i_nexthop = NULL;
1173                         goto out;
1174                 }
1175         }
1176
1177         rt->rt6i_flags = cfg->fc_flags;
1178
1179 install_route:
1180         if (cfg->fc_mx) {
1181                 struct nlattr *nla;
1182                 int remaining;
1183
1184                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1185                         int type = nla->nla_type;
1186
1187                         if (type) {
1188                                 if (type > RTAX_MAX) {
1189                                         err = -EINVAL;
1190                                         goto out;
1191                                 }
1192
1193                                 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1194                         }
1195                 }
1196         }
1197
1198         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1199                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1200         if (!rt->u.dst.metrics[RTAX_MTU-1])
1201                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1202         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1203                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1204         rt->u.dst.dev = dev;
1205         rt->rt6i_idev = idev;
1206         rt->rt6i_table = table;
1207         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1208
1209 out:
1210         if (dev)
1211                 dev_put(dev);
1212         if (idev)
1213                 in6_dev_put(idev);
1214         if (rt)
1215                 dst_free((struct dst_entry *) rt);
1216         return err;
1217 }
1218
1219 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1220 {
1221         int err;
1222         struct fib6_table *table;
1223
1224         if (rt == &ip6_null_entry)
1225                 return -ENOENT;
1226
1227         table = rt->rt6i_table;
1228         write_lock_bh(&table->tb6_lock);
1229
1230         err = fib6_del(rt, info);
1231         dst_release(&rt->u.dst);
1232
1233         write_unlock_bh(&table->tb6_lock);
1234
1235         return err;
1236 }
1237
1238 int ip6_del_rt(struct rt6_info *rt)
1239 {
1240         return __ip6_del_rt(rt, NULL);
1241 }
1242
1243 static int ip6_route_del(struct fib6_config *cfg)
1244 {
1245         struct fib6_table *table;
1246         struct fib6_node *fn;
1247         struct rt6_info *rt;
1248         int err = -ESRCH;
1249
1250         table = fib6_get_table(cfg->fc_table);
1251         if (table == NULL)
1252                 return err;
1253
1254         read_lock_bh(&table->tb6_lock);
1255
1256         fn = fib6_locate(&table->tb6_root,
1257                          &cfg->fc_dst, cfg->fc_dst_len,
1258                          &cfg->fc_src, cfg->fc_src_len);
1259         
1260         if (fn) {
1261                 for (rt = fn->leaf; rt; rt = rt->u.next) {
1262                         if (cfg->fc_ifindex &&
1263                             (rt->rt6i_dev == NULL ||
1264                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1265                                 continue;
1266                         if (cfg->fc_flags & RTF_GATEWAY &&
1267                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1268                                 continue;
1269                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1270                                 continue;
1271                         dst_hold(&rt->u.dst);
1272                         read_unlock_bh(&table->tb6_lock);
1273
1274                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1275                 }
1276         }
1277         read_unlock_bh(&table->tb6_lock);
1278
1279         return err;
1280 }
1281
1282 /*
1283  *      Handle redirects
1284  */
/*
 * Flow key used for redirect lookups: an ordinary flowi followed by the
 * address of the router that sent the redirect.  The flowi must stay the
 * first member, because ip6_route_redirect() passes the whole object as a
 * struct flowi * and __ip6_route_redirect() casts it back.
 */
struct ip6rd_flowi {
        struct flowi fl;
        struct in6_addr gateway;
};
1289
1290 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1291                                              struct flowi *fl,
1292                                              int flags)
1293 {
1294         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1295         struct rt6_info *rt;
1296         struct fib6_node *fn;
1297
1298         /*
1299          * Get the "current" route for this destination and
1300          * check if the redirect has come from approriate router.
1301          *
1302          * RFC 2461 specifies that redirects should only be
1303          * accepted if they come from the nexthop to the target.
1304          * Due to the way the routes are chosen, this notion
1305          * is a bit fuzzy and one might need to check all possible
1306          * routes.
1307          */
1308
1309         read_lock_bh(&table->tb6_lock);
1310         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1311 restart:
1312         for (rt = fn->leaf; rt; rt = rt->u.next) {
1313                 /*
1314                  * Current route is on-link; redirect is always invalid.
1315                  *
1316                  * Seems, previous statement is not true. It could
1317                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1318                  * But then router serving it might decide, that we should
1319                  * know truth 8)8) --ANK (980726).
1320                  */
1321                 if (rt6_check_expired(rt))
1322                         continue;
1323                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1324                         continue;
1325                 if (fl->oif != rt->rt6i_dev->ifindex)
1326                         continue;
1327                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1328                         continue;
1329                 break;
1330         }
1331
1332         if (!rt)
1333                 rt = &ip6_null_entry;
1334         BACKTRACK(&fl->fl6_src);
1335 out:
1336         dst_hold(&rt->u.dst);
1337
1338         read_unlock_bh(&table->tb6_lock);
1339
1340         return rt;
1341 };
1342
1343 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1344                                            struct in6_addr *src,
1345                                            struct in6_addr *gateway,
1346                                            struct net_device *dev)
1347 {
1348         struct ip6rd_flowi rdfl = {
1349                 .fl = {
1350                         .oif = dev->ifindex,
1351                         .nl_u = {
1352                                 .ip6_u = {
1353                                         .daddr = *dest,
1354                                         .saddr = *src,
1355                                 },
1356                         },
1357                 },
1358                 .gateway = *gateway,
1359         };
1360         int flags = rt6_need_strict(dest) ? RT6_LOOKUP_F_IFACE : 0;
1361
1362         return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
1363 }
1364
1365 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1366                   struct in6_addr *saddr,
1367                   struct neighbour *neigh, u8 *lladdr, int on_link)
1368 {
1369         struct rt6_info *rt, *nrt = NULL;
1370         struct netevent_redirect netevent;
1371
1372         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1373
1374         if (rt == &ip6_null_entry) {
1375                 if (net_ratelimit())
1376                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1377                                "for redirect target\n");
1378                 goto out;
1379         }
1380
1381         /*
1382          *      We have finally decided to accept it.
1383          */
1384
1385         neigh_update(neigh, lladdr, NUD_STALE, 
1386                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1387                      NEIGH_UPDATE_F_OVERRIDE|
1388                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1389                                      NEIGH_UPDATE_F_ISROUTER))
1390                      );
1391
1392         /*
1393          * Redirect received -> path was valid.
1394          * Look, redirects are sent only in response to data packets,
1395          * so that this nexthop apparently is reachable. --ANK
1396          */
1397         dst_confirm(&rt->u.dst);
1398
1399         /* Duplicate redirect: silently ignore. */
1400         if (neigh == rt->u.dst.neighbour)
1401                 goto out;
1402
1403         nrt = ip6_rt_copy(rt);
1404         if (nrt == NULL)
1405                 goto out;
1406
1407         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1408         if (on_link)
1409                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1410
1411         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1412         nrt->rt6i_dst.plen = 128;
1413         nrt->u.dst.flags |= DST_HOST;
1414
1415         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1416         nrt->rt6i_nexthop = neigh_clone(neigh);
1417         /* Reset pmtu, it may be better */
1418         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1419         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1420
1421         if (ip6_ins_rt(nrt))
1422                 goto out;
1423
1424         netevent.old = &rt->u.dst;
1425         netevent.new = &nrt->u.dst;
1426         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1427
1428         if (rt->rt6i_flags&RTF_CACHE) {
1429                 ip6_del_rt(rt);
1430                 return;
1431         }
1432
1433 out:
1434         dst_release(&rt->u.dst);
1435         return;
1436 }
1437
1438 /*
1439  *      Handle ICMP "packet too big" messages
1440  *      i.e. Path MTU discovery
1441  */
1442
1443 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1444                         struct net_device *dev, u32 pmtu)
1445 {
1446         struct rt6_info *rt, *nrt;
1447         int allfrag = 0;
1448
1449         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1450         if (rt == NULL)
1451                 return;
1452
1453         if (pmtu >= dst_mtu(&rt->u.dst))
1454                 goto out;
1455
1456         if (pmtu < IPV6_MIN_MTU) {
1457                 /*
1458                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link 
1459                  * MTU (1280) and a fragment header should always be included
1460                  * after a node receiving Too Big message reporting PMTU is
1461                  * less than the IPv6 Minimum Link MTU.
1462                  */
1463                 pmtu = IPV6_MIN_MTU;
1464                 allfrag = 1;
1465         }
1466
1467         /* New mtu received -> path was valid.
1468            They are sent only in response to data packets,
1469            so that this nexthop apparently is reachable. --ANK
1470          */
1471         dst_confirm(&rt->u.dst);
1472
1473         /* Host route. If it is static, it would be better
1474            not to override it, but add new one, so that
1475            when cache entry will expire old pmtu
1476            would return automatically.
1477          */
1478         if (rt->rt6i_flags & RTF_CACHE) {
1479                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1480                 if (allfrag)
1481                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1482                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1483                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1484                 goto out;
1485         }
1486
1487         /* Network route.
1488            Two cases are possible:
1489            1. It is connected route. Action: COW
1490            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1491          */
1492         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1493                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1494         else
1495                 nrt = rt6_alloc_clone(rt, daddr);
1496
1497         if (nrt) {
1498                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1499                 if (allfrag)
1500                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1501
1502                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1503                  * happened within 5 mins, the recommended timer is 10 mins.
1504                  * Here this route expiration time is set to ip6_rt_mtu_expires
1505                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1506                  * and detecting PMTU increase will be automatically happened.
1507                  */
1508                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1509                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1510
1511                 ip6_ins_rt(nrt);
1512         }
1513 out:
1514         dst_release(&rt->u.dst);
1515 }
1516
1517 /*
1518  *      Misc support functions
1519  */
1520
1521 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1522 {
1523         struct rt6_info *rt = ip6_dst_alloc();
1524
1525         if (rt) {
1526                 rt->u.dst.input = ort->u.dst.input;
1527                 rt->u.dst.output = ort->u.dst.output;
1528
1529                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1530                 rt->u.dst.dev = ort->u.dst.dev;
1531                 if (rt->u.dst.dev)
1532                         dev_hold(rt->u.dst.dev);
1533                 rt->rt6i_idev = ort->rt6i_idev;
1534                 if (rt->rt6i_idev)
1535                         in6_dev_hold(rt->rt6i_idev);
1536                 rt->u.dst.lastuse = jiffies;
1537                 rt->rt6i_expires = 0;
1538
1539                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1540                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1541                 rt->rt6i_metric = 0;
1542
1543                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1544 #ifdef CONFIG_IPV6_SUBTREES
1545                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1546 #endif
1547                 rt->rt6i_table = ort->rt6i_table;
1548         }
1549         return rt;
1550 }
1551
1552 #ifdef CONFIG_IPV6_ROUTE_INFO
1553 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1554                                            struct in6_addr *gwaddr, int ifindex)
1555 {
1556         struct fib6_node *fn;
1557         struct rt6_info *rt = NULL;
1558         struct fib6_table *table;
1559
1560         table = fib6_get_table(RT6_TABLE_INFO);
1561         if (table == NULL)
1562                 return NULL;
1563
1564         write_lock_bh(&table->tb6_lock);
1565         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1566         if (!fn)
1567                 goto out;
1568
1569         for (rt = fn->leaf; rt; rt = rt->u.next) {
1570                 if (rt->rt6i_dev->ifindex != ifindex)
1571                         continue;
1572                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1573                         continue;
1574                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1575                         continue;
1576                 dst_hold(&rt->u.dst);
1577                 break;
1578         }
1579 out:
1580         write_unlock_bh(&table->tb6_lock);
1581         return rt;
1582 }
1583
1584 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1585                                            struct in6_addr *gwaddr, int ifindex,
1586                                            unsigned pref)
1587 {
1588         struct fib6_config cfg = {
1589                 .fc_table       = RT6_TABLE_INFO,
1590                 .fc_metric      = 1024,
1591                 .fc_ifindex     = ifindex,
1592                 .fc_dst_len     = prefixlen,
1593                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1594                                   RTF_UP | RTF_PREF(pref),
1595         };
1596
1597         ipv6_addr_copy(&cfg.fc_dst, prefix);
1598         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1599
1600         /* We should treat it as a default route if prefix length is 0. */
1601         if (!prefixlen)
1602                 cfg.fc_flags |= RTF_DEFAULT;
1603
1604         ip6_route_add(&cfg);
1605
1606         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1607 }
1608 #endif
1609
1610 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1611 {       
1612         struct rt6_info *rt;
1613         struct fib6_table *table;
1614
1615         table = fib6_get_table(RT6_TABLE_DFLT);
1616         if (table == NULL)
1617                 return NULL;
1618
1619         write_lock_bh(&table->tb6_lock);
1620         for (rt = table->tb6_root.leaf; rt; rt=rt->u.next) {
1621                 if (dev == rt->rt6i_dev &&
1622                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1623                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1624                         break;
1625         }
1626         if (rt)
1627                 dst_hold(&rt->u.dst);
1628         write_unlock_bh(&table->tb6_lock);
1629         return rt;
1630 }
1631
1632 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1633                                      struct net_device *dev,
1634                                      unsigned int pref)
1635 {
1636         struct fib6_config cfg = {
1637                 .fc_table       = RT6_TABLE_DFLT,
1638                 .fc_metric      = 1024,
1639                 .fc_ifindex     = dev->ifindex,
1640                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1641                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1642         };
1643
1644         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1645
1646         ip6_route_add(&cfg);
1647
1648         return rt6_get_dflt_router(gwaddr, dev);
1649 }
1650
1651 void rt6_purge_dflt_routers(void)
1652 {
1653         struct rt6_info *rt;
1654         struct fib6_table *table;
1655
1656         /* NOTE: Keep consistent with rt6_get_dflt_router */
1657         table = fib6_get_table(RT6_TABLE_DFLT);
1658         if (table == NULL)
1659                 return;
1660
1661 restart:
1662         read_lock_bh(&table->tb6_lock);
1663         for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
1664                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1665                         dst_hold(&rt->u.dst);
1666                         read_unlock_bh(&table->tb6_lock);
1667                         ip6_del_rt(rt);
1668                         goto restart;
1669                 }
1670         }
1671         read_unlock_bh(&table->tb6_lock);
1672 }
1673
1674 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1675                                  struct fib6_config *cfg)
1676 {
1677         memset(cfg, 0, sizeof(*cfg));
1678
1679         cfg->fc_table = RT6_TABLE_MAIN;
1680         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1681         cfg->fc_metric = rtmsg->rtmsg_metric;
1682         cfg->fc_expires = rtmsg->rtmsg_info;
1683         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1684         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1685         cfg->fc_flags = rtmsg->rtmsg_flags;
1686
1687         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1688         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1689         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1690 }
1691
1692 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1693 {
1694         struct fib6_config cfg;
1695         struct in6_rtmsg rtmsg;
1696         int err;
1697
1698         switch(cmd) {
1699         case SIOCADDRT:         /* Add a route */
1700         case SIOCDELRT:         /* Delete a route */
1701                 if (!capable(CAP_NET_ADMIN))
1702                         return -EPERM;
1703                 err = copy_from_user(&rtmsg, arg,
1704                                      sizeof(struct in6_rtmsg));
1705                 if (err)
1706                         return -EFAULT;
1707
1708                 rtmsg_to_fib6_config(&rtmsg, &cfg);
1709
1710                 rtnl_lock();
1711                 switch (cmd) {
1712                 case SIOCADDRT:
1713                         err = ip6_route_add(&cfg);
1714                         break;
1715                 case SIOCDELRT:
1716                         err = ip6_route_del(&cfg);
1717                         break;
1718                 default:
1719                         err = -EINVAL;
1720                 }
1721                 rtnl_unlock();
1722
1723                 return err;
1724         };
1725
1726         return -EINVAL;
1727 }
1728
1729 /*
1730  *      Drop the packet on the floor
1731  */
1732
1733 static int ip6_pkt_discard(struct sk_buff *skb)
1734 {
1735         int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1736         if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1737                 IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1738
1739         IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1740         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1741         kfree_skb(skb);
1742         return 0;
1743 }
1744
1745 static int ip6_pkt_discard_out(struct sk_buff *skb)
1746 {
1747         skb->dev = skb->dst->dev;
1748         return ip6_pkt_discard(skb);
1749 }
1750
1751 /*
1752  *      Allocate a dst for local (unicast / anycast) address.
1753  */
1754
1755 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1756                                     const struct in6_addr *addr,
1757                                     int anycast)
1758 {
1759         struct rt6_info *rt = ip6_dst_alloc();
1760
1761         if (rt == NULL)
1762                 return ERR_PTR(-ENOMEM);
1763
1764         dev_hold(&loopback_dev);
1765         in6_dev_hold(idev);
1766
1767         rt->u.dst.flags = DST_HOST;
1768         rt->u.dst.input = ip6_input;
1769         rt->u.dst.output = ip6_output;
1770         rt->rt6i_dev = &loopback_dev;
1771         rt->rt6i_idev = idev;
1772         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1773         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1774         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1775         rt->u.dst.obsolete = -1;
1776
1777         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1778         if (anycast)
1779                 rt->rt6i_flags |= RTF_ANYCAST;
1780         else
1781                 rt->rt6i_flags |= RTF_LOCAL;
1782         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1783         if (rt->rt6i_nexthop == NULL) {
1784                 dst_free((struct dst_entry *) rt);
1785                 return ERR_PTR(-ENOMEM);
1786         }
1787
1788         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1789         rt->rt6i_dst.plen = 128;
1790         rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1791
1792         atomic_set(&rt->u.dst.__refcnt, 1);
1793
1794         return rt;
1795 }
1796
1797 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1798 {
1799         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1800             rt != &ip6_null_entry) {
1801                 RT6_TRACE("deleted by ifdown %p\n", rt);
1802                 return -1;
1803         }
1804         return 0;
1805 }
1806
/*
 *      Walk all routing tables with fib6_ifdown() as the callback, passing
 *      @dev as its argument.  Per fib6_ifdown(), a NULL @dev matches routes
 *      on every device.
 */
void rt6_ifdown(struct net_device *dev)
{
        fib6_clean_all(fib6_ifdown, 0, dev);
}
1811
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg
{
        struct net_device *dev;         /* device whose MTU changed */
        unsigned mtu;                   /* the new MTU value */
};
1817
1818 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1819 {
1820         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1821         struct inet6_dev *idev;
1822
1823         /* In IPv6 pmtu discovery is not optional,
1824            so that RTAX_MTU lock cannot disable it.
1825            We still use this lock to block changes
1826            caused by addrconf/ndisc.
1827         */
1828
1829         idev = __in6_dev_get(arg->dev);
1830         if (idev == NULL)
1831                 return 0;
1832
1833         /* For administrative MTU increase, there is no way to discover
1834            IPv6 PMTU increase, so PMTU increase should be updated here.
1835            Since RFC 1981 doesn't include administrative MTU increase
1836            update PMTU increase is a MUST. (i.e. jumbo frame)
1837          */
1838         /*
1839            If new MTU is less than route PMTU, this new MTU will be the
1840            lowest MTU in the path, update the route PMTU to reflect PMTU
1841            decreases; if new MTU is greater than route PMTU, and the
1842            old MTU is the lowest MTU in the path, update the route PMTU
1843            to reflect the increase. In this case if the other nodes' MTU
1844            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1845            PMTU discouvery.
1846          */
1847         if (rt->rt6i_dev == arg->dev &&
1848             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1849             (dst_mtu(&rt->u.dst) > arg->mtu ||
1850              (dst_mtu(&rt->u.dst) < arg->mtu &&
1851               dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1852                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1853         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1854         return 0;
1855 }
1856
1857 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1858 {
1859         struct rt6_mtu_change_arg arg = {
1860                 .dev = dev,
1861                 .mtu = mtu,
1862         };
1863
1864         fib6_clean_all(rt6_mtu_change_route, 0, &arg);
1865 }
1866
1867 static struct nla_policy rtm_ipv6_policy[RTA_MAX+1] __read_mostly = {
1868         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
1869         [RTA_OIF]               = { .type = NLA_U32 },
1870         [RTA_IIF]               = { .type = NLA_U32 },
1871         [RTA_PRIORITY]          = { .type = NLA_U32 },
1872         [RTA_METRICS]           = { .type = NLA_NESTED },
1873 };
1874
1875 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1876                               struct fib6_config *cfg)
1877 {
1878         struct rtmsg *rtm;
1879         struct nlattr *tb[RTA_MAX+1];
1880         int err;
1881
1882         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1883         if (err < 0)
1884                 goto errout;
1885
1886         err = -EINVAL;
1887         rtm = nlmsg_data(nlh);
1888         memset(cfg, 0, sizeof(*cfg));
1889
1890         cfg->fc_table = rtm->rtm_table;
1891         cfg->fc_dst_len = rtm->rtm_dst_len;
1892         cfg->fc_src_len = rtm->rtm_src_len;
1893         cfg->fc_flags = RTF_UP;
1894         cfg->fc_protocol = rtm->rtm_protocol;
1895
1896         if (rtm->rtm_type == RTN_UNREACHABLE)
1897                 cfg->fc_flags |= RTF_REJECT;
1898
1899         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1900         cfg->fc_nlinfo.nlh = nlh;
1901
1902         if (tb[RTA_GATEWAY]) {
1903                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1904                 cfg->fc_flags |= RTF_GATEWAY;
1905         }
1906
1907         if (tb[RTA_DST]) {
1908                 int plen = (rtm->rtm_dst_len + 7) >> 3;
1909
1910                 if (nla_len(tb[RTA_DST]) < plen)
1911                         goto errout;
1912
1913                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1914         }
1915
1916         if (tb[RTA_SRC]) {
1917                 int plen = (rtm->rtm_src_len + 7) >> 3;
1918
1919                 if (nla_len(tb[RTA_SRC]) < plen)
1920                         goto errout;
1921
1922                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1923         }
1924
1925         if (tb[RTA_OIF])
1926                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1927
1928         if (tb[RTA_PRIORITY])
1929                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
1930
1931         if (tb[RTA_METRICS]) {
1932                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
1933                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
1934         }
1935
1936         if (tb[RTA_TABLE])
1937                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
1938
1939         err = 0;
1940 errout:
1941         return err;
1942 }
1943
1944 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1945 {
1946         struct fib6_config cfg;
1947         int err;
1948
1949         err = rtm_to_fib6_config(skb, nlh, &cfg);
1950         if (err < 0)
1951                 return err;
1952
1953         return ip6_route_del(&cfg);
1954 }
1955
1956 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1957 {
1958         struct fib6_config cfg;
1959         int err;
1960
1961         err = rtm_to_fib6_config(skb, nlh, &cfg);
1962         if (err < 0)
1963                 return err;
1964
1965         return ip6_route_add(&cfg);
1966 }
1967
/*
 *      Append one rtnetlink route message describing @rt to @skb.
 *
 *      @dst, @src: when non-NULL, reported as RTA_DST / RTA_SRC with a
 *                  prefix length of 128 instead of the route's own prefix
 *      @iif:       when non-zero, reported as RTA_IIF
 *      @type:      netlink message type placed in the header
 *      @pid, @seq: netlink pid and sequence number for the header
 *      @prefix:    when non-zero, only routes flagged RTF_PREFIX_RT are
 *                  emitted; any other route is skipped
 *      @flags:     netlink message flags for the header
 *
 *      Returns 1 without writing anything when the route is skipped by the
 *      @prefix filter, -ENOBUFS if the header cannot be added, the value
 *      of nlmsg_end() on success, and the value of nlmsg_cancel() when an
 *      attribute could not be added.
 *
 *      NOTE(review): the NLA_PUT*() macros are defined outside this file;
 *      they presumably jump to nla_put_failure when the skb is full.
 */
static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
                         struct in6_addr *dst, struct in6_addr *src,
                         int iif, int type, u32 pid, u32 seq,
                         int prefix, unsigned int flags)
{
        struct rtmsg *rtm;
        struct nlmsghdr *nlh;
        struct rta_cacheinfo ci;
        u32 table;

        if (prefix) {   /* user wants prefix routes only */
                if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
                        /* success since this is not a prefix route */
                        return 1;
                }
        }

        nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
        if (nlh == NULL)
                return -ENOBUFS;

        rtm = nlmsg_data(nlh);
        rtm->rtm_family = AF_INET6;
        rtm->rtm_dst_len = rt->rt6i_dst.plen;
        rtm->rtm_src_len = rt->rt6i_src.plen;
        rtm->rtm_tos = 0;
        if (rt->rt6i_table)
                table = rt->rt6i_table->tb6_id;
        else
                table = RT6_TABLE_UNSPEC;
        /* The table id is reported both in the header and as RTA_TABLE. */
        rtm->rtm_table = table;
        NLA_PUT_U32(skb, RTA_TABLE, table);
        if (rt->rt6i_flags&RTF_REJECT)
                rtm->rtm_type = RTN_UNREACHABLE;
        else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
                rtm->rtm_type = RTN_LOCAL;
        else
                rtm->rtm_type = RTN_UNICAST;
        rtm->rtm_flags = 0;
        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
        /* Start from the stored protocol, then let route flags override it. */
        rtm->rtm_protocol = rt->rt6i_protocol;
        if (rt->rt6i_flags&RTF_DYNAMIC)
                rtm->rtm_protocol = RTPROT_REDIRECT;
        else if (rt->rt6i_flags & RTF_ADDRCONF)
                rtm->rtm_protocol = RTPROT_KERNEL;
        else if (rt->rt6i_flags&RTF_DEFAULT)
                rtm->rtm_protocol = RTPROT_RA;

        if (rt->rt6i_flags&RTF_CACHE)
                rtm->rtm_flags |= RTM_F_CLONED;

        /* A caller-supplied address is reported as a full /128. */
        if (dst) {
                NLA_PUT(skb, RTA_DST, 16, dst);
                rtm->rtm_dst_len = 128;
        } else if (rtm->rtm_dst_len)
                NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
#ifdef CONFIG_IPV6_SUBTREES
        if (src) {
                NLA_PUT(skb, RTA_SRC, 16, src);
                rtm->rtm_src_len = 128;
        } else if (rtm->rtm_src_len)
                NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
#endif
        if (iif)
                NLA_PUT_U32(skb, RTA_IIF, iif);
        else if (dst) {
                /* No input interface: report the source address chosen for dst. */
                struct in6_addr saddr_buf;
                if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
                        NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
        }

        if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
                goto nla_put_failure;

        if (rt->u.dst.neighbour)
                NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);

        if (rt->u.dst.dev)
                NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);

        NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
        /* Cache statistics; times are converted from jiffies to clock_t. */
        ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
        if (rt->rt6i_expires)
                ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
        else
                ci.rta_expires = 0;
        ci.rta_used = rt->u.dst.__use;
        ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
        ci.rta_error = rt->u.dst.error;
        ci.rta_id = 0;
        ci.rta_ts = 0;
        ci.rta_tsage = 0;
        NLA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);

        return nlmsg_end(skb, nlh);

nla_put_failure:
        return nlmsg_cancel(skb, nlh);
}
2067
2068 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2069 {
2070         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2071         int prefix;
2072
2073         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2074                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2075                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2076         } else
2077                 prefix = 0;
2078
2079         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2080                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2081                      prefix, NLM_F_MULTI);
2082 }
2083
2084 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2085 {
2086         struct nlattr *tb[RTA_MAX+1];
2087         struct rt6_info *rt;
2088         struct sk_buff *skb;
2089         struct rtmsg *rtm;
2090         struct flowi fl;
2091         int err, iif = 0;
2092
2093         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2094         if (err < 0)
2095                 goto errout;
2096
2097         err = -EINVAL;
2098         memset(&fl, 0, sizeof(fl));
2099
2100         if (tb[RTA_SRC]) {
2101                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2102                         goto errout;
2103
2104                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2105         }
2106
2107         if (tb[RTA_DST]) {
2108                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2109                         goto errout;
2110
2111                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2112         }
2113
2114         if (tb[RTA_IIF])
2115                 iif = nla_get_u32(tb[RTA_IIF]);
2116
2117         if (tb[RTA_OIF])
2118                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2119
2120         if (iif) {
2121                 struct net_device *dev;
2122                 dev = __dev_get_by_index(iif);
2123                 if (!dev) {
2124                         err = -ENODEV;
2125                         goto errout;
2126                 }
2127         }
2128
2129         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2130         if (skb == NULL) {
2131                 err = -ENOBUFS;
2132                 goto errout;
2133         }
2134
2135         /* Reserve room for dummy headers, this skb can pass
2136            through good chunk of routing engine.
2137          */
2138         skb->mac.raw = skb->data;
2139         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2140
2141         rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2142         skb->dst = &rt->u.dst;
2143
2144         err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2145                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2146                             nlh->nlmsg_seq, 0, 0);
2147         if (err < 0) {
2148                 kfree_skb(skb);
2149                 goto errout;
2150         }
2151
2152         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2153 errout:
2154         return err;
2155 }
2156
2157 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2158 {
2159         struct sk_buff *skb;
2160         u32 pid = 0, seq = 0;
2161         struct nlmsghdr *nlh = NULL;
2162         int payload = sizeof(struct rtmsg) + 256;
2163         int err = -ENOBUFS;
2164
2165         if (info) {
2166                 pid = info->pid;
2167                 nlh = info->nlh;
2168                 if (nlh)
2169                         seq = nlh->nlmsg_seq;
2170         }
2171
2172         skb = nlmsg_new(nlmsg_total_size(payload), gfp_any());
2173         if (skb == NULL)
2174                 goto errout;
2175
2176         err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2177         if (err < 0) {
2178                 kfree_skb(skb);
2179                 goto errout;
2180         }
2181
2182         err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2183 errout:
2184         if (err < 0)
2185                 rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2186 }
2187
2188 /*
2189  *      /proc
2190  */
2191
2192 #ifdef CONFIG_PROC_FS
2193
/*
 * Nominal size in bytes (150) of one record written by rt6_info_route().
 * rt6_info_route() and rt6_proc_info() use it to turn a byte offset into
 * a number of whole records to skip plus a remainder.
 * NOTE(review): the device name is printed with "%8s", so a name longer
 * than 8 characters would presumably make a record longer than this.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2195
/* State shared between rt6_proc_info() and its rt6_info_route() callback. */
struct rt6_proc_arg
{
        char *buffer;   /* output buffer supplied by the proc read */
        int offset;     /* byte offset requested by the reader */
        int length;     /* number of bytes requested by the reader */
        int skip;       /* routes skipped so far to reach the offset */
        int len;        /* bytes written to buffer so far */
};
2204
2205 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2206 {
2207         struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2208         int i;
2209
2210         if (arg->skip < arg->offset / RT6_INFO_LEN) {
2211                 arg->skip++;
2212                 return 0;
2213         }
2214
2215         if (arg->len >= arg->length)
2216                 return 0;
2217
2218         for (i=0; i<16; i++) {
2219                 sprintf(arg->buffer + arg->len, "%02x",
2220                         rt->rt6i_dst.addr.s6_addr[i]);
2221                 arg->len += 2;
2222         }
2223         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2224                             rt->rt6i_dst.plen);
2225
2226 #ifdef CONFIG_IPV6_SUBTREES
2227         for (i=0; i<16; i++) {
2228                 sprintf(arg->buffer + arg->len, "%02x",
2229                         rt->rt6i_src.addr.s6_addr[i]);
2230                 arg->len += 2;
2231         }
2232         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2233                             rt->rt6i_src.plen);
2234 #else
2235         sprintf(arg->buffer + arg->len,
2236                 "00000000000000000000000000000000 00 ");
2237         arg->len += 36;
2238 #endif
2239
2240         if (rt->rt6i_nexthop) {
2241                 for (i=0; i<16; i++) {
2242                         sprintf(arg->buffer + arg->len, "%02x",
2243                                 rt->rt6i_nexthop->primary_key[i]);
2244                         arg->len += 2;
2245                 }
2246         } else {
2247                 sprintf(arg->buffer + arg->len,
2248                         "00000000000000000000000000000000");
2249                 arg->len += 32;
2250         }
2251         arg->len += sprintf(arg->buffer + arg->len,
2252                             " %08x %08x %08x %08x %8s\n",
2253                             rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2254                             rt->u.dst.__use, rt->rt6i_flags, 
2255                             rt->rt6i_dev ? rt->rt6i_dev->name : "");
2256         return 0;
2257 }
2258
2259 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2260 {
2261         struct rt6_proc_arg arg = {
2262                 .buffer = buffer,
2263                 .offset = offset,
2264                 .length = length,
2265         };
2266
2267         fib6_clean_all(rt6_info_route, 0, &arg);
2268
2269         *start = buffer;
2270         if (offset)
2271                 *start += offset % RT6_INFO_LEN;
2272
2273         arg.len -= offset % RT6_INFO_LEN;
2274
2275         if (arg.len > length)
2276                 arg.len = length;
2277         if (arg.len < 0)
2278                 arg.len = 0;
2279
2280         return arg.len;
2281 }
2282
/*
 *      seq_file show callback: print the rt6_stats counters and the
 *      current number of ip6_dst_ops entries as one line of seven
 *      hexadecimal fields.  Always returns 0.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
        seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
                      rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
                      rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
                      rt6_stats.fib_rt_cache,
                      atomic_read(&ip6_dst_ops.entries),
                      rt6_stats.fib_discarded_routes);

        return 0;
}
2294
/* Open handler: bind the file to rt6_stats_seq_show() via single_open(). */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt6_stats_seq_show, NULL);
}
2299
/* File operations for the "rt6_stats" proc entry (registered in ip6_route_init()). */
static struct file_operations rt6_stats_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt6_stats_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release,
};
2307 #endif  /* CONFIG_PROC_FS */
2308
2309 #ifdef CONFIG_SYSCTL
2310
2311 static int flush_delay;
2312
2313 static
2314 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2315                               void __user *buffer, size_t *lenp, loff_t *ppos)
2316 {
2317         if (write) {
2318                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2319                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2320                 return 0;
2321         } else
2322                 return -EINVAL;
2323 }
2324
2325 ctl_table ipv6_route_table[] = {
2326         {
2327                 .ctl_name       =       NET_IPV6_ROUTE_FLUSH, 
2328                 .procname       =       "flush",
2329                 .data           =       &flush_delay,
2330                 .maxlen         =       sizeof(int),
2331                 .mode           =       0200,
2332                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2333         },
2334         {
2335                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2336                 .procname       =       "gc_thresh",
2337                 .data           =       &ip6_dst_ops.gc_thresh,
2338                 .maxlen         =       sizeof(int),
2339                 .mode           =       0644,
2340                 .proc_handler   =       &proc_dointvec,
2341         },
2342         {
2343                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2344                 .procname       =       "max_size",
2345                 .data           =       &ip6_rt_max_size,
2346                 .maxlen         =       sizeof(int),
2347                 .mode           =       0644,
2348                 .proc_handler   =       &proc_dointvec,
2349         },
2350         {
2351                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2352                 .procname       =       "gc_min_interval",
2353                 .data           =       &ip6_rt_gc_min_interval,
2354                 .maxlen         =       sizeof(int),
2355                 .mode           =       0644,
2356                 .proc_handler   =       &proc_dointvec_jiffies,
2357                 .strategy       =       &sysctl_jiffies,
2358         },
2359         {
2360                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2361                 .procname       =       "gc_timeout",
2362                 .data           =       &ip6_rt_gc_timeout,
2363                 .maxlen         =       sizeof(int),
2364                 .mode           =       0644,
2365                 .proc_handler   =       &proc_dointvec_jiffies,
2366                 .strategy       =       &sysctl_jiffies,
2367         },
2368         {
2369                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2370                 .procname       =       "gc_interval",
2371                 .data           =       &ip6_rt_gc_interval,
2372                 .maxlen         =       sizeof(int),
2373                 .mode           =       0644,
2374                 .proc_handler   =       &proc_dointvec_jiffies,
2375                 .strategy       =       &sysctl_jiffies,
2376         },
2377         {
2378                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2379                 .procname       =       "gc_elasticity",
2380                 .data           =       &ip6_rt_gc_elasticity,
2381                 .maxlen         =       sizeof(int),
2382                 .mode           =       0644,
2383                 .proc_handler   =       &proc_dointvec_jiffies,
2384                 .strategy       =       &sysctl_jiffies,
2385         },
2386         {
2387                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2388                 .procname       =       "mtu_expires",
2389                 .data           =       &ip6_rt_mtu_expires,
2390                 .maxlen         =       sizeof(int),
2391                 .mode           =       0644,
2392                 .proc_handler   =       &proc_dointvec_jiffies,
2393                 .strategy       =       &sysctl_jiffies,
2394         },
2395         {
2396                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2397                 .procname       =       "min_adv_mss",
2398                 .data           =       &ip6_rt_min_advmss,
2399                 .maxlen         =       sizeof(int),
2400                 .mode           =       0644,
2401                 .proc_handler   =       &proc_dointvec_jiffies,
2402                 .strategy       =       &sysctl_jiffies,
2403         },
2404         {
2405                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2406                 .procname       =       "gc_min_interval_ms",
2407                 .data           =       &ip6_rt_gc_min_interval,
2408                 .maxlen         =       sizeof(int),
2409                 .mode           =       0644,
2410                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2411                 .strategy       =       &sysctl_ms_jiffies,
2412         },
2413         { .ctl_name = 0 }
2414 };
2415
2416 #endif
2417
/*
 *      Boot-time initialisation of the IPv6 routing code: create the slab
 *      cache used for struct rt6_info, initialise the FIB, register the
 *      "ipv6_route" and "rt6_stats" proc entries when procfs is
 *      configured, and initialise the optional xfrm6 and policy-rule
 *      subsystems.
 */
void __init ip6_route_init(void)
{
        struct proc_dir_entry *p;

        ip6_dst_ops.kmem_cachep =
                kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
        fib6_init();
#ifdef  CONFIG_PROC_FS
        /* Route listing, served by rt6_proc_info(). */
        p = proc_net_create("ipv6_route", 0, rt6_proc_info);
        if (p)
                p->owner = THIS_MODULE;

        /* Routing statistics, served through rt6_stats_seq_fops. */
        proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
#endif
#ifdef CONFIG_XFRM
        xfrm6_init();
#endif
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        fib6_rules_init();
#endif
}
2440
/*
 *      Tear down what ip6_route_init() set up: the policy rules, the proc
 *      entries and xfrm6 (each only when configured), then all routes, the
 *      FIB garbage collector state, and finally the rt6_info slab cache.
 */
void ip6_route_cleanup(void)
{
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        fib6_rules_cleanup();
#endif
#ifdef CONFIG_PROC_FS
        proc_net_remove("ipv6_route");
        proc_net_remove("rt6_stats");
#endif
#ifdef CONFIG_XFRM
        xfrm6_fini();
#endif
        /* NULL matches routes on every device (see fib6_ifdown()). */
        rt6_ifdown(NULL);
        fib6_gc_cleanup();
        kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
}