]> bbs.cooldavid.org Git - net-next-2.6.git/blob - net/ipv6/route.c
[DECNET]: Fix input routing bug
[net-next-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  *      Ville Nuorvala
26  *              Fixed routing subtrees.
27  */
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41
42 #ifdef  CONFIG_PROC_FS
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #endif
46
47 #include <net/snmp.h>
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
53 #include <net/tcp.h>
54 #include <linux/rtnetlink.h>
55 #include <net/dst.h>
56 #include <net/xfrm.h>
57 #include <net/netevent.h>
58 #include <net/netlink.h>
59
60 #include <asm/uaccess.h>
61
62 #ifdef CONFIG_SYSCTL
63 #include <linux/sysctl.h>
64 #endif
65
/*
 * Debug verbosity for this file.  RDBG() and RT6_TRACE() only call
 * printk() when RT6_DEBUG is 3 or higher; below that they are no-ops.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/*
 * Selects, in the policy route lookup paths, whether a matched route that
 * already has a next hop (or needs none) is cloned with rt6_alloc_clone()
 * (non-zero) or returned as is (zero).
 */
#define CLONE_OFFLINK_ROUTE 0
78
79 static int ip6_rt_max_size = 4096;
80 static int ip6_rt_gc_min_interval = HZ / 2;
81 static int ip6_rt_gc_timeout = 60*HZ;
82 int ip6_rt_gc_interval = 30*HZ;
83 static int ip6_rt_gc_elasticity = 9;
84 static int ip6_rt_mtu_expires = 10*60*HZ;
85 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
86
87 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
88 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
89 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
90 static void             ip6_dst_destroy(struct dst_entry *);
91 static void             ip6_dst_ifdown(struct dst_entry *,
92                                        struct net_device *dev, int how);
93 static int               ip6_dst_gc(void);
94
95 static int              ip6_pkt_discard(struct sk_buff *skb);
96 static int              ip6_pkt_discard_out(struct sk_buff *skb);
97 static void             ip6_link_failure(struct sk_buff *skb);
98 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
99
100 #ifdef CONFIG_IPV6_ROUTE_INFO
101 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
102                                            struct in6_addr *gwaddr, int ifindex,
103                                            unsigned pref);
104 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
105                                            struct in6_addr *gwaddr, int ifindex);
106 #endif
107
108 static struct dst_ops ip6_dst_ops = {
109         .family                 =       AF_INET6,
110         .protocol               =       __constant_htons(ETH_P_IPV6),
111         .gc                     =       ip6_dst_gc,
112         .gc_thresh              =       1024,
113         .check                  =       ip6_dst_check,
114         .destroy                =       ip6_dst_destroy,
115         .ifdown                 =       ip6_dst_ifdown,
116         .negative_advice        =       ip6_negative_advice,
117         .link_failure           =       ip6_link_failure,
118         .update_pmtu            =       ip6_rt_update_pmtu,
119         .entry_size             =       sizeof(struct rt6_info),
120 };
121
122 struct rt6_info ip6_null_entry = {
123         .u = {
124                 .dst = {
125                         .__refcnt       = ATOMIC_INIT(1),
126                         .__use          = 1,
127                         .dev            = &loopback_dev,
128                         .obsolete       = -1,
129                         .error          = -ENETUNREACH,
130                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
131                         .input          = ip6_pkt_discard,
132                         .output         = ip6_pkt_discard_out,
133                         .ops            = &ip6_dst_ops,
134                         .path           = (struct dst_entry*)&ip6_null_entry,
135                 }
136         },
137         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
138         .rt6i_metric    = ~(u32) 0,
139         .rt6i_ref       = ATOMIC_INIT(1),
140 };
141
142 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
143
144 struct rt6_info ip6_prohibit_entry = {
145         .u = {
146                 .dst = {
147                         .__refcnt       = ATOMIC_INIT(1),
148                         .__use          = 1,
149                         .dev            = &loopback_dev,
150                         .obsolete       = -1,
151                         .error          = -EACCES,
152                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
153                         .input          = ip6_pkt_discard,
154                         .output         = ip6_pkt_discard_out,
155                         .ops            = &ip6_dst_ops,
156                         .path           = (struct dst_entry*)&ip6_prohibit_entry,
157                 }
158         },
159         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
160         .rt6i_metric    = ~(u32) 0,
161         .rt6i_ref       = ATOMIC_INIT(1),
162 };
163
164 struct rt6_info ip6_blk_hole_entry = {
165         .u = {
166                 .dst = {
167                         .__refcnt       = ATOMIC_INIT(1),
168                         .__use          = 1,
169                         .dev            = &loopback_dev,
170                         .obsolete       = -1,
171                         .error          = -EINVAL,
172                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
173                         .input          = ip6_pkt_discard,
174                         .output         = ip6_pkt_discard_out,
175                         .ops            = &ip6_dst_ops,
176                         .path           = (struct dst_entry*)&ip6_blk_hole_entry,
177                 }
178         },
179         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
180         .rt6i_metric    = ~(u32) 0,
181         .rt6i_ref       = ATOMIC_INIT(1),
182 };
183
184 #endif
185
186 /* allocate dst with ip6_dst_ops */
187 static __inline__ struct rt6_info *ip6_dst_alloc(void)
188 {
189         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
190 }
191
192 static void ip6_dst_destroy(struct dst_entry *dst)
193 {
194         struct rt6_info *rt = (struct rt6_info *)dst;
195         struct inet6_dev *idev = rt->rt6i_idev;
196
197         if (idev != NULL) {
198                 rt->rt6i_idev = NULL;
199                 in6_dev_put(idev);
200         }       
201 }
202
203 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
204                            int how)
205 {
206         struct rt6_info *rt = (struct rt6_info *)dst;
207         struct inet6_dev *idev = rt->rt6i_idev;
208
209         if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
210                 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
211                 if (loopback_idev != NULL) {
212                         rt->rt6i_idev = loopback_idev;
213                         in6_dev_put(idev);
214                 }
215         }
216 }
217
218 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
219 {
220         return (rt->rt6i_flags & RTF_EXPIRES &&
221                 time_after(jiffies, rt->rt6i_expires));
222 }
223
224 static inline int rt6_need_strict(struct in6_addr *daddr)
225 {
226         return (ipv6_addr_type(daddr) &
227                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
228 }
229
230 /*
231  *      Route lookup. Any table->tb6_lock is implied.
232  */
233
234 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
235                                                     int oif,
236                                                     int strict)
237 {
238         struct rt6_info *local = NULL;
239         struct rt6_info *sprt;
240
241         if (oif) {
242                 for (sprt = rt; sprt; sprt = sprt->u.next) {
243                         struct net_device *dev = sprt->rt6i_dev;
244                         if (dev->ifindex == oif)
245                                 return sprt;
246                         if (dev->flags & IFF_LOOPBACK) {
247                                 if (sprt->rt6i_idev == NULL ||
248                                     sprt->rt6i_idev->dev->ifindex != oif) {
249                                         if (strict && oif)
250                                                 continue;
251                                         if (local && (!oif || 
252                                                       local->rt6i_idev->dev->ifindex == oif))
253                                                 continue;
254                                 }
255                                 local = sprt;
256                         }
257                 }
258
259                 if (local)
260                         return local;
261
262                 if (strict)
263                         return &ip6_null_entry;
264         }
265         return rt;
266 }
267
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a Neighbour Solicitation towards the
 * next hop of @rt when its neighbour entry is not in a NUD_VALID state
 * and the last update is older than the device's rtr_probe_interval.
 *
 * @rt may be NULL, in which case nothing is done.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one per
 * minute.  The rate limit is implemented through neigh->updated, which
 * this function writes; the write lock is therefore required so that two
 * concurrent callers cannot both pass the time check and both send a
 * probe.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;
	int probe;

	/* cheap unlocked pre-check; re-evaluated under the lock below */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	write_lock_bh(&neigh->lock);
	probe = !(neigh->nud_state & NUD_VALID) &&
		time_after(jiffies,
			   neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
	if (probe)
		neigh->updated = jiffies;
	write_unlock_bh(&neigh->lock);

	if (!probe)
		return;

	/* solicit the next hop via its solicited-node multicast address */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Router preference support is compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
	return;
}
#endif
303
304 /*
305  * Default Router Selection (RFC 2461 6.3.6)
306  */
307 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
308 {
309         struct net_device *dev = rt->rt6i_dev;
310         if (!oif || dev->ifindex == oif)
311                 return 2;
312         if ((dev->flags & IFF_LOOPBACK) &&
313             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
314                 return 1;
315         return 0;
316 }
317
318 static int inline rt6_check_neigh(struct rt6_info *rt)
319 {
320         struct neighbour *neigh = rt->rt6i_nexthop;
321         int m = 0;
322         if (rt->rt6i_flags & RTF_NONEXTHOP ||
323             !(rt->rt6i_flags & RTF_GATEWAY))
324                 m = 1;
325         else if (neigh) {
326                 read_lock_bh(&neigh->lock);
327                 if (neigh->nud_state & NUD_VALID)
328                         m = 2;
329                 read_unlock_bh(&neigh->lock);
330         }
331         return m;
332 }
333
334 static int rt6_score_route(struct rt6_info *rt, int oif,
335                            int strict)
336 {
337         int m, n;
338                 
339         m = rt6_check_dev(rt, oif);
340         if (!m && (strict & RT6_LOOKUP_F_IFACE))
341                 return -1;
342 #ifdef CONFIG_IPV6_ROUTER_PREF
343         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
344 #endif
345         n = rt6_check_neigh(rt);
346         if (n > 1)
347                 m |= 16;
348         else if (!n && strict & RT6_LOOKUP_F_REACHABLE)
349                 return -1;
350         return m;
351 }
352
353 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
354                                    int strict)
355 {
356         struct rt6_info *match = NULL, *last = NULL;
357         struct rt6_info *rt, *rt0 = *head;
358         u32 metric;
359         int mpri = -1;
360
361         RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
362                   __FUNCTION__, head, head ? *head : NULL, oif);
363
364         for (rt = rt0, metric = rt0->rt6i_metric;
365              rt && rt->rt6i_metric == metric && (!last || rt != rt0);
366              rt = rt->u.next) {
367                 int m;
368
369                 if (rt6_check_expired(rt))
370                         continue;
371
372                 last = rt;
373
374                 m = rt6_score_route(rt, oif, strict);
375                 if (m < 0)
376                         continue;
377
378                 if (m > mpri) {
379                         rt6_probe(match);
380                         match = rt;
381                         mpri = m;
382                 } else {
383                         rt6_probe(rt);
384                 }
385         }
386
387         if (!match &&
388             (strict & RT6_LOOKUP_F_REACHABLE) &&
389             last && last != rt0) {
390                 /* no entries matched; do round-robin */
391                 static DEFINE_SPINLOCK(lock);
392                 spin_lock(&lock);
393                 *head = rt0->u.next;
394                 rt0->u.next = last->u.next;
395                 last->u.next = rt0;
396                 spin_unlock(&lock);
397         }
398
399         RT6_TRACE("%s() => %p, score=%d\n",
400                   __FUNCTION__, match, mpri);
401
402         return (match ? match : &ip6_null_entry);
403 }
404
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a received route information option.
 *
 * @dev:    device the option arrived on; its ifindex keys the route
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address used for the route
 *
 * The option is validated (length field at most 3 and large enough for
 * prefix_len, prefix_len at most 128).  An existing matching route is
 * deleted when the lifetime is zero; a missing route is added when the
 * lifetime is non-zero; otherwise the route's preference flags are
 * refreshed.  A lifetime of 0xffffffff clears RTF_EXPIRES, any other
 * value sets the expiry relative to jiffies.
 *
 * Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	u32 lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		pref = ICMPV6_ROUTER_PREF_MEDIUM;

	/* the lifetime arrives in network byte order: convert to host order */
	lifetime = ntohl(rinfo->lifetime);
	if (lifetime == 0xffffffff) {
		/* infinity */
	} else if (lifetime > 0x7fffffff/HZ) {
		/* Avoid arithmetic overflow */
		lifetime = 0x7fffffff/HZ - 1;
	}

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (lifetime == 0xffffffff) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		/* NOTE(review): presumably drops the reference taken by the
		 * get/add helper above - confirm against their definitions. */
		dst_release(&rt->u.dst);
	}
	return 0;
}
#endif
482
/*
 * BACKTRACK(saddr) - continue a failed lookup further up the fib6 tree.
 *
 * Expands in a function that has local variables 'rt' and 'fn' and labels
 * 'out' and 'restart'.  It does nothing unless rt is &ip6_null_entry.  In
 * that case it walks from fn towards the root: at each step it moves to
 * the parent, or - when the parent has a subtree that is not the node we
 * came from - looks saddr up in that subtree.  It jumps to 'restart' at
 * the first node flagged RTN_RTINFO and to 'out' once fn is flagged
 * RTN_TL_ROOT.
 */
#define BACKTRACK(saddr) \
do { \
	if (rt == &ip6_null_entry) { \
		struct fib6_node *pn; \
		for (;;) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(pn->subtree, NULL, saddr); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while(0)
500
501 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
502                                              struct flowi *fl, int flags)
503 {
504         struct fib6_node *fn;
505         struct rt6_info *rt;
506
507         read_lock_bh(&table->tb6_lock);
508         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
509 restart:
510         rt = fn->leaf;
511         rt = rt6_device_match(rt, fl->oif, flags);
512         BACKTRACK(&fl->fl6_src);
513 out:
514         dst_hold(&rt->u.dst);
515         read_unlock_bh(&table->tb6_lock);
516
517         rt->u.dst.lastuse = jiffies;
518         rt->u.dst.__use++;
519
520         return rt;
521
522 }
523
524 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
525                             int oif, int strict)
526 {
527         struct flowi fl = {
528                 .oif = oif,
529                 .nl_u = {
530                         .ip6_u = {
531                                 .daddr = *daddr,
532                         },
533                 },
534         };
535         struct dst_entry *dst;
536         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
537
538         if (saddr) {
539                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
540                 flags |= RT6_LOOKUP_F_HAS_SADDR;
541         }
542
543         dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
544         if (dst->error == 0)
545                 return (struct rt6_info *) dst;
546
547         dst_release(dst);
548
549         return NULL;
550 }
551
552 /* ip6_ins_rt is called with FREE table->tb6_lock.
553    It takes new route entry, the addition fails by any reason the
554    route is freed. In any case, if caller does not hold it, it may
555    be destroyed.
556  */
557
558 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
559 {
560         int err;
561         struct fib6_table *table;
562
563         table = rt->rt6i_table;
564         write_lock_bh(&table->tb6_lock);
565         err = fib6_add(&table->tb6_root, rt, info);
566         write_unlock_bh(&table->tb6_lock);
567
568         return err;
569 }
570
571 int ip6_ins_rt(struct rt6_info *rt)
572 {
573         return __ip6_ins_rt(rt, NULL);
574 }
575
576 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
577                                       struct in6_addr *saddr)
578 {
579         struct rt6_info *rt;
580
581         /*
582          *      Clone the route.
583          */
584
585         rt = ip6_rt_copy(ort);
586
587         if (rt) {
588                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
589                         if (rt->rt6i_dst.plen != 128 &&
590                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
591                                 rt->rt6i_flags |= RTF_ANYCAST;
592                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
593                 }
594
595                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
596                 rt->rt6i_dst.plen = 128;
597                 rt->rt6i_flags |= RTF_CACHE;
598                 rt->u.dst.flags |= DST_HOST;
599
600 #ifdef CONFIG_IPV6_SUBTREES
601                 if (rt->rt6i_src.plen && saddr) {
602                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
603                         rt->rt6i_src.plen = 128;
604                 }
605 #endif
606
607                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
608
609         }
610
611         return rt;
612 }
613
614 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
615 {
616         struct rt6_info *rt = ip6_rt_copy(ort);
617         if (rt) {
618                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
619                 rt->rt6i_dst.plen = 128;
620                 rt->rt6i_flags |= RTF_CACHE;
621                 rt->u.dst.flags |= DST_HOST;
622                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
623         }
624         return rt;
625 }
626
627 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
628                                             struct flowi *fl, int flags)
629 {
630         struct fib6_node *fn;
631         struct rt6_info *rt, *nrt;
632         int strict = 0;
633         int attempts = 3;
634         int err;
635         int reachable = RT6_LOOKUP_F_REACHABLE;
636
637         strict |= flags & RT6_LOOKUP_F_IFACE;
638
639 relookup:
640         read_lock_bh(&table->tb6_lock);
641
642 restart_2:
643         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
644
645 restart:
646         rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
647         BACKTRACK(&fl->fl6_src);
648         if (rt == &ip6_null_entry ||
649             rt->rt6i_flags & RTF_CACHE)
650                 goto out;
651
652         dst_hold(&rt->u.dst);
653         read_unlock_bh(&table->tb6_lock);
654
655         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
656                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
657         else {
658 #if CLONE_OFFLINK_ROUTE
659                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
660 #else
661                 goto out2;
662 #endif
663         }
664
665         dst_release(&rt->u.dst);
666         rt = nrt ? : &ip6_null_entry;
667
668         dst_hold(&rt->u.dst);
669         if (nrt) {
670                 err = ip6_ins_rt(nrt);
671                 if (!err)
672                         goto out2;
673         }
674
675         if (--attempts <= 0)
676                 goto out2;
677
678         /*
679          * Race condition! In the gap, when table->tb6_lock was
680          * released someone could insert this route.  Relookup.
681          */
682         dst_release(&rt->u.dst);
683         goto relookup;
684
685 out:
686         if (reachable) {
687                 reachable = 0;
688                 goto restart_2;
689         }
690         dst_hold(&rt->u.dst);
691         read_unlock_bh(&table->tb6_lock);
692 out2:
693         rt->u.dst.lastuse = jiffies;
694         rt->u.dst.__use++;
695
696         return rt;
697 }
698
699 void ip6_route_input(struct sk_buff *skb)
700 {
701         struct ipv6hdr *iph = skb->nh.ipv6h;
702         int flags = RT6_LOOKUP_F_HAS_SADDR;
703         struct flowi fl = {
704                 .iif = skb->dev->ifindex,
705                 .nl_u = {
706                         .ip6_u = {
707                                 .daddr = iph->daddr,
708                                 .saddr = iph->saddr,
709 #ifdef CONFIG_IPV6_ROUTE_FWMARK
710                                 .fwmark = skb->nfmark,
711 #endif
712                                 .flowlabel = (* (u32 *) iph)&IPV6_FLOWINFO_MASK,
713                         },
714                 },
715                 .proto = iph->nexthdr,
716         };
717
718         if (rt6_need_strict(&iph->daddr))
719                 flags |= RT6_LOOKUP_F_IFACE;
720
721         skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
722 }
723
724 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
725                                              struct flowi *fl, int flags)
726 {
727         struct fib6_node *fn;
728         struct rt6_info *rt, *nrt;
729         int strict = 0;
730         int attempts = 3;
731         int err;
732         int reachable = RT6_LOOKUP_F_REACHABLE;
733
734         strict |= flags & RT6_LOOKUP_F_IFACE;
735
736 relookup:
737         read_lock_bh(&table->tb6_lock);
738
739 restart_2:
740         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
741
742 restart:
743         rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
744         BACKTRACK(&fl->fl6_src);
745         if (rt == &ip6_null_entry ||
746             rt->rt6i_flags & RTF_CACHE)
747                 goto out;
748
749         dst_hold(&rt->u.dst);
750         read_unlock_bh(&table->tb6_lock);
751
752         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
753                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
754         else {
755 #if CLONE_OFFLINK_ROUTE
756                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
757 #else
758                 goto out2;
759 #endif
760         }
761
762         dst_release(&rt->u.dst);
763         rt = nrt ? : &ip6_null_entry;
764
765         dst_hold(&rt->u.dst);
766         if (nrt) {
767                 err = ip6_ins_rt(nrt);
768                 if (!err)
769                         goto out2;
770         }
771
772         if (--attempts <= 0)
773                 goto out2;
774
775         /*
776          * Race condition! In the gap, when table->tb6_lock was
777          * released someone could insert this route.  Relookup.
778          */
779         dst_release(&rt->u.dst);
780         goto relookup;
781
782 out:
783         if (reachable) {
784                 reachable = 0;
785                 goto restart_2;
786         }
787         dst_hold(&rt->u.dst);
788         read_unlock_bh(&table->tb6_lock);
789 out2:
790         rt->u.dst.lastuse = jiffies;
791         rt->u.dst.__use++;
792         return rt;
793 }
794
795 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
796 {
797         int flags = 0;
798
799         if (rt6_need_strict(&fl->fl6_dst))
800                 flags |= RT6_LOOKUP_F_IFACE;
801
802         if (!ipv6_addr_any(&fl->fl6_src))
803                 flags |= RT6_LOOKUP_F_HAS_SADDR;
804
805         return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
806 }
807
808
809 /*
810  *      Destination cache support functions
811  */
812
813 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
814 {
815         struct rt6_info *rt;
816
817         rt = (struct rt6_info *) dst;
818
819         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
820                 return dst;
821
822         return NULL;
823 }
824
825 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
826 {
827         struct rt6_info *rt = (struct rt6_info *) dst;
828
829         if (rt) {
830                 if (rt->rt6i_flags & RTF_CACHE)
831                         ip6_del_rt(rt);
832                 else
833                         dst_release(dst);
834         }
835         return NULL;
836 }
837
838 static void ip6_link_failure(struct sk_buff *skb)
839 {
840         struct rt6_info *rt;
841
842         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
843
844         rt = (struct rt6_info *) skb->dst;
845         if (rt) {
846                 if (rt->rt6i_flags&RTF_CACHE) {
847                         dst_set_expires(&rt->u.dst, 0);
848                         rt->rt6i_flags |= RTF_EXPIRES;
849                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
850                         rt->rt6i_node->fn_sernum = -1;
851         }
852 }
853
854 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
855 {
856         struct rt6_info *rt6 = (struct rt6_info*)dst;
857
858         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
859                 rt6->rt6i_flags |= RTF_MODIFIED;
860                 if (mtu < IPV6_MIN_MTU) {
861                         mtu = IPV6_MIN_MTU;
862                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
863                 }
864                 dst->metrics[RTAX_MTU-1] = mtu;
865                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
866         }
867 }
868
869 static int ipv6_get_mtu(struct net_device *dev);
870
871 static inline unsigned int ipv6_advmss(unsigned int mtu)
872 {
873         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
874
875         if (mtu < ip6_rt_min_advmss)
876                 mtu = ip6_rt_min_advmss;
877
878         /*
879          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 
880          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
881          * IPV6_MAXPLEN is also valid and means: "any MSS, 
882          * rely only on pmtu discovery"
883          */
884         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
885                 mtu = IPV6_MAXPLEN;
886         return mtu;
887 }
888
889 static struct dst_entry *ndisc_dst_gc_list;
890 static DEFINE_SPINLOCK(ndisc_lock);
891
892 struct dst_entry *ndisc_dst_alloc(struct net_device *dev, 
893                                   struct neighbour *neigh,
894                                   struct in6_addr *addr,
895                                   int (*output)(struct sk_buff *))
896 {
897         struct rt6_info *rt;
898         struct inet6_dev *idev = in6_dev_get(dev);
899
900         if (unlikely(idev == NULL))
901                 return NULL;
902
903         rt = ip6_dst_alloc();
904         if (unlikely(rt == NULL)) {
905                 in6_dev_put(idev);
906                 goto out;
907         }
908
909         dev_hold(dev);
910         if (neigh)
911                 neigh_hold(neigh);
912         else
913                 neigh = ndisc_get_neigh(dev, addr);
914
915         rt->rt6i_dev      = dev;
916         rt->rt6i_idev     = idev;
917         rt->rt6i_nexthop  = neigh;
918         atomic_set(&rt->u.dst.__refcnt, 1);
919         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
920         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
921         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
922         rt->u.dst.output  = output;
923
924 #if 0   /* there's no chance to use these for ndisc */
925         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST 
926                                 ? DST_HOST 
927                                 : 0;
928         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
929         rt->rt6i_dst.plen = 128;
930 #endif
931
932         spin_lock_bh(&ndisc_lock);
933         rt->u.dst.next = ndisc_dst_gc_list;
934         ndisc_dst_gc_list = &rt->u.dst;
935         spin_unlock_bh(&ndisc_lock);
936
937         fib6_force_start_gc();
938
939 out:
940         return (struct dst_entry *)rt;
941 }
942
943 int ndisc_dst_gc(int *more)
944 {
945         struct dst_entry *dst, *next, **pprev;
946         int freed;
947
948         next = NULL;
949         freed = 0;
950
951         spin_lock_bh(&ndisc_lock);
952         pprev = &ndisc_dst_gc_list;
953
954         while ((dst = *pprev) != NULL) {
955                 if (!atomic_read(&dst->__refcnt)) {
956                         *pprev = dst->next;
957                         dst_free(dst);
958                         freed++;
959                 } else {
960                         pprev = &dst->next;
961                         (*more)++;
962                 }
963         }
964
965         spin_unlock_bh(&ndisc_lock);
966
967         return freed;
968 }
969
970 static int ip6_dst_gc(void)
971 {
972         static unsigned expire = 30*HZ;
973         static unsigned long last_gc;
974         unsigned long now = jiffies;
975
976         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
977             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
978                 goto out;
979
980         expire++;
981         fib6_run_gc(expire);
982         last_gc = now;
983         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
984                 expire = ip6_rt_gc_timeout>>1;
985
986 out:
987         expire -= expire>>ip6_rt_gc_elasticity;
988         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
989 }
990
991 /* Clean host part of a prefix. Not necessary in radix tree,
992    but results in cleaner routing tables.
993
994    Remove it only when all the things will work!
995  */
996
997 static int ipv6_get_mtu(struct net_device *dev)
998 {
999         int mtu = IPV6_MIN_MTU;
1000         struct inet6_dev *idev;
1001
1002         idev = in6_dev_get(dev);
1003         if (idev) {
1004                 mtu = idev->cnf.mtu6;
1005                 in6_dev_put(idev);
1006         }
1007         return mtu;
1008 }
1009
1010 int ipv6_get_hoplimit(struct net_device *dev)
1011 {
1012         int hoplimit = ipv6_devconf.hop_limit;
1013         struct inet6_dev *idev;
1014
1015         idev = in6_dev_get(dev);
1016         if (idev) {
1017                 hoplimit = idev->cnf.hop_limit;
1018                 in6_dev_put(idev);
1019         }
1020         return hoplimit;
1021 }
1022
1023 /*
1024  *
1025  */
1026
1027 int ip6_route_add(struct fib6_config *cfg)
1028 {
1029         int err;
1030         struct rt6_info *rt = NULL;
1031         struct net_device *dev = NULL;
1032         struct inet6_dev *idev = NULL;
1033         struct fib6_table *table;
1034         int addr_type;
1035
1036         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1037                 return -EINVAL;
1038 #ifndef CONFIG_IPV6_SUBTREES
1039         if (cfg->fc_src_len)
1040                 return -EINVAL;
1041 #endif
1042         if (cfg->fc_ifindex) {
1043                 err = -ENODEV;
1044                 dev = dev_get_by_index(cfg->fc_ifindex);
1045                 if (!dev)
1046                         goto out;
1047                 idev = in6_dev_get(dev);
1048                 if (!idev)
1049                         goto out;
1050         }
1051
1052         if (cfg->fc_metric == 0)
1053                 cfg->fc_metric = IP6_RT_PRIO_USER;
1054
1055         table = fib6_new_table(cfg->fc_table);
1056         if (table == NULL) {
1057                 err = -ENOBUFS;
1058                 goto out;
1059         }
1060
1061         rt = ip6_dst_alloc();
1062
1063         if (rt == NULL) {
1064                 err = -ENOMEM;
1065                 goto out;
1066         }
1067
1068         rt->u.dst.obsolete = -1;
1069         rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1070
1071         if (cfg->fc_protocol == RTPROT_UNSPEC)
1072                 cfg->fc_protocol = RTPROT_BOOT;
1073         rt->rt6i_protocol = cfg->fc_protocol;
1074
1075         addr_type = ipv6_addr_type(&cfg->fc_dst);
1076
1077         if (addr_type & IPV6_ADDR_MULTICAST)
1078                 rt->u.dst.input = ip6_mc_input;
1079         else
1080                 rt->u.dst.input = ip6_forward;
1081
1082         rt->u.dst.output = ip6_output;
1083
1084         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1085         rt->rt6i_dst.plen = cfg->fc_dst_len;
1086         if (rt->rt6i_dst.plen == 128)
1087                rt->u.dst.flags = DST_HOST;
1088
1089 #ifdef CONFIG_IPV6_SUBTREES
1090         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1091         rt->rt6i_src.plen = cfg->fc_src_len;
1092 #endif
1093
1094         rt->rt6i_metric = cfg->fc_metric;
1095
1096         /* We cannot add true routes via loopback here,
1097            they would result in kernel looping; promote them to reject routes
1098          */
1099         if ((cfg->fc_flags & RTF_REJECT) ||
1100             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1101                 /* hold loopback dev/idev if we haven't done so. */
1102                 if (dev != &loopback_dev) {
1103                         if (dev) {
1104                                 dev_put(dev);
1105                                 in6_dev_put(idev);
1106                         }
1107                         dev = &loopback_dev;
1108                         dev_hold(dev);
1109                         idev = in6_dev_get(dev);
1110                         if (!idev) {
1111                                 err = -ENODEV;
1112                                 goto out;
1113                         }
1114                 }
1115                 rt->u.dst.output = ip6_pkt_discard_out;
1116                 rt->u.dst.input = ip6_pkt_discard;
1117                 rt->u.dst.error = -ENETUNREACH;
1118                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1119                 goto install_route;
1120         }
1121
1122         if (cfg->fc_flags & RTF_GATEWAY) {
1123                 struct in6_addr *gw_addr;
1124                 int gwa_type;
1125
1126                 gw_addr = &cfg->fc_gateway;
1127                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1128                 gwa_type = ipv6_addr_type(gw_addr);
1129
1130                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1131                         struct rt6_info *grt;
1132
1133                         /* IPv6 strictly inhibits using not link-local
1134                            addresses as nexthop address.
1135                            Otherwise, router will not able to send redirects.
1136                            It is very good, but in some (rare!) circumstances
1137                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1138                            some exceptions. --ANK
1139                          */
1140                         err = -EINVAL;
1141                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1142                                 goto out;
1143
1144                         grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1145
1146                         err = -EHOSTUNREACH;
1147                         if (grt == NULL)
1148                                 goto out;
1149                         if (dev) {
1150                                 if (dev != grt->rt6i_dev) {
1151                                         dst_release(&grt->u.dst);
1152                                         goto out;
1153                                 }
1154                         } else {
1155                                 dev = grt->rt6i_dev;
1156                                 idev = grt->rt6i_idev;
1157                                 dev_hold(dev);
1158                                 in6_dev_hold(grt->rt6i_idev);
1159                         }
1160                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1161                                 err = 0;
1162                         dst_release(&grt->u.dst);
1163
1164                         if (err)
1165                                 goto out;
1166                 }
1167                 err = -EINVAL;
1168                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1169                         goto out;
1170         }
1171
1172         err = -ENODEV;
1173         if (dev == NULL)
1174                 goto out;
1175
1176         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1177                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1178                 if (IS_ERR(rt->rt6i_nexthop)) {
1179                         err = PTR_ERR(rt->rt6i_nexthop);
1180                         rt->rt6i_nexthop = NULL;
1181                         goto out;
1182                 }
1183         }
1184
1185         rt->rt6i_flags = cfg->fc_flags;
1186
1187 install_route:
1188         if (cfg->fc_mx) {
1189                 struct nlattr *nla;
1190                 int remaining;
1191
1192                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1193                         int type = nla->nla_type;
1194
1195                         if (type) {
1196                                 if (type > RTAX_MAX) {
1197                                         err = -EINVAL;
1198                                         goto out;
1199                                 }
1200
1201                                 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1202                         }
1203                 }
1204         }
1205
1206         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1207                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1208         if (!rt->u.dst.metrics[RTAX_MTU-1])
1209                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1210         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1211                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1212         rt->u.dst.dev = dev;
1213         rt->rt6i_idev = idev;
1214         rt->rt6i_table = table;
1215         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1216
1217 out:
1218         if (dev)
1219                 dev_put(dev);
1220         if (idev)
1221                 in6_dev_put(idev);
1222         if (rt)
1223                 dst_free((struct dst_entry *) rt);
1224         return err;
1225 }
1226
1227 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1228 {
1229         int err;
1230         struct fib6_table *table;
1231
1232         if (rt == &ip6_null_entry)
1233                 return -ENOENT;
1234
1235         table = rt->rt6i_table;
1236         write_lock_bh(&table->tb6_lock);
1237
1238         err = fib6_del(rt, info);
1239         dst_release(&rt->u.dst);
1240
1241         write_unlock_bh(&table->tb6_lock);
1242
1243         return err;
1244 }
1245
1246 int ip6_del_rt(struct rt6_info *rt)
1247 {
1248         return __ip6_del_rt(rt, NULL);
1249 }
1250
1251 static int ip6_route_del(struct fib6_config *cfg)
1252 {
1253         struct fib6_table *table;
1254         struct fib6_node *fn;
1255         struct rt6_info *rt;
1256         int err = -ESRCH;
1257
1258         table = fib6_get_table(cfg->fc_table);
1259         if (table == NULL)
1260                 return err;
1261
1262         read_lock_bh(&table->tb6_lock);
1263
1264         fn = fib6_locate(&table->tb6_root,
1265                          &cfg->fc_dst, cfg->fc_dst_len,
1266                          &cfg->fc_src, cfg->fc_src_len);
1267         
1268         if (fn) {
1269                 for (rt = fn->leaf; rt; rt = rt->u.next) {
1270                         if (cfg->fc_ifindex &&
1271                             (rt->rt6i_dev == NULL ||
1272                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1273                                 continue;
1274                         if (cfg->fc_flags & RTF_GATEWAY &&
1275                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1276                                 continue;
1277                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1278                                 continue;
1279                         dst_hold(&rt->u.dst);
1280                         read_unlock_bh(&table->tb6_lock);
1281
1282                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1283                 }
1284         }
1285         read_unlock_bh(&table->tb6_lock);
1286
1287         return err;
1288 }
1289
1290 /*
1291  *      Handle redirects
1292  */
1293 struct ip6rd_flowi {
1294         struct flowi fl;
1295         struct in6_addr gateway;
1296 };
1297
1298 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1299                                              struct flowi *fl,
1300                                              int flags)
1301 {
1302         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1303         struct rt6_info *rt;
1304         struct fib6_node *fn;
1305
1306         /*
1307          * Get the "current" route for this destination and
1308          * check if the redirect has come from approriate router.
1309          *
1310          * RFC 2461 specifies that redirects should only be
1311          * accepted if they come from the nexthop to the target.
1312          * Due to the way the routes are chosen, this notion
1313          * is a bit fuzzy and one might need to check all possible
1314          * routes.
1315          */
1316
1317         read_lock_bh(&table->tb6_lock);
1318         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1319 restart:
1320         for (rt = fn->leaf; rt; rt = rt->u.next) {
1321                 /*
1322                  * Current route is on-link; redirect is always invalid.
1323                  *
1324                  * Seems, previous statement is not true. It could
1325                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1326                  * But then router serving it might decide, that we should
1327                  * know truth 8)8) --ANK (980726).
1328                  */
1329                 if (rt6_check_expired(rt))
1330                         continue;
1331                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1332                         continue;
1333                 if (fl->oif != rt->rt6i_dev->ifindex)
1334                         continue;
1335                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1336                         continue;
1337                 break;
1338         }
1339
1340         if (!rt)
1341                 rt = &ip6_null_entry;
1342         BACKTRACK(&fl->fl6_src);
1343 out:
1344         dst_hold(&rt->u.dst);
1345
1346         read_unlock_bh(&table->tb6_lock);
1347
1348         return rt;
1349 };
1350
1351 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1352                                            struct in6_addr *src,
1353                                            struct in6_addr *gateway,
1354                                            struct net_device *dev)
1355 {
1356         int flags = RT6_LOOKUP_F_HAS_SADDR;
1357         struct ip6rd_flowi rdfl = {
1358                 .fl = {
1359                         .oif = dev->ifindex,
1360                         .nl_u = {
1361                                 .ip6_u = {
1362                                         .daddr = *dest,
1363                                         .saddr = *src,
1364                                 },
1365                         },
1366                 },
1367                 .gateway = *gateway,
1368         };
1369
1370         if (rt6_need_strict(dest))
1371                 flags |= RT6_LOOKUP_F_IFACE;
1372
1373         return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
1374 }
1375
1376 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1377                   struct in6_addr *saddr,
1378                   struct neighbour *neigh, u8 *lladdr, int on_link)
1379 {
1380         struct rt6_info *rt, *nrt = NULL;
1381         struct netevent_redirect netevent;
1382
1383         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1384
1385         if (rt == &ip6_null_entry) {
1386                 if (net_ratelimit())
1387                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1388                                "for redirect target\n");
1389                 goto out;
1390         }
1391
1392         /*
1393          *      We have finally decided to accept it.
1394          */
1395
1396         neigh_update(neigh, lladdr, NUD_STALE, 
1397                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1398                      NEIGH_UPDATE_F_OVERRIDE|
1399                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1400                                      NEIGH_UPDATE_F_ISROUTER))
1401                      );
1402
1403         /*
1404          * Redirect received -> path was valid.
1405          * Look, redirects are sent only in response to data packets,
1406          * so that this nexthop apparently is reachable. --ANK
1407          */
1408         dst_confirm(&rt->u.dst);
1409
1410         /* Duplicate redirect: silently ignore. */
1411         if (neigh == rt->u.dst.neighbour)
1412                 goto out;
1413
1414         nrt = ip6_rt_copy(rt);
1415         if (nrt == NULL)
1416                 goto out;
1417
1418         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1419         if (on_link)
1420                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1421
1422         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1423         nrt->rt6i_dst.plen = 128;
1424         nrt->u.dst.flags |= DST_HOST;
1425
1426         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1427         nrt->rt6i_nexthop = neigh_clone(neigh);
1428         /* Reset pmtu, it may be better */
1429         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1430         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1431
1432         if (ip6_ins_rt(nrt))
1433                 goto out;
1434
1435         netevent.old = &rt->u.dst;
1436         netevent.new = &nrt->u.dst;
1437         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1438
1439         if (rt->rt6i_flags&RTF_CACHE) {
1440                 ip6_del_rt(rt);
1441                 return;
1442         }
1443
1444 out:
1445         dst_release(&rt->u.dst);
1446         return;
1447 }
1448
1449 /*
1450  *      Handle ICMP "packet too big" messages
1451  *      i.e. Path MTU discovery
1452  */
1453
1454 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1455                         struct net_device *dev, u32 pmtu)
1456 {
1457         struct rt6_info *rt, *nrt;
1458         int allfrag = 0;
1459
1460         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1461         if (rt == NULL)
1462                 return;
1463
1464         if (pmtu >= dst_mtu(&rt->u.dst))
1465                 goto out;
1466
1467         if (pmtu < IPV6_MIN_MTU) {
1468                 /*
1469                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link 
1470                  * MTU (1280) and a fragment header should always be included
1471                  * after a node receiving Too Big message reporting PMTU is
1472                  * less than the IPv6 Minimum Link MTU.
1473                  */
1474                 pmtu = IPV6_MIN_MTU;
1475                 allfrag = 1;
1476         }
1477
1478         /* New mtu received -> path was valid.
1479            They are sent only in response to data packets,
1480            so that this nexthop apparently is reachable. --ANK
1481          */
1482         dst_confirm(&rt->u.dst);
1483
1484         /* Host route. If it is static, it would be better
1485            not to override it, but add new one, so that
1486            when cache entry will expire old pmtu
1487            would return automatically.
1488          */
1489         if (rt->rt6i_flags & RTF_CACHE) {
1490                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1491                 if (allfrag)
1492                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1493                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1494                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1495                 goto out;
1496         }
1497
1498         /* Network route.
1499            Two cases are possible:
1500            1. It is connected route. Action: COW
1501            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1502          */
1503         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1504                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1505         else
1506                 nrt = rt6_alloc_clone(rt, daddr);
1507
1508         if (nrt) {
1509                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1510                 if (allfrag)
1511                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1512
1513                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1514                  * happened within 5 mins, the recommended timer is 10 mins.
1515                  * Here this route expiration time is set to ip6_rt_mtu_expires
1516                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1517                  * and detecting PMTU increase will be automatically happened.
1518                  */
1519                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1520                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1521
1522                 ip6_ins_rt(nrt);
1523         }
1524 out:
1525         dst_release(&rt->u.dst);
1526 }
1527
1528 /*
1529  *      Misc support functions
1530  */
1531
1532 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1533 {
1534         struct rt6_info *rt = ip6_dst_alloc();
1535
1536         if (rt) {
1537                 rt->u.dst.input = ort->u.dst.input;
1538                 rt->u.dst.output = ort->u.dst.output;
1539
1540                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1541                 rt->u.dst.error = ort->u.dst.error;
1542                 rt->u.dst.dev = ort->u.dst.dev;
1543                 if (rt->u.dst.dev)
1544                         dev_hold(rt->u.dst.dev);
1545                 rt->rt6i_idev = ort->rt6i_idev;
1546                 if (rt->rt6i_idev)
1547                         in6_dev_hold(rt->rt6i_idev);
1548                 rt->u.dst.lastuse = jiffies;
1549                 rt->rt6i_expires = 0;
1550
1551                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1552                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1553                 rt->rt6i_metric = 0;
1554
1555                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1556 #ifdef CONFIG_IPV6_SUBTREES
1557                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1558 #endif
1559                 rt->rt6i_table = ort->rt6i_table;
1560         }
1561         return rt;
1562 }
1563
1564 #ifdef CONFIG_IPV6_ROUTE_INFO
1565 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1566                                            struct in6_addr *gwaddr, int ifindex)
1567 {
1568         struct fib6_node *fn;
1569         struct rt6_info *rt = NULL;
1570         struct fib6_table *table;
1571
1572         table = fib6_get_table(RT6_TABLE_INFO);
1573         if (table == NULL)
1574                 return NULL;
1575
1576         write_lock_bh(&table->tb6_lock);
1577         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1578         if (!fn)
1579                 goto out;
1580
1581         for (rt = fn->leaf; rt; rt = rt->u.next) {
1582                 if (rt->rt6i_dev->ifindex != ifindex)
1583                         continue;
1584                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1585                         continue;
1586                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1587                         continue;
1588                 dst_hold(&rt->u.dst);
1589                 break;
1590         }
1591 out:
1592         write_unlock_bh(&table->tb6_lock);
1593         return rt;
1594 }
1595
1596 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1597                                            struct in6_addr *gwaddr, int ifindex,
1598                                            unsigned pref)
1599 {
1600         struct fib6_config cfg = {
1601                 .fc_table       = RT6_TABLE_INFO,
1602                 .fc_metric      = 1024,
1603                 .fc_ifindex     = ifindex,
1604                 .fc_dst_len     = prefixlen,
1605                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1606                                   RTF_UP | RTF_PREF(pref),
1607         };
1608
1609         ipv6_addr_copy(&cfg.fc_dst, prefix);
1610         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1611
1612         /* We should treat it as a default route if prefix length is 0. */
1613         if (!prefixlen)
1614                 cfg.fc_flags |= RTF_DEFAULT;
1615
1616         ip6_route_add(&cfg);
1617
1618         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1619 }
1620 #endif
1621
1622 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1623 {       
1624         struct rt6_info *rt;
1625         struct fib6_table *table;
1626
1627         table = fib6_get_table(RT6_TABLE_DFLT);
1628         if (table == NULL)
1629                 return NULL;
1630
1631         write_lock_bh(&table->tb6_lock);
1632         for (rt = table->tb6_root.leaf; rt; rt=rt->u.next) {
1633                 if (dev == rt->rt6i_dev &&
1634                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1635                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1636                         break;
1637         }
1638         if (rt)
1639                 dst_hold(&rt->u.dst);
1640         write_unlock_bh(&table->tb6_lock);
1641         return rt;
1642 }
1643
1644 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1645                                      struct net_device *dev,
1646                                      unsigned int pref)
1647 {
1648         struct fib6_config cfg = {
1649                 .fc_table       = RT6_TABLE_DFLT,
1650                 .fc_metric      = 1024,
1651                 .fc_ifindex     = dev->ifindex,
1652                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1653                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1654         };
1655
1656         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1657
1658         ip6_route_add(&cfg);
1659
1660         return rt6_get_dflt_router(gwaddr, dev);
1661 }
1662
1663 void rt6_purge_dflt_routers(void)
1664 {
1665         struct rt6_info *rt;
1666         struct fib6_table *table;
1667
1668         /* NOTE: Keep consistent with rt6_get_dflt_router */
1669         table = fib6_get_table(RT6_TABLE_DFLT);
1670         if (table == NULL)
1671                 return;
1672
1673 restart:
1674         read_lock_bh(&table->tb6_lock);
1675         for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
1676                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1677                         dst_hold(&rt->u.dst);
1678                         read_unlock_bh(&table->tb6_lock);
1679                         ip6_del_rt(rt);
1680                         goto restart;
1681                 }
1682         }
1683         read_unlock_bh(&table->tb6_lock);
1684 }
1685
1686 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1687                                  struct fib6_config *cfg)
1688 {
1689         memset(cfg, 0, sizeof(*cfg));
1690
1691         cfg->fc_table = RT6_TABLE_MAIN;
1692         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1693         cfg->fc_metric = rtmsg->rtmsg_metric;
1694         cfg->fc_expires = rtmsg->rtmsg_info;
1695         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1696         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1697         cfg->fc_flags = rtmsg->rtmsg_flags;
1698
1699         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1700         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1701         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1702 }
1703
1704 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1705 {
1706         struct fib6_config cfg;
1707         struct in6_rtmsg rtmsg;
1708         int err;
1709
1710         switch(cmd) {
1711         case SIOCADDRT:         /* Add a route */
1712         case SIOCDELRT:         /* Delete a route */
1713                 if (!capable(CAP_NET_ADMIN))
1714                         return -EPERM;
1715                 err = copy_from_user(&rtmsg, arg,
1716                                      sizeof(struct in6_rtmsg));
1717                 if (err)
1718                         return -EFAULT;
1719
1720                 rtmsg_to_fib6_config(&rtmsg, &cfg);
1721
1722                 rtnl_lock();
1723                 switch (cmd) {
1724                 case SIOCADDRT:
1725                         err = ip6_route_add(&cfg);
1726                         break;
1727                 case SIOCDELRT:
1728                         err = ip6_route_del(&cfg);
1729                         break;
1730                 default:
1731                         err = -EINVAL;
1732                 }
1733                 rtnl_unlock();
1734
1735                 return err;
1736         };
1737
1738         return -EINVAL;
1739 }
1740
1741 /*
1742  *      Drop the packet on the floor
1743  */
1744
1745 static int ip6_pkt_discard(struct sk_buff *skb)
1746 {
1747         int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1748         if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1749                 IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1750
1751         IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1752         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1753         kfree_skb(skb);
1754         return 0;
1755 }
1756
1757 static int ip6_pkt_discard_out(struct sk_buff *skb)
1758 {
1759         skb->dev = skb->dst->dev;
1760         return ip6_pkt_discard(skb);
1761 }
1762
1763 /*
1764  *      Allocate a dst for local (unicast / anycast) address.
1765  */
1766
1767 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1768                                     const struct in6_addr *addr,
1769                                     int anycast)
1770 {
1771         struct rt6_info *rt = ip6_dst_alloc();
1772
1773         if (rt == NULL)
1774                 return ERR_PTR(-ENOMEM);
1775
1776         dev_hold(&loopback_dev);
1777         in6_dev_hold(idev);
1778
1779         rt->u.dst.flags = DST_HOST;
1780         rt->u.dst.input = ip6_input;
1781         rt->u.dst.output = ip6_output;
1782         rt->rt6i_dev = &loopback_dev;
1783         rt->rt6i_idev = idev;
1784         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1785         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1786         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1787         rt->u.dst.obsolete = -1;
1788
1789         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1790         if (anycast)
1791                 rt->rt6i_flags |= RTF_ANYCAST;
1792         else
1793                 rt->rt6i_flags |= RTF_LOCAL;
1794         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1795         if (rt->rt6i_nexthop == NULL) {
1796                 dst_free((struct dst_entry *) rt);
1797                 return ERR_PTR(-ENOMEM);
1798         }
1799
1800         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1801         rt->rt6i_dst.plen = 128;
1802         rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1803
1804         atomic_set(&rt->u.dst.__refcnt, 1);
1805
1806         return rt;
1807 }
1808
1809 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1810 {
1811         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1812             rt != &ip6_null_entry) {
1813                 RT6_TRACE("deleted by ifdown %p\n", rt);
1814                 return -1;
1815         }
1816         return 0;
1817 }
1818
1819 void rt6_ifdown(struct net_device *dev)
1820 {
1821         fib6_clean_all(fib6_ifdown, 0, dev);
1822 }
1823
/* Argument handed to rt6_mtu_change_route() for each route walked. */
struct rt6_mtu_change_arg
{
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU of that device */
};
1829
1830 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1831 {
1832         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1833         struct inet6_dev *idev;
1834
1835         /* In IPv6 pmtu discovery is not optional,
1836            so that RTAX_MTU lock cannot disable it.
1837            We still use this lock to block changes
1838            caused by addrconf/ndisc.
1839         */
1840
1841         idev = __in6_dev_get(arg->dev);
1842         if (idev == NULL)
1843                 return 0;
1844
1845         /* For administrative MTU increase, there is no way to discover
1846            IPv6 PMTU increase, so PMTU increase should be updated here.
1847            Since RFC 1981 doesn't include administrative MTU increase
1848            update PMTU increase is a MUST. (i.e. jumbo frame)
1849          */
1850         /*
1851            If new MTU is less than route PMTU, this new MTU will be the
1852            lowest MTU in the path, update the route PMTU to reflect PMTU
1853            decreases; if new MTU is greater than route PMTU, and the
1854            old MTU is the lowest MTU in the path, update the route PMTU
1855            to reflect the increase. In this case if the other nodes' MTU
1856            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1857            PMTU discouvery.
1858          */
1859         if (rt->rt6i_dev == arg->dev &&
1860             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1861             (dst_mtu(&rt->u.dst) > arg->mtu ||
1862              (dst_mtu(&rt->u.dst) < arg->mtu &&
1863               dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1864                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1865         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1866         return 0;
1867 }
1868
1869 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1870 {
1871         struct rt6_mtu_change_arg arg = {
1872                 .dev = dev,
1873                 .mtu = mtu,
1874         };
1875
1876         fib6_clean_all(rt6_mtu_change_route, 0, &arg);
1877 }
1878
1879 static struct nla_policy rtm_ipv6_policy[RTA_MAX+1] __read_mostly = {
1880         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
1881         [RTA_OIF]               = { .type = NLA_U32 },
1882         [RTA_IIF]               = { .type = NLA_U32 },
1883         [RTA_PRIORITY]          = { .type = NLA_U32 },
1884         [RTA_METRICS]           = { .type = NLA_NESTED },
1885 };
1886
1887 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1888                               struct fib6_config *cfg)
1889 {
1890         struct rtmsg *rtm;
1891         struct nlattr *tb[RTA_MAX+1];
1892         int err;
1893
1894         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1895         if (err < 0)
1896                 goto errout;
1897
1898         err = -EINVAL;
1899         rtm = nlmsg_data(nlh);
1900         memset(cfg, 0, sizeof(*cfg));
1901
1902         cfg->fc_table = rtm->rtm_table;
1903         cfg->fc_dst_len = rtm->rtm_dst_len;
1904         cfg->fc_src_len = rtm->rtm_src_len;
1905         cfg->fc_flags = RTF_UP;
1906         cfg->fc_protocol = rtm->rtm_protocol;
1907
1908         if (rtm->rtm_type == RTN_UNREACHABLE)
1909                 cfg->fc_flags |= RTF_REJECT;
1910
1911         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1912         cfg->fc_nlinfo.nlh = nlh;
1913
1914         if (tb[RTA_GATEWAY]) {
1915                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1916                 cfg->fc_flags |= RTF_GATEWAY;
1917         }
1918
1919         if (tb[RTA_DST]) {
1920                 int plen = (rtm->rtm_dst_len + 7) >> 3;
1921
1922                 if (nla_len(tb[RTA_DST]) < plen)
1923                         goto errout;
1924
1925                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1926         }
1927
1928         if (tb[RTA_SRC]) {
1929                 int plen = (rtm->rtm_src_len + 7) >> 3;
1930
1931                 if (nla_len(tb[RTA_SRC]) < plen)
1932                         goto errout;
1933
1934                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1935         }
1936
1937         if (tb[RTA_OIF])
1938                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1939
1940         if (tb[RTA_PRIORITY])
1941                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
1942
1943         if (tb[RTA_METRICS]) {
1944                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
1945                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
1946         }
1947
1948         if (tb[RTA_TABLE])
1949                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
1950
1951         err = 0;
1952 errout:
1953         return err;
1954 }
1955
1956 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1957 {
1958         struct fib6_config cfg;
1959         int err;
1960
1961         err = rtm_to_fib6_config(skb, nlh, &cfg);
1962         if (err < 0)
1963                 return err;
1964
1965         return ip6_route_del(&cfg);
1966 }
1967
1968 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1969 {
1970         struct fib6_config cfg;
1971         int err;
1972
1973         err = rtm_to_fib6_config(skb, nlh, &cfg);
1974         if (err < 0)
1975                 return err;
1976
1977         return ip6_route_add(&cfg);
1978 }
1979
1980 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1981                          struct in6_addr *dst, struct in6_addr *src,
1982                          int iif, int type, u32 pid, u32 seq,
1983                          int prefix, unsigned int flags)
1984 {
1985         struct rtmsg *rtm;
1986         struct nlmsghdr *nlh;
1987         struct rta_cacheinfo ci;
1988         u32 table;
1989
1990         if (prefix) {   /* user wants prefix routes only */
1991                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1992                         /* success since this is not a prefix route */
1993                         return 1;
1994                 }
1995         }
1996
1997         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
1998         if (nlh == NULL)
1999                 return -ENOBUFS;
2000
2001         rtm = nlmsg_data(nlh);
2002         rtm->rtm_family = AF_INET6;
2003         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2004         rtm->rtm_src_len = rt->rt6i_src.plen;
2005         rtm->rtm_tos = 0;
2006         if (rt->rt6i_table)
2007                 table = rt->rt6i_table->tb6_id;
2008         else
2009                 table = RT6_TABLE_UNSPEC;
2010         rtm->rtm_table = table;
2011         NLA_PUT_U32(skb, RTA_TABLE, table);
2012         if (rt->rt6i_flags&RTF_REJECT)
2013                 rtm->rtm_type = RTN_UNREACHABLE;
2014         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2015                 rtm->rtm_type = RTN_LOCAL;
2016         else
2017                 rtm->rtm_type = RTN_UNICAST;
2018         rtm->rtm_flags = 0;
2019         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2020         rtm->rtm_protocol = rt->rt6i_protocol;
2021         if (rt->rt6i_flags&RTF_DYNAMIC)
2022                 rtm->rtm_protocol = RTPROT_REDIRECT;
2023         else if (rt->rt6i_flags & RTF_ADDRCONF)
2024                 rtm->rtm_protocol = RTPROT_KERNEL;
2025         else if (rt->rt6i_flags&RTF_DEFAULT)
2026                 rtm->rtm_protocol = RTPROT_RA;
2027
2028         if (rt->rt6i_flags&RTF_CACHE)
2029                 rtm->rtm_flags |= RTM_F_CLONED;
2030
2031         if (dst) {
2032                 NLA_PUT(skb, RTA_DST, 16, dst);
2033                 rtm->rtm_dst_len = 128;
2034         } else if (rtm->rtm_dst_len)
2035                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2036 #ifdef CONFIG_IPV6_SUBTREES
2037         if (src) {
2038                 NLA_PUT(skb, RTA_SRC, 16, src);
2039                 rtm->rtm_src_len = 128;
2040         } else if (rtm->rtm_src_len)
2041                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2042 #endif
2043         if (iif)
2044                 NLA_PUT_U32(skb, RTA_IIF, iif);
2045         else if (dst) {
2046                 struct in6_addr saddr_buf;
2047                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
2048                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2049         }
2050
2051         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2052                 goto nla_put_failure;
2053
2054         if (rt->u.dst.neighbour)
2055                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2056
2057         if (rt->u.dst.dev)
2058                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2059
2060         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2061         ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2062         if (rt->rt6i_expires)
2063                 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
2064         else
2065                 ci.rta_expires = 0;
2066         ci.rta_used = rt->u.dst.__use;
2067         ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2068         ci.rta_error = rt->u.dst.error;
2069         ci.rta_id = 0;
2070         ci.rta_ts = 0;
2071         ci.rta_tsage = 0;
2072         NLA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2073
2074         return nlmsg_end(skb, nlh);
2075
2076 nla_put_failure:
2077         return nlmsg_cancel(skb, nlh);
2078 }
2079
2080 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2081 {
2082         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2083         int prefix;
2084
2085         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2086                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2087                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2088         } else
2089                 prefix = 0;
2090
2091         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2092                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2093                      prefix, NLM_F_MULTI);
2094 }
2095
2096 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2097 {
2098         struct nlattr *tb[RTA_MAX+1];
2099         struct rt6_info *rt;
2100         struct sk_buff *skb;
2101         struct rtmsg *rtm;
2102         struct flowi fl;
2103         int err, iif = 0;
2104
2105         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2106         if (err < 0)
2107                 goto errout;
2108
2109         err = -EINVAL;
2110         memset(&fl, 0, sizeof(fl));
2111
2112         if (tb[RTA_SRC]) {
2113                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2114                         goto errout;
2115
2116                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2117         }
2118
2119         if (tb[RTA_DST]) {
2120                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2121                         goto errout;
2122
2123                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2124         }
2125
2126         if (tb[RTA_IIF])
2127                 iif = nla_get_u32(tb[RTA_IIF]);
2128
2129         if (tb[RTA_OIF])
2130                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2131
2132         if (iif) {
2133                 struct net_device *dev;
2134                 dev = __dev_get_by_index(iif);
2135                 if (!dev) {
2136                         err = -ENODEV;
2137                         goto errout;
2138                 }
2139         }
2140
2141         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2142         if (skb == NULL) {
2143                 err = -ENOBUFS;
2144                 goto errout;
2145         }
2146
2147         /* Reserve room for dummy headers, this skb can pass
2148            through good chunk of routing engine.
2149          */
2150         skb->mac.raw = skb->data;
2151         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2152
2153         rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2154         skb->dst = &rt->u.dst;
2155
2156         err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2157                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2158                             nlh->nlmsg_seq, 0, 0);
2159         if (err < 0) {
2160                 kfree_skb(skb);
2161                 goto errout;
2162         }
2163
2164         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2165 errout:
2166         return err;
2167 }
2168
2169 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2170 {
2171         struct sk_buff *skb;
2172         u32 pid = 0, seq = 0;
2173         struct nlmsghdr *nlh = NULL;
2174         int payload = sizeof(struct rtmsg) + 256;
2175         int err = -ENOBUFS;
2176
2177         if (info) {
2178                 pid = info->pid;
2179                 nlh = info->nlh;
2180                 if (nlh)
2181                         seq = nlh->nlmsg_seq;
2182         }
2183
2184         skb = nlmsg_new(nlmsg_total_size(payload), gfp_any());
2185         if (skb == NULL)
2186                 goto errout;
2187
2188         err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2189         if (err < 0) {
2190                 kfree_skb(skb);
2191                 goto errout;
2192         }
2193
2194         err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2195 errout:
2196         if (err < 0)
2197                 rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2198 }
2199
2200 /*
2201  *      /proc
2202  */
2203
2204 #ifdef CONFIG_PROC_FS
2205
/* Assumed size in bytes of one route record; used to turn a byte offset into a record count. */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* State shared between rt6_proc_info() and its per-route callback. */
struct rt6_proc_arg
{
	char *buffer;	/* output buffer supplied by the reader */
	int offset;	/* byte offset requested by the reader */
	int length;	/* number of bytes requested by the reader */
	int skip;	/* records skipped so far to reach offset */
	int len;	/* bytes written to buffer so far */
};
2216
2217 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2218 {
2219         struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2220         int i;
2221
2222         if (arg->skip < arg->offset / RT6_INFO_LEN) {
2223                 arg->skip++;
2224                 return 0;
2225         }
2226
2227         if (arg->len >= arg->length)
2228                 return 0;
2229
2230         for (i=0; i<16; i++) {
2231                 sprintf(arg->buffer + arg->len, "%02x",
2232                         rt->rt6i_dst.addr.s6_addr[i]);
2233                 arg->len += 2;
2234         }
2235         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2236                             rt->rt6i_dst.plen);
2237
2238 #ifdef CONFIG_IPV6_SUBTREES
2239         for (i=0; i<16; i++) {
2240                 sprintf(arg->buffer + arg->len, "%02x",
2241                         rt->rt6i_src.addr.s6_addr[i]);
2242                 arg->len += 2;
2243         }
2244         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2245                             rt->rt6i_src.plen);
2246 #else
2247         sprintf(arg->buffer + arg->len,
2248                 "00000000000000000000000000000000 00 ");
2249         arg->len += 36;
2250 #endif
2251
2252         if (rt->rt6i_nexthop) {
2253                 for (i=0; i<16; i++) {
2254                         sprintf(arg->buffer + arg->len, "%02x",
2255                                 rt->rt6i_nexthop->primary_key[i]);
2256                         arg->len += 2;
2257                 }
2258         } else {
2259                 sprintf(arg->buffer + arg->len,
2260                         "00000000000000000000000000000000");
2261                 arg->len += 32;
2262         }
2263         arg->len += sprintf(arg->buffer + arg->len,
2264                             " %08x %08x %08x %08x %8s\n",
2265                             rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2266                             rt->u.dst.__use, rt->rt6i_flags, 
2267                             rt->rt6i_dev ? rt->rt6i_dev->name : "");
2268         return 0;
2269 }
2270
2271 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2272 {
2273         struct rt6_proc_arg arg = {
2274                 .buffer = buffer,
2275                 .offset = offset,
2276                 .length = length,
2277         };
2278
2279         fib6_clean_all(rt6_info_route, 0, &arg);
2280
2281         *start = buffer;
2282         if (offset)
2283                 *start += offset % RT6_INFO_LEN;
2284
2285         arg.len -= offset % RT6_INFO_LEN;
2286
2287         if (arg.len > length)
2288                 arg.len = length;
2289         if (arg.len < 0)
2290                 arg.len = 0;
2291
2292         return arg.len;
2293 }
2294
2295 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2296 {
2297         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2298                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2299                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2300                       rt6_stats.fib_rt_cache,
2301                       atomic_read(&ip6_dst_ops.entries),
2302                       rt6_stats.fib_discarded_routes);
2303
2304         return 0;
2305 }
2306
2307 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2308 {
2309         return single_open(file, rt6_stats_seq_show, NULL);
2310 }
2311
2312 static struct file_operations rt6_stats_seq_fops = {
2313         .owner   = THIS_MODULE,
2314         .open    = rt6_stats_seq_open,
2315         .read    = seq_read,
2316         .llseek  = seq_lseek,
2317         .release = single_release,
2318 };
2319 #endif  /* CONFIG_PROC_FS */
2320
2321 #ifdef CONFIG_SYSCTL
2322
2323 static int flush_delay;
2324
2325 static
2326 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2327                               void __user *buffer, size_t *lenp, loff_t *ppos)
2328 {
2329         if (write) {
2330                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2331                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2332                 return 0;
2333         } else
2334                 return -EINVAL;
2335 }
2336
2337 ctl_table ipv6_route_table[] = {
2338         {
2339                 .ctl_name       =       NET_IPV6_ROUTE_FLUSH, 
2340                 .procname       =       "flush",
2341                 .data           =       &flush_delay,
2342                 .maxlen         =       sizeof(int),
2343                 .mode           =       0200,
2344                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2345         },
2346         {
2347                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2348                 .procname       =       "gc_thresh",
2349                 .data           =       &ip6_dst_ops.gc_thresh,
2350                 .maxlen         =       sizeof(int),
2351                 .mode           =       0644,
2352                 .proc_handler   =       &proc_dointvec,
2353         },
2354         {
2355                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2356                 .procname       =       "max_size",
2357                 .data           =       &ip6_rt_max_size,
2358                 .maxlen         =       sizeof(int),
2359                 .mode           =       0644,
2360                 .proc_handler   =       &proc_dointvec,
2361         },
2362         {
2363                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2364                 .procname       =       "gc_min_interval",
2365                 .data           =       &ip6_rt_gc_min_interval,
2366                 .maxlen         =       sizeof(int),
2367                 .mode           =       0644,
2368                 .proc_handler   =       &proc_dointvec_jiffies,
2369                 .strategy       =       &sysctl_jiffies,
2370         },
2371         {
2372                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2373                 .procname       =       "gc_timeout",
2374                 .data           =       &ip6_rt_gc_timeout,
2375                 .maxlen         =       sizeof(int),
2376                 .mode           =       0644,
2377                 .proc_handler   =       &proc_dointvec_jiffies,
2378                 .strategy       =       &sysctl_jiffies,
2379         },
2380         {
2381                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2382                 .procname       =       "gc_interval",
2383                 .data           =       &ip6_rt_gc_interval,
2384                 .maxlen         =       sizeof(int),
2385                 .mode           =       0644,
2386                 .proc_handler   =       &proc_dointvec_jiffies,
2387                 .strategy       =       &sysctl_jiffies,
2388         },
2389         {
2390                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2391                 .procname       =       "gc_elasticity",
2392                 .data           =       &ip6_rt_gc_elasticity,
2393                 .maxlen         =       sizeof(int),
2394                 .mode           =       0644,
2395                 .proc_handler   =       &proc_dointvec_jiffies,
2396                 .strategy       =       &sysctl_jiffies,
2397         },
2398         {
2399                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2400                 .procname       =       "mtu_expires",
2401                 .data           =       &ip6_rt_mtu_expires,
2402                 .maxlen         =       sizeof(int),
2403                 .mode           =       0644,
2404                 .proc_handler   =       &proc_dointvec_jiffies,
2405                 .strategy       =       &sysctl_jiffies,
2406         },
2407         {
2408                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2409                 .procname       =       "min_adv_mss",
2410                 .data           =       &ip6_rt_min_advmss,
2411                 .maxlen         =       sizeof(int),
2412                 .mode           =       0644,
2413                 .proc_handler   =       &proc_dointvec_jiffies,
2414                 .strategy       =       &sysctl_jiffies,
2415         },
2416         {
2417                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2418                 .procname       =       "gc_min_interval_ms",
2419                 .data           =       &ip6_rt_gc_min_interval,
2420                 .maxlen         =       sizeof(int),
2421                 .mode           =       0644,
2422                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2423                 .strategy       =       &sysctl_ms_jiffies,
2424         },
2425         { .ctl_name = 0 }
2426 };
2427
2428 #endif
2429
2430 void __init ip6_route_init(void)
2431 {
2432         struct proc_dir_entry *p;
2433
2434         ip6_dst_ops.kmem_cachep =
2435                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2436                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
2437         fib6_init();
2438 #ifdef  CONFIG_PROC_FS
2439         p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2440         if (p)
2441                 p->owner = THIS_MODULE;
2442
2443         proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2444 #endif
2445 #ifdef CONFIG_XFRM
2446         xfrm6_init();
2447 #endif
2448 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2449         fib6_rules_init();
2450 #endif
2451 }
2452
2453 void ip6_route_cleanup(void)
2454 {
2455 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2456         fib6_rules_cleanup();
2457 #endif
2458 #ifdef CONFIG_PROC_FS
2459         proc_net_remove("ipv6_route");
2460         proc_net_remove("rt6_stats");
2461 #endif
2462 #ifdef CONFIG_XFRM
2463         xfrm6_fini();
2464 #endif
2465         rt6_ifdown(NULL);
2466         fib6_gc_cleanup();
2467         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2468 }