]> bbs.cooldavid.org Git - net-next-2.6.git/blob - net/ipv6/route.c
net: use the macros defined for the members of flowi
[net-next-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
64 /* Debug verbosity: set to 3 (or higher) to make RDBG()/RT6_TRACE() emit output. */
65 #define RT6_DEBUG 2
66
/* Below level 3 both tracing macros expand to nothing. */
67 #if RT6_DEBUG >= 3
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else
71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif
74
/*
 * Compile-time switch tested by ip6_pol_route(): when non-zero, routes that
 * do not take the copy-on-write path are cloned with rt6_alloc_clone();
 * when zero they are returned as they are.
 */
75 #define CLONE_OFFLINK_ROUTE 0
76
/*
 * Forward declarations of file-local helpers that are referenced (for
 * instance by the dst_ops and rt6_info templates below) before their
 * definitions appear.
 */
77 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void             ip6_dst_destroy(struct dst_entry *);
81 static void             ip6_dst_ifdown(struct dst_entry *,
82                                        struct net_device *dev, int how);
83 static int               ip6_dst_gc(struct dst_ops *ops);
84
85 static int              ip6_pkt_discard(struct sk_buff *skb);
86 static int              ip6_pkt_discard_out(struct sk_buff *skb);
87 static void             ip6_link_failure(struct sk_buff *skb);
88 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89
/* Route-information helpers, only built with CONFIG_IPV6_ROUTE_INFO; used by rt6_route_rcv(). */
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92                                            struct in6_addr *prefix, int prefixlen,
93                                            struct in6_addr *gwaddr, int ifindex,
94                                            unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96                                            struct in6_addr *prefix, int prefixlen,
97                                            struct in6_addr *gwaddr, int ifindex);
98 #endif
99
100 static struct dst_ops ip6_dst_ops_template = {
101         .family                 =       AF_INET6,
102         .protocol               =       cpu_to_be16(ETH_P_IPV6),
103         .gc                     =       ip6_dst_gc,
104         .gc_thresh              =       1024,
105         .check                  =       ip6_dst_check,
106         .destroy                =       ip6_dst_destroy,
107         .ifdown                 =       ip6_dst_ifdown,
108         .negative_advice        =       ip6_negative_advice,
109         .link_failure           =       ip6_link_failure,
110         .update_pmtu            =       ip6_rt_update_pmtu,
111         .local_out              =       __ip6_local_out,
112 };
113
114 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
115 {
116 }
117
118 static struct dst_ops ip6_dst_blackhole_ops = {
119         .family                 =       AF_INET6,
120         .protocol               =       cpu_to_be16(ETH_P_IPV6),
121         .destroy                =       ip6_dst_destroy,
122         .check                  =       ip6_dst_check,
123         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
124 };
125
126 static struct rt6_info ip6_null_entry_template = {
127         .dst = {
128                 .__refcnt       = ATOMIC_INIT(1),
129                 .__use          = 1,
130                 .obsolete       = -1,
131                 .error          = -ENETUNREACH,
132                 .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
133                 .input          = ip6_pkt_discard,
134                 .output         = ip6_pkt_discard_out,
135         },
136         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
137         .rt6i_protocol  = RTPROT_KERNEL,
138         .rt6i_metric    = ~(u32) 0,
139         .rt6i_ref       = ATOMIC_INIT(1),
140 };
141
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Handlers installed by the prohibit template; defined later in the file. */
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template of the "prohibit" route: reject route reporting -EACCES and
 * dispatching packets to the ip6_pkt_prohibit*() handlers.
 */
static struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EACCES,
		.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

/*
 * Template of the "blackhole" route: reject route reporting -EINVAL whose
 * input and output hooks are both dst_discard.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EINVAL,
		.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
		.input		= dst_discard,
		.output		= dst_discard,
	},
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif
180
/* Allocate a dst entry from @ops and return it typed as an rt6_info. */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	struct rt6_info *rt = (struct rt6_info *)dst_alloc(ops);

	return rt;
}
186
187 static void ip6_dst_destroy(struct dst_entry *dst)
188 {
189         struct rt6_info *rt = (struct rt6_info *)dst;
190         struct inet6_dev *idev = rt->rt6i_idev;
191
192         if (idev != NULL) {
193                 rt->rt6i_idev = NULL;
194                 in6_dev_put(idev);
195         }
196 }
197
198 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
199                            int how)
200 {
201         struct rt6_info *rt = (struct rt6_info *)dst;
202         struct inet6_dev *idev = rt->rt6i_idev;
203         struct net_device *loopback_dev =
204                 dev_net(dev)->loopback_dev;
205
206         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
207                 struct inet6_dev *loopback_idev =
208                         in6_dev_get(loopback_dev);
209                 if (loopback_idev != NULL) {
210                         rt->rt6i_idev = loopback_idev;
211                         in6_dev_put(idev);
212                 }
213         }
214 }
215
216 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
217 {
218         return (rt->rt6i_flags & RTF_EXPIRES) &&
219                 time_after(jiffies, rt->rt6i_expires);
220 }
221
222 static inline int rt6_need_strict(struct in6_addr *daddr)
223 {
224         return ipv6_addr_type(daddr) &
225                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
226 }
227
228 /*
229  *      Route lookup. Any table->tb6_lock is implied.
230  */
231
232 static inline struct rt6_info *rt6_device_match(struct net *net,
233                                                     struct rt6_info *rt,
234                                                     struct in6_addr *saddr,
235                                                     int oif,
236                                                     int flags)
237 {
238         struct rt6_info *local = NULL;
239         struct rt6_info *sprt;
240
241         if (!oif && ipv6_addr_any(saddr))
242                 goto out;
243
244         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
245                 struct net_device *dev = sprt->rt6i_dev;
246
247                 if (oif) {
248                         if (dev->ifindex == oif)
249                                 return sprt;
250                         if (dev->flags & IFF_LOOPBACK) {
251                                 if (sprt->rt6i_idev == NULL ||
252                                     sprt->rt6i_idev->dev->ifindex != oif) {
253                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
254                                                 continue;
255                                         if (local && (!oif ||
256                                                       local->rt6i_idev->dev->ifindex == oif))
257                                                 continue;
258                                 }
259                                 local = sprt;
260                         }
261                 } else {
262                         if (ipv6_chk_addr(net, saddr, dev,
263                                           flags & RT6_LOOKUP_F_IFACE))
264                                 return sprt;
265                 }
266         }
267
268         if (oif) {
269                 if (local)
270                         return local;
271
272                 if (flags & RT6_LOOKUP_F_IFACE)
273                         return net->ipv6.ip6_null_entry;
274         }
275 out:
276         return rt;
277 }
278
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a Neighbour Solicitation towards the
 * next hop of @rt when its neighbour entry is not in a NUD_VALID state and
 * the last update is older than the device's rtr_probe_interval.
 *
 * The interval test is what rate-limits the probes (Router Reachability
 * Probes MUST be rate-limited to no more than one per minute). A NULL @rt
 * or a route without a next hop is ignored.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;

	/* Cheap unlocked pre-check. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	read_lock_bh(&neigh->lock);
	if ((neigh->nud_state & NUD_VALID) ||
	    !time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		read_unlock_bh(&neigh->lock);
		return;
	}
	/* Stamp the entry so the next probe waits a full interval. */
	neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	/* Solicit the neighbour via its solicited-node multicast address. */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
313
314 /*
315  * Default Router Selection (RFC 2461 6.3.6)
316  */
317 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
318 {
319         struct net_device *dev = rt->rt6i_dev;
320         if (!oif || dev->ifindex == oif)
321                 return 2;
322         if ((dev->flags & IFF_LOOPBACK) &&
323             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
324                 return 1;
325         return 0;
326 }
327
328 static inline int rt6_check_neigh(struct rt6_info *rt)
329 {
330         struct neighbour *neigh = rt->rt6i_nexthop;
331         int m;
332         if (rt->rt6i_flags & RTF_NONEXTHOP ||
333             !(rt->rt6i_flags & RTF_GATEWAY))
334                 m = 1;
335         else if (neigh) {
336                 read_lock_bh(&neigh->lock);
337                 if (neigh->nud_state & NUD_VALID)
338                         m = 2;
339 #ifdef CONFIG_IPV6_ROUTER_PREF
340                 else if (neigh->nud_state & NUD_FAILED)
341                         m = 0;
342 #endif
343                 else
344                         m = 1;
345                 read_unlock_bh(&neigh->lock);
346         } else
347                 m = 0;
348         return m;
349 }
350
351 static int rt6_score_route(struct rt6_info *rt, int oif,
352                            int strict)
353 {
354         int m, n;
355
356         m = rt6_check_dev(rt, oif);
357         if (!m && (strict & RT6_LOOKUP_F_IFACE))
358                 return -1;
359 #ifdef CONFIG_IPV6_ROUTER_PREF
360         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
361 #endif
362         n = rt6_check_neigh(rt);
363         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
364                 return -1;
365         return m;
366 }
367
368 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
369                                    int *mpri, struct rt6_info *match)
370 {
371         int m;
372
373         if (rt6_check_expired(rt))
374                 goto out;
375
376         m = rt6_score_route(rt, oif, strict);
377         if (m < 0)
378                 goto out;
379
380         if (m > *mpri) {
381                 if (strict & RT6_LOOKUP_F_REACHABLE)
382                         rt6_probe(match);
383                 *mpri = m;
384                 match = rt;
385         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
386                 rt6_probe(rt);
387         }
388
389 out:
390         return match;
391 }
392
393 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
394                                      struct rt6_info *rr_head,
395                                      u32 metric, int oif, int strict)
396 {
397         struct rt6_info *rt, *match;
398         int mpri = -1;
399
400         match = NULL;
401         for (rt = rr_head; rt && rt->rt6i_metric == metric;
402              rt = rt->dst.rt6_next)
403                 match = find_match(rt, oif, strict, &mpri, match);
404         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
405              rt = rt->dst.rt6_next)
406                 match = find_match(rt, oif, strict, &mpri, match);
407
408         return match;
409 }
410
411 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
412 {
413         struct rt6_info *match, *rt0;
414         struct net *net;
415
416         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
417                   __func__, fn->leaf, oif);
418
419         rt0 = fn->rr_ptr;
420         if (!rt0)
421                 fn->rr_ptr = rt0 = fn->leaf;
422
423         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
424
425         if (!match &&
426             (strict & RT6_LOOKUP_F_REACHABLE)) {
427                 struct rt6_info *next = rt0->dst.rt6_next;
428
429                 /* no entries matched; do round-robin */
430                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
431                         next = fn->leaf;
432
433                 if (next != rt0)
434                         fn->rr_ptr = next;
435         }
436
437         RT6_TRACE("%s() => %p\n",
438                   __func__, match);
439
440         net = dev_net(rt0->rt6i_dev);
441         return match ? match : net->ipv6.ip6_null_entry;
442 }
443
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process one Route Information option (RFC 4191).
 *
 * @dev:    device the option was received on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router, used as gateway
 *
 * An existing route for (prefix, gateway, device) is deleted when the
 * advertised lifetime is 0. Otherwise a route is created if none exists,
 * or the preference bits of the existing one are refreshed. In both cases
 * the expiry is then set from the lifetime, or cleared when the lifetime
 * is not finite.
 *
 * Returns 0 on success and -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/*
	 * Sanity check for prefix_len and length (RFC 4191, section 2.3).
	 * Length counts 8-octet units including the 8-octet fixed part, so
	 * the option carries (length - 1) * 8 prefix octets:
	 *   prefix_len > 64  needs 16 prefix octets -> length must be 3
	 *   prefix_len > 0   needs  8 prefix octets -> length must be 2 or 3
	 * Accepting a shorter option would make the prefix copy below read
	 * octets that are not part of the option.
	 */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 3) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/*
		 * Fewer than 16 prefix octets are present; build the prefix
		 * in a local buffer. The length check above guarantees that
		 * prefix_len bits are available in the option.
		 */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* A zero lifetime withdraws the route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime)) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		/* Drop the reference obtained from the get/add helper. */
		dst_release(&rt->dst);
	}
	return 0;
}
#endif
517
/*
 * BACKTRACK(__net, saddr) - climb the fib6 tree after a failed selection.
 *
 * Expands inside a lookup function and relies on that function providing
 * the locals `rt` and `fn` and the labels `out` and `restart`.
 *
 * Does nothing unless `rt` is the namespace's null entry. Otherwise it
 * loops: if `fn` is flagged RTN_TL_ROOT it jumps to `out`; else it moves to
 * the parent node or, when the parent has a subtree that is not `fn`
 * itself, to the node found by looking `saddr` up in that subtree. As soon
 * as the new `fn` has RTN_RTINFO set it jumps to `restart`.
 */
518 #define BACKTRACK(__net, saddr)                 \
519 do { \
520         if (rt == __net->ipv6.ip6_null_entry) { \
521                 struct fib6_node *pn; \
522                 while (1) { \
523                         if (fn->fn_flags & RTN_TL_ROOT) \
524                                 goto out; \
525                         pn = fn->parent; \
526                         if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
527                                 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
528                         else \
529                                 fn = pn; \
530                         if (fn->fn_flags & RTN_RTINFO) \
531                                 goto restart; \
532                 } \
533         } \
534 } while(0)
535
536 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
537                                              struct fib6_table *table,
538                                              struct flowi *fl, int flags)
539 {
540         struct fib6_node *fn;
541         struct rt6_info *rt;
542
543         read_lock_bh(&table->tb6_lock);
544         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
545 restart:
546         rt = fn->leaf;
547         rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
548         BACKTRACK(net, &fl->fl6_src);
549 out:
550         dst_use(&rt->dst, jiffies);
551         read_unlock_bh(&table->tb6_lock);
552         return rt;
553
554 }
555
556 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
557                             const struct in6_addr *saddr, int oif, int strict)
558 {
559         struct flowi fl = {
560                 .oif = oif,
561                 .fl6_dst = *daddr,
562         };
563         struct dst_entry *dst;
564         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
565
566         if (saddr) {
567                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
568                 flags |= RT6_LOOKUP_F_HAS_SADDR;
569         }
570
571         dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
572         if (dst->error == 0)
573                 return (struct rt6_info *) dst;
574
575         dst_release(dst);
576
577         return NULL;
578 }
579
580 EXPORT_SYMBOL(rt6_lookup);
581
582 /* ip6_ins_rt is called with FREE table->tb6_lock.
583    It takes new route entry, the addition fails by any reason the
584    route is freed. In any case, if caller does not hold it, it may
585    be destroyed.
586  */
587
588 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
589 {
590         int err;
591         struct fib6_table *table;
592
593         table = rt->rt6i_table;
594         write_lock_bh(&table->tb6_lock);
595         err = fib6_add(&table->tb6_root, rt, info);
596         write_unlock_bh(&table->tb6_lock);
597
598         return err;
599 }
600
601 int ip6_ins_rt(struct rt6_info *rt)
602 {
603         struct nl_info info = {
604                 .nl_net = dev_net(rt->rt6i_dev),
605         };
606         return __ip6_ins_rt(rt, &info);
607 }
608
609 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
610                                       struct in6_addr *saddr)
611 {
612         struct rt6_info *rt;
613
614         /*
615          *      Clone the route.
616          */
617
618         rt = ip6_rt_copy(ort);
619
620         if (rt) {
621                 struct neighbour *neigh;
622                 int attempts = !in_softirq();
623
624                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
625                         if (rt->rt6i_dst.plen != 128 &&
626                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
627                                 rt->rt6i_flags |= RTF_ANYCAST;
628                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
629                 }
630
631                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
632                 rt->rt6i_dst.plen = 128;
633                 rt->rt6i_flags |= RTF_CACHE;
634                 rt->dst.flags |= DST_HOST;
635
636 #ifdef CONFIG_IPV6_SUBTREES
637                 if (rt->rt6i_src.plen && saddr) {
638                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
639                         rt->rt6i_src.plen = 128;
640                 }
641 #endif
642
643         retry:
644                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
645                 if (IS_ERR(neigh)) {
646                         struct net *net = dev_net(rt->rt6i_dev);
647                         int saved_rt_min_interval =
648                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
649                         int saved_rt_elasticity =
650                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
651
652                         if (attempts-- > 0) {
653                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
654                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
655
656                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
657
658                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
659                                         saved_rt_elasticity;
660                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
661                                         saved_rt_min_interval;
662                                 goto retry;
663                         }
664
665                         if (net_ratelimit())
666                                 printk(KERN_WARNING
667                                        "ipv6: Neighbour table overflow.\n");
668                         dst_free(&rt->dst);
669                         return NULL;
670                 }
671                 rt->rt6i_nexthop = neigh;
672
673         }
674
675         return rt;
676 }
677
678 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
679 {
680         struct rt6_info *rt = ip6_rt_copy(ort);
681         if (rt) {
682                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
683                 rt->rt6i_dst.plen = 128;
684                 rt->rt6i_flags |= RTF_CACHE;
685                 rt->dst.flags |= DST_HOST;
686                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
687         }
688         return rt;
689 }
690
/*
 * Per-table route lookup shared by ip6_pol_route_input() and
 * ip6_pol_route_output(); @oif is the interface index to match.
 *
 * Flow of control:
 *  - under the table read lock, find the fib node for the flow and select a
 *    route with rt6_select(); unless forwarding is enabled for the
 *    namespace, the first pass demands RT6_LOOKUP_F_REACHABLE;
 *  - BACKTRACK() climbs the tree while the selection is the null entry;
 *  - a null entry or an RTF_CACHE route goes to `out`: if the reachability
 *    requirement was still active it is dropped and the lookup is redone,
 *    otherwise a reference is taken and the route is returned;
 *  - any other route is held and the lock released; a route with no next
 *    hop and without RTF_NONEXTHOP is copied via rt6_alloc_cow(), all others
 *    are returned as they are (CLONE_OFFLINK_ROUTE is 0);
 *  - the copy is inserted with ip6_ins_rt(); if the copy or the insertion
 *    fails, the whole lookup is retried, at most 3 attempts in total.
 *
 * The returned route carries a reference for the caller and has lastuse
 * and __use updated.
 */
691 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
692                                       struct flowi *fl, int flags)
693 {
694         struct fib6_node *fn;
695         struct rt6_info *rt, *nrt;
696         int strict = 0;
697         int attempts = 3;
698         int err;
        /* Require a reachable next hop on the first pass unless forwarding is on. */
699         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
700
701         strict |= flags & RT6_LOOKUP_F_IFACE;
702
703 relookup:
704         read_lock_bh(&table->tb6_lock);
705
706 restart_2:
707         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
708
709 restart:
710         rt = rt6_select(fn, oif, strict | reachable);
711
712         BACKTRACK(net, &fl->fl6_src);
713         if (rt == net->ipv6.ip6_null_entry ||
714             rt->rt6i_flags & RTF_CACHE)
715                 goto out;
716
        /* Pin the route before the table lock is dropped. */
717         dst_hold(&rt->dst);
718         read_unlock_bh(&table->tb6_lock);
719
720         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
721                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
722         else {
723 #if CLONE_OFFLINK_ROUTE
724                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
725 #else
726                 goto out2;
727 #endif
728         }
729
        /* Swap the reference from the original route to the copy (or the null entry). */
730         dst_release(&rt->dst);
731         rt = nrt ? : net->ipv6.ip6_null_entry;
732
733         dst_hold(&rt->dst);
734         if (nrt) {
735                 err = ip6_ins_rt(nrt);
736                 if (!err)
737                         goto out2;
738         }
739
740         if (--attempts <= 0)
741                 goto out2;
742
743         /*
744          * Race condition! In the gap, when table->tb6_lock was
745          * released someone could insert this route.  Relookup.
746          */
747         dst_release(&rt->dst);
748         goto relookup;
749
750 out:
        /* First pass failed with the reachability requirement: retry without it. */
751         if (reachable) {
752                 reachable = 0;
753                 goto restart_2;
754         }
755         dst_hold(&rt->dst);
756         read_unlock_bh(&table->tb6_lock);
757 out2:
758         rt->dst.lastuse = jiffies;
759         rt->dst.__use++;
760
761         return rt;
762 }
763
764 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
765                                             struct flowi *fl, int flags)
766 {
767         return ip6_pol_route(net, table, fl->iif, fl, flags);
768 }
769
770 void ip6_route_input(struct sk_buff *skb)
771 {
772         struct ipv6hdr *iph = ipv6_hdr(skb);
773         struct net *net = dev_net(skb->dev);
774         int flags = RT6_LOOKUP_F_HAS_SADDR;
775         struct flowi fl = {
776                 .iif = skb->dev->ifindex,
777                 .fl6_dst = iph->daddr,
778                 .fl6_src = iph->saddr,
779                 .fl6_flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
780                 .mark = skb->mark,
781                 .proto = iph->nexthdr,
782         };
783
784         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
785                 flags |= RT6_LOOKUP_F_IFACE;
786
787         skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
788 }
789
790 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
791                                              struct flowi *fl, int flags)
792 {
793         return ip6_pol_route(net, table, fl->oif, fl, flags);
794 }
795
796 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
797                                     struct flowi *fl)
798 {
799         int flags = 0;
800
801         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
802                 flags |= RT6_LOOKUP_F_IFACE;
803
804         if (!ipv6_addr_any(&fl->fl6_src))
805                 flags |= RT6_LOOKUP_F_HAS_SADDR;
806         else if (sk)
807                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
808
809         return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
810 }
811
812 EXPORT_SYMBOL(ip6_route_output);
813
814 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
815 {
816         struct rt6_info *ort = (struct rt6_info *) *dstp;
817         struct rt6_info *rt = (struct rt6_info *)
818                 dst_alloc(&ip6_dst_blackhole_ops);
819         struct dst_entry *new = NULL;
820
821         if (rt) {
822                 new = &rt->dst;
823
824                 atomic_set(&new->__refcnt, 1);
825                 new->__use = 1;
826                 new->input = dst_discard;
827                 new->output = dst_discard;
828
829                 memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
830                 new->dev = ort->dst.dev;
831                 if (new->dev)
832                         dev_hold(new->dev);
833                 rt->rt6i_idev = ort->rt6i_idev;
834                 if (rt->rt6i_idev)
835                         in6_dev_hold(rt->rt6i_idev);
836                 rt->rt6i_expires = 0;
837
838                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
839                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
840                 rt->rt6i_metric = 0;
841
842                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
843 #ifdef CONFIG_IPV6_SUBTREES
844                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
845 #endif
846
847                 dst_free(new);
848         }
849
850         dst_release(*dstp);
851         *dstp = new;
852         return new ? 0 : -ENOMEM;
853 }
854 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
855
856 /*
857  *      Destination cache support functions
858  */
859
860 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
861 {
862         struct rt6_info *rt;
863
864         rt = (struct rt6_info *) dst;
865
866         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
867                 return dst;
868
869         return NULL;
870 }
871
872 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
873 {
874         struct rt6_info *rt = (struct rt6_info *) dst;
875
876         if (rt) {
877                 if (rt->rt6i_flags & RTF_CACHE) {
878                         if (rt6_check_expired(rt)) {
879                                 ip6_del_rt(rt);
880                                 dst = NULL;
881                         }
882                 } else {
883                         dst_release(dst);
884                         dst = NULL;
885                 }
886         }
887         return dst;
888 }
889
890 static void ip6_link_failure(struct sk_buff *skb)
891 {
892         struct rt6_info *rt;
893
894         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
895
896         rt = (struct rt6_info *) skb_dst(skb);
897         if (rt) {
898                 if (rt->rt6i_flags&RTF_CACHE) {
899                         dst_set_expires(&rt->dst, 0);
900                         rt->rt6i_flags |= RTF_EXPIRES;
901                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
902                         rt->rt6i_node->fn_sernum = -1;
903         }
904 }
905
906 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
907 {
908         struct rt6_info *rt6 = (struct rt6_info*)dst;
909
910         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
911                 rt6->rt6i_flags |= RTF_MODIFIED;
912                 if (mtu < IPV6_MIN_MTU) {
913                         mtu = IPV6_MIN_MTU;
914                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
915                 }
916                 dst->metrics[RTAX_MTU-1] = mtu;
917                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
918         }
919 }
920
921 static int ipv6_get_mtu(struct net_device *dev);
922
923 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
924 {
925         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
926
927         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
928                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
929
930         /*
931          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
932          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
933          * IPV6_MAXPLEN is also valid and means: "any MSS,
934          * rely only on pmtu discovery"
935          */
936         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
937                 mtu = IPV6_MAXPLEN;
938         return mtu;
939 }
940
941 static struct dst_entry *icmp6_dst_gc_list;
942 static DEFINE_SPINLOCK(icmp6_dst_lock);
943
944 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
945                                   struct neighbour *neigh,
946                                   const struct in6_addr *addr)
947 {
948         struct rt6_info *rt;
949         struct inet6_dev *idev = in6_dev_get(dev);
950         struct net *net = dev_net(dev);
951
952         if (unlikely(idev == NULL))
953                 return NULL;
954
955         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
956         if (unlikely(rt == NULL)) {
957                 in6_dev_put(idev);
958                 goto out;
959         }
960
961         dev_hold(dev);
962         if (neigh)
963                 neigh_hold(neigh);
964         else {
965                 neigh = ndisc_get_neigh(dev, addr);
966                 if (IS_ERR(neigh))
967                         neigh = NULL;
968         }
969
970         rt->rt6i_dev      = dev;
971         rt->rt6i_idev     = idev;
972         rt->rt6i_nexthop  = neigh;
973         atomic_set(&rt->dst.__refcnt, 1);
974         rt->dst.metrics[RTAX_HOPLIMIT-1] = 255;
975         rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
976         rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
977         rt->dst.output  = ip6_output;
978
979 #if 0   /* there's no chance to use these for ndisc */
980         rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
981                                 ? DST_HOST
982                                 : 0;
983         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
984         rt->rt6i_dst.plen = 128;
985 #endif
986
987         spin_lock_bh(&icmp6_dst_lock);
988         rt->dst.next = icmp6_dst_gc_list;
989         icmp6_dst_gc_list = &rt->dst;
990         spin_unlock_bh(&icmp6_dst_lock);
991
992         fib6_force_start_gc(net);
993
994 out:
995         return &rt->dst;
996 }
997
998 int icmp6_dst_gc(void)
999 {
1000         struct dst_entry *dst, *next, **pprev;
1001         int more = 0;
1002
1003         next = NULL;
1004
1005         spin_lock_bh(&icmp6_dst_lock);
1006         pprev = &icmp6_dst_gc_list;
1007
1008         while ((dst = *pprev) != NULL) {
1009                 if (!atomic_read(&dst->__refcnt)) {
1010                         *pprev = dst->next;
1011                         dst_free(dst);
1012                 } else {
1013                         pprev = &dst->next;
1014                         ++more;
1015                 }
1016         }
1017
1018         spin_unlock_bh(&icmp6_dst_lock);
1019
1020         return more;
1021 }
1022
1023 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1024                             void *arg)
1025 {
1026         struct dst_entry *dst, **pprev;
1027
1028         spin_lock_bh(&icmp6_dst_lock);
1029         pprev = &icmp6_dst_gc_list;
1030         while ((dst = *pprev) != NULL) {
1031                 struct rt6_info *rt = (struct rt6_info *) dst;
1032                 if (func(rt, arg)) {
1033                         *pprev = dst->next;
1034                         dst_free(dst);
1035                 } else {
1036                         pprev = &dst->next;
1037                 }
1038         }
1039         spin_unlock_bh(&icmp6_dst_lock);
1040 }
1041
1042 static int ip6_dst_gc(struct dst_ops *ops)
1043 {
1044         unsigned long now = jiffies;
1045         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1046         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1047         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1048         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1049         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1050         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1051         int entries;
1052
1053         entries = dst_entries_get_fast(ops);
1054         if (time_after(rt_last_gc + rt_min_interval, now) &&
1055             entries <= rt_max_size)
1056                 goto out;
1057
1058         net->ipv6.ip6_rt_gc_expire++;
1059         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1060         net->ipv6.ip6_rt_last_gc = now;
1061         entries = dst_entries_get_slow(ops);
1062         if (entries < ops->gc_thresh)
1063                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1064 out:
1065         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1066         return entries > rt_max_size;
1067 }
1068
1069 /* Clean host part of a prefix. Not necessary in radix tree,
1070    but results in cleaner routing tables.
1071
1072    Remove it only when all the things will work!
1073  */
1074
1075 static int ipv6_get_mtu(struct net_device *dev)
1076 {
1077         int mtu = IPV6_MIN_MTU;
1078         struct inet6_dev *idev;
1079
1080         rcu_read_lock();
1081         idev = __in6_dev_get(dev);
1082         if (idev)
1083                 mtu = idev->cnf.mtu6;
1084         rcu_read_unlock();
1085         return mtu;
1086 }
1087
1088 int ip6_dst_hoplimit(struct dst_entry *dst)
1089 {
1090         int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1091         if (hoplimit < 0) {
1092                 struct net_device *dev = dst->dev;
1093                 struct inet6_dev *idev;
1094
1095                 rcu_read_lock();
1096                 idev = __in6_dev_get(dev);
1097                 if (idev)
1098                         hoplimit = idev->cnf.hop_limit;
1099                 else
1100                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1101                 rcu_read_unlock();
1102         }
1103         return hoplimit;
1104 }
1105
1106 /*
1107  *
1108  */
1109
1110 int ip6_route_add(struct fib6_config *cfg)
1111 {
1112         int err;
1113         struct net *net = cfg->fc_nlinfo.nl_net;
1114         struct rt6_info *rt = NULL;
1115         struct net_device *dev = NULL;
1116         struct inet6_dev *idev = NULL;
1117         struct fib6_table *table;
1118         int addr_type;
1119
1120         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1121                 return -EINVAL;
1122 #ifndef CONFIG_IPV6_SUBTREES
1123         if (cfg->fc_src_len)
1124                 return -EINVAL;
1125 #endif
1126         if (cfg->fc_ifindex) {
1127                 err = -ENODEV;
1128                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1129                 if (!dev)
1130                         goto out;
1131                 idev = in6_dev_get(dev);
1132                 if (!idev)
1133                         goto out;
1134         }
1135
1136         if (cfg->fc_metric == 0)
1137                 cfg->fc_metric = IP6_RT_PRIO_USER;
1138
1139         table = fib6_new_table(net, cfg->fc_table);
1140         if (table == NULL) {
1141                 err = -ENOBUFS;
1142                 goto out;
1143         }
1144
1145         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1146
1147         if (rt == NULL) {
1148                 err = -ENOMEM;
1149                 goto out;
1150         }
1151
1152         rt->dst.obsolete = -1;
1153         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1154                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1155                                 0;
1156
1157         if (cfg->fc_protocol == RTPROT_UNSPEC)
1158                 cfg->fc_protocol = RTPROT_BOOT;
1159         rt->rt6i_protocol = cfg->fc_protocol;
1160
1161         addr_type = ipv6_addr_type(&cfg->fc_dst);
1162
1163         if (addr_type & IPV6_ADDR_MULTICAST)
1164                 rt->dst.input = ip6_mc_input;
1165         else if (cfg->fc_flags & RTF_LOCAL)
1166                 rt->dst.input = ip6_input;
1167         else
1168                 rt->dst.input = ip6_forward;
1169
1170         rt->dst.output = ip6_output;
1171
1172         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1173         rt->rt6i_dst.plen = cfg->fc_dst_len;
1174         if (rt->rt6i_dst.plen == 128)
1175                rt->dst.flags = DST_HOST;
1176
1177 #ifdef CONFIG_IPV6_SUBTREES
1178         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1179         rt->rt6i_src.plen = cfg->fc_src_len;
1180 #endif
1181
1182         rt->rt6i_metric = cfg->fc_metric;
1183
1184         /* We cannot add true routes via loopback here,
1185            they would result in kernel looping; promote them to reject routes
1186          */
1187         if ((cfg->fc_flags & RTF_REJECT) ||
1188             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1189                                               && !(cfg->fc_flags&RTF_LOCAL))) {
1190                 /* hold loopback dev/idev if we haven't done so. */
1191                 if (dev != net->loopback_dev) {
1192                         if (dev) {
1193                                 dev_put(dev);
1194                                 in6_dev_put(idev);
1195                         }
1196                         dev = net->loopback_dev;
1197                         dev_hold(dev);
1198                         idev = in6_dev_get(dev);
1199                         if (!idev) {
1200                                 err = -ENODEV;
1201                                 goto out;
1202                         }
1203                 }
1204                 rt->dst.output = ip6_pkt_discard_out;
1205                 rt->dst.input = ip6_pkt_discard;
1206                 rt->dst.error = -ENETUNREACH;
1207                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1208                 goto install_route;
1209         }
1210
1211         if (cfg->fc_flags & RTF_GATEWAY) {
1212                 struct in6_addr *gw_addr;
1213                 int gwa_type;
1214
1215                 gw_addr = &cfg->fc_gateway;
1216                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1217                 gwa_type = ipv6_addr_type(gw_addr);
1218
1219                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1220                         struct rt6_info *grt;
1221
1222                         /* IPv6 strictly inhibits using not link-local
1223                            addresses as nexthop address.
1224                            Otherwise, router will not able to send redirects.
1225                            It is very good, but in some (rare!) circumstances
1226                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1227                            some exceptions. --ANK
1228                          */
1229                         err = -EINVAL;
1230                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1231                                 goto out;
1232
1233                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1234
1235                         err = -EHOSTUNREACH;
1236                         if (grt == NULL)
1237                                 goto out;
1238                         if (dev) {
1239                                 if (dev != grt->rt6i_dev) {
1240                                         dst_release(&grt->dst);
1241                                         goto out;
1242                                 }
1243                         } else {
1244                                 dev = grt->rt6i_dev;
1245                                 idev = grt->rt6i_idev;
1246                                 dev_hold(dev);
1247                                 in6_dev_hold(grt->rt6i_idev);
1248                         }
1249                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1250                                 err = 0;
1251                         dst_release(&grt->dst);
1252
1253                         if (err)
1254                                 goto out;
1255                 }
1256                 err = -EINVAL;
1257                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1258                         goto out;
1259         }
1260
1261         err = -ENODEV;
1262         if (dev == NULL)
1263                 goto out;
1264
1265         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1266                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1267                 if (IS_ERR(rt->rt6i_nexthop)) {
1268                         err = PTR_ERR(rt->rt6i_nexthop);
1269                         rt->rt6i_nexthop = NULL;
1270                         goto out;
1271                 }
1272         }
1273
1274         rt->rt6i_flags = cfg->fc_flags;
1275
1276 install_route:
1277         if (cfg->fc_mx) {
1278                 struct nlattr *nla;
1279                 int remaining;
1280
1281                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1282                         int type = nla_type(nla);
1283
1284                         if (type) {
1285                                 if (type > RTAX_MAX) {
1286                                         err = -EINVAL;
1287                                         goto out;
1288                                 }
1289
1290                                 rt->dst.metrics[type - 1] = nla_get_u32(nla);
1291                         }
1292                 }
1293         }
1294
1295         if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
1296                 rt->dst.metrics[RTAX_HOPLIMIT-1] = -1;
1297         if (!dst_mtu(&rt->dst))
1298                 rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1299         if (!dst_metric(&rt->dst, RTAX_ADVMSS))
1300                 rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
1301         rt->dst.dev = dev;
1302         rt->rt6i_idev = idev;
1303         rt->rt6i_table = table;
1304
1305         cfg->fc_nlinfo.nl_net = dev_net(dev);
1306
1307         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1308
1309 out:
1310         if (dev)
1311                 dev_put(dev);
1312         if (idev)
1313                 in6_dev_put(idev);
1314         if (rt)
1315                 dst_free(&rt->dst);
1316         return err;
1317 }
1318
1319 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1320 {
1321         int err;
1322         struct fib6_table *table;
1323         struct net *net = dev_net(rt->rt6i_dev);
1324
1325         if (rt == net->ipv6.ip6_null_entry)
1326                 return -ENOENT;
1327
1328         table = rt->rt6i_table;
1329         write_lock_bh(&table->tb6_lock);
1330
1331         err = fib6_del(rt, info);
1332         dst_release(&rt->dst);
1333
1334         write_unlock_bh(&table->tb6_lock);
1335
1336         return err;
1337 }
1338
1339 int ip6_del_rt(struct rt6_info *rt)
1340 {
1341         struct nl_info info = {
1342                 .nl_net = dev_net(rt->rt6i_dev),
1343         };
1344         return __ip6_del_rt(rt, &info);
1345 }
1346
1347 static int ip6_route_del(struct fib6_config *cfg)
1348 {
1349         struct fib6_table *table;
1350         struct fib6_node *fn;
1351         struct rt6_info *rt;
1352         int err = -ESRCH;
1353
1354         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1355         if (table == NULL)
1356                 return err;
1357
1358         read_lock_bh(&table->tb6_lock);
1359
1360         fn = fib6_locate(&table->tb6_root,
1361                          &cfg->fc_dst, cfg->fc_dst_len,
1362                          &cfg->fc_src, cfg->fc_src_len);
1363
1364         if (fn) {
1365                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1366                         if (cfg->fc_ifindex &&
1367                             (rt->rt6i_dev == NULL ||
1368                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1369                                 continue;
1370                         if (cfg->fc_flags & RTF_GATEWAY &&
1371                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1372                                 continue;
1373                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1374                                 continue;
1375                         dst_hold(&rt->dst);
1376                         read_unlock_bh(&table->tb6_lock);
1377
1378                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1379                 }
1380         }
1381         read_unlock_bh(&table->tb6_lock);
1382
1383         return err;
1384 }
1385
1386 /*
1387  *      Handle redirects
1388  */
1389 struct ip6rd_flowi {
1390         struct flowi fl;
1391         struct in6_addr gateway;
1392 };
1393
1394 static struct rt6_info *__ip6_route_redirect(struct net *net,
1395                                              struct fib6_table *table,
1396                                              struct flowi *fl,
1397                                              int flags)
1398 {
1399         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1400         struct rt6_info *rt;
1401         struct fib6_node *fn;
1402
1403         /*
1404          * Get the "current" route for this destination and
1405          * check if the redirect has come from approriate router.
1406          *
1407          * RFC 2461 specifies that redirects should only be
1408          * accepted if they come from the nexthop to the target.
1409          * Due to the way the routes are chosen, this notion
1410          * is a bit fuzzy and one might need to check all possible
1411          * routes.
1412          */
1413
1414         read_lock_bh(&table->tb6_lock);
1415         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1416 restart:
1417         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1418                 /*
1419                  * Current route is on-link; redirect is always invalid.
1420                  *
1421                  * Seems, previous statement is not true. It could
1422                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1423                  * But then router serving it might decide, that we should
1424                  * know truth 8)8) --ANK (980726).
1425                  */
1426                 if (rt6_check_expired(rt))
1427                         continue;
1428                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1429                         continue;
1430                 if (fl->oif != rt->rt6i_dev->ifindex)
1431                         continue;
1432                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1433                         continue;
1434                 break;
1435         }
1436
1437         if (!rt)
1438                 rt = net->ipv6.ip6_null_entry;
1439         BACKTRACK(net, &fl->fl6_src);
1440 out:
1441         dst_hold(&rt->dst);
1442
1443         read_unlock_bh(&table->tb6_lock);
1444
1445         return rt;
1446 };
1447
1448 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1449                                            struct in6_addr *src,
1450                                            struct in6_addr *gateway,
1451                                            struct net_device *dev)
1452 {
1453         int flags = RT6_LOOKUP_F_HAS_SADDR;
1454         struct net *net = dev_net(dev);
1455         struct ip6rd_flowi rdfl = {
1456                 .fl = {
1457                         .oif = dev->ifindex,
1458                         .fl6_dst = *dest,
1459                         .fl6_src = *src,
1460                 },
1461         };
1462
1463         ipv6_addr_copy(&rdfl.gateway, gateway);
1464
1465         if (rt6_need_strict(dest))
1466                 flags |= RT6_LOOKUP_F_IFACE;
1467
1468         return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1469                                                    flags, __ip6_route_redirect);
1470 }
1471
1472 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1473                   struct in6_addr *saddr,
1474                   struct neighbour *neigh, u8 *lladdr, int on_link)
1475 {
1476         struct rt6_info *rt, *nrt = NULL;
1477         struct netevent_redirect netevent;
1478         struct net *net = dev_net(neigh->dev);
1479
1480         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1481
1482         if (rt == net->ipv6.ip6_null_entry) {
1483                 if (net_ratelimit())
1484                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1485                                "for redirect target\n");
1486                 goto out;
1487         }
1488
1489         /*
1490          *      We have finally decided to accept it.
1491          */
1492
1493         neigh_update(neigh, lladdr, NUD_STALE,
1494                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1495                      NEIGH_UPDATE_F_OVERRIDE|
1496                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1497                                      NEIGH_UPDATE_F_ISROUTER))
1498                      );
1499
1500         /*
1501          * Redirect received -> path was valid.
1502          * Look, redirects are sent only in response to data packets,
1503          * so that this nexthop apparently is reachable. --ANK
1504          */
1505         dst_confirm(&rt->dst);
1506
1507         /* Duplicate redirect: silently ignore. */
1508         if (neigh == rt->dst.neighbour)
1509                 goto out;
1510
1511         nrt = ip6_rt_copy(rt);
1512         if (nrt == NULL)
1513                 goto out;
1514
1515         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1516         if (on_link)
1517                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1518
1519         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1520         nrt->rt6i_dst.plen = 128;
1521         nrt->dst.flags |= DST_HOST;
1522
1523         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1524         nrt->rt6i_nexthop = neigh_clone(neigh);
1525         /* Reset pmtu, it may be better */
1526         nrt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1527         nrt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1528                                                         dst_mtu(&nrt->dst));
1529
1530         if (ip6_ins_rt(nrt))
1531                 goto out;
1532
1533         netevent.old = &rt->dst;
1534         netevent.new = &nrt->dst;
1535         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1536
1537         if (rt->rt6i_flags&RTF_CACHE) {
1538                 ip6_del_rt(rt);
1539                 return;
1540         }
1541
1542 out:
1543         dst_release(&rt->dst);
1544 }
1545
1546 /*
1547  *      Handle ICMP "packet too big" messages
1548  *      i.e. Path MTU discovery
1549  */
1550
1551 static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1552                              struct net *net, u32 pmtu, int ifindex)
1553 {
1554         struct rt6_info *rt, *nrt;
1555         int allfrag = 0;
1556
1557         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1558         if (rt == NULL)
1559                 return;
1560
1561         if (pmtu >= dst_mtu(&rt->dst))
1562                 goto out;
1563
1564         if (pmtu < IPV6_MIN_MTU) {
1565                 /*
1566                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1567                  * MTU (1280) and a fragment header should always be included
1568                  * after a node receiving Too Big message reporting PMTU is
1569                  * less than the IPv6 Minimum Link MTU.
1570                  */
1571                 pmtu = IPV6_MIN_MTU;
1572                 allfrag = 1;
1573         }
1574
1575         /* New mtu received -> path was valid.
1576            They are sent only in response to data packets,
1577            so that this nexthop apparently is reachable. --ANK
1578          */
1579         dst_confirm(&rt->dst);
1580
1581         /* Host route. If it is static, it would be better
1582            not to override it, but add new one, so that
1583            when cache entry will expire old pmtu
1584            would return automatically.
1585          */
1586         if (rt->rt6i_flags & RTF_CACHE) {
1587                 rt->dst.metrics[RTAX_MTU-1] = pmtu;
1588                 if (allfrag)
1589                         rt->dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1590                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1591                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1592                 goto out;
1593         }
1594
1595         /* Network route.
1596            Two cases are possible:
1597            1. It is connected route. Action: COW
1598            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1599          */
1600         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1601                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1602         else
1603                 nrt = rt6_alloc_clone(rt, daddr);
1604
1605         if (nrt) {
1606                 nrt->dst.metrics[RTAX_MTU-1] = pmtu;
1607                 if (allfrag)
1608                         nrt->dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1609
1610                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1611                  * happened within 5 mins, the recommended timer is 10 mins.
1612                  * Here this route expiration time is set to ip6_rt_mtu_expires
1613                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1614                  * and detecting PMTU increase will be automatically happened.
1615                  */
1616                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1617                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1618
1619                 ip6_ins_rt(nrt);
1620         }
1621 out:
1622         dst_release(&rt->dst);
1623 }
1624
1625 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1626                         struct net_device *dev, u32 pmtu)
1627 {
1628         struct net *net = dev_net(dev);
1629
1630         /*
1631          * RFC 1981 states that a node "MUST reduce the size of the packets it
1632          * is sending along the path" that caused the Packet Too Big message.
1633          * Since it's not possible in the general case to determine which
1634          * interface was used to send the original packet, we update the MTU
1635          * on the interface that will be used to send future packets. We also
1636          * update the MTU on the interface that received the Packet Too Big in
1637          * case the original packet was forced out that interface with
1638          * SO_BINDTODEVICE or similar. This is the next best thing to the
1639          * correct behaviour, which would be to update the MTU on all
1640          * interfaces.
1641          */
1642         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1643         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1644 }
1645
1646 /*
1647  *      Misc support functions
1648  */
1649
1650 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1651 {
1652         struct net *net = dev_net(ort->rt6i_dev);
1653         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1654
1655         if (rt) {
1656                 rt->dst.input = ort->dst.input;
1657                 rt->dst.output = ort->dst.output;
1658
1659                 memcpy(rt->dst.metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
1660                 rt->dst.error = ort->dst.error;
1661                 rt->dst.dev = ort->dst.dev;
1662                 if (rt->dst.dev)
1663                         dev_hold(rt->dst.dev);
1664                 rt->rt6i_idev = ort->rt6i_idev;
1665                 if (rt->rt6i_idev)
1666                         in6_dev_hold(rt->rt6i_idev);
1667                 rt->dst.lastuse = jiffies;
1668                 rt->rt6i_expires = 0;
1669
1670                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1671                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1672                 rt->rt6i_metric = 0;
1673
1674                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1675 #ifdef CONFIG_IPV6_SUBTREES
1676                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1677 #endif
1678                 rt->rt6i_table = ort->rt6i_table;
1679         }
1680         return rt;
1681 }
1682
1683 #ifdef CONFIG_IPV6_ROUTE_INFO
1684 static struct rt6_info *rt6_get_route_info(struct net *net,
1685                                            struct in6_addr *prefix, int prefixlen,
1686                                            struct in6_addr *gwaddr, int ifindex)
1687 {
1688         struct fib6_node *fn;
1689         struct rt6_info *rt = NULL;
1690         struct fib6_table *table;
1691
1692         table = fib6_get_table(net, RT6_TABLE_INFO);
1693         if (table == NULL)
1694                 return NULL;
1695
1696         write_lock_bh(&table->tb6_lock);
1697         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1698         if (!fn)
1699                 goto out;
1700
1701         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1702                 if (rt->rt6i_dev->ifindex != ifindex)
1703                         continue;
1704                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1705                         continue;
1706                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1707                         continue;
1708                 dst_hold(&rt->dst);
1709                 break;
1710         }
1711 out:
1712         write_unlock_bh(&table->tb6_lock);
1713         return rt;
1714 }
1715
1716 static struct rt6_info *rt6_add_route_info(struct net *net,
1717                                            struct in6_addr *prefix, int prefixlen,
1718                                            struct in6_addr *gwaddr, int ifindex,
1719                                            unsigned pref)
1720 {
1721         struct fib6_config cfg = {
1722                 .fc_table       = RT6_TABLE_INFO,
1723                 .fc_metric      = IP6_RT_PRIO_USER,
1724                 .fc_ifindex     = ifindex,
1725                 .fc_dst_len     = prefixlen,
1726                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1727                                   RTF_UP | RTF_PREF(pref),
1728                 .fc_nlinfo.pid = 0,
1729                 .fc_nlinfo.nlh = NULL,
1730                 .fc_nlinfo.nl_net = net,
1731         };
1732
1733         ipv6_addr_copy(&cfg.fc_dst, prefix);
1734         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1735
1736         /* We should treat it as a default route if prefix length is 0. */
1737         if (!prefixlen)
1738                 cfg.fc_flags |= RTF_DEFAULT;
1739
1740         ip6_route_add(&cfg);
1741
1742         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1743 }
1744 #endif
1745
1746 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1747 {
1748         struct rt6_info *rt;
1749         struct fib6_table *table;
1750
1751         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1752         if (table == NULL)
1753                 return NULL;
1754
1755         write_lock_bh(&table->tb6_lock);
1756         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1757                 if (dev == rt->rt6i_dev &&
1758                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1759                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1760                         break;
1761         }
1762         if (rt)
1763                 dst_hold(&rt->dst);
1764         write_unlock_bh(&table->tb6_lock);
1765         return rt;
1766 }
1767
1768 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1769                                      struct net_device *dev,
1770                                      unsigned int pref)
1771 {
1772         struct fib6_config cfg = {
1773                 .fc_table       = RT6_TABLE_DFLT,
1774                 .fc_metric      = IP6_RT_PRIO_USER,
1775                 .fc_ifindex     = dev->ifindex,
1776                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1777                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1778                 .fc_nlinfo.pid = 0,
1779                 .fc_nlinfo.nlh = NULL,
1780                 .fc_nlinfo.nl_net = dev_net(dev),
1781         };
1782
1783         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1784
1785         ip6_route_add(&cfg);
1786
1787         return rt6_get_dflt_router(gwaddr, dev);
1788 }
1789
1790 void rt6_purge_dflt_routers(struct net *net)
1791 {
1792         struct rt6_info *rt;
1793         struct fib6_table *table;
1794
1795         /* NOTE: Keep consistent with rt6_get_dflt_router */
1796         table = fib6_get_table(net, RT6_TABLE_DFLT);
1797         if (table == NULL)
1798                 return;
1799
1800 restart:
1801         read_lock_bh(&table->tb6_lock);
1802         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1803                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1804                         dst_hold(&rt->dst);
1805                         read_unlock_bh(&table->tb6_lock);
1806                         ip6_del_rt(rt);
1807                         goto restart;
1808                 }
1809         }
1810         read_unlock_bh(&table->tb6_lock);
1811 }
1812
1813 static void rtmsg_to_fib6_config(struct net *net,
1814                                  struct in6_rtmsg *rtmsg,
1815                                  struct fib6_config *cfg)
1816 {
1817         memset(cfg, 0, sizeof(*cfg));
1818
1819         cfg->fc_table = RT6_TABLE_MAIN;
1820         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1821         cfg->fc_metric = rtmsg->rtmsg_metric;
1822         cfg->fc_expires = rtmsg->rtmsg_info;
1823         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1824         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1825         cfg->fc_flags = rtmsg->rtmsg_flags;
1826
1827         cfg->fc_nlinfo.nl_net = net;
1828
1829         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1830         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1831         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1832 }
1833
1834 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1835 {
1836         struct fib6_config cfg;
1837         struct in6_rtmsg rtmsg;
1838         int err;
1839
1840         switch(cmd) {
1841         case SIOCADDRT:         /* Add a route */
1842         case SIOCDELRT:         /* Delete a route */
1843                 if (!capable(CAP_NET_ADMIN))
1844                         return -EPERM;
1845                 err = copy_from_user(&rtmsg, arg,
1846                                      sizeof(struct in6_rtmsg));
1847                 if (err)
1848                         return -EFAULT;
1849
1850                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1851
1852                 rtnl_lock();
1853                 switch (cmd) {
1854                 case SIOCADDRT:
1855                         err = ip6_route_add(&cfg);
1856                         break;
1857                 case SIOCDELRT:
1858                         err = ip6_route_del(&cfg);
1859                         break;
1860                 default:
1861                         err = -EINVAL;
1862                 }
1863                 rtnl_unlock();
1864
1865                 return err;
1866         }
1867
1868         return -EINVAL;
1869 }
1870
1871 /*
1872  *      Drop the packet on the floor
1873  */
1874
1875 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1876 {
1877         int type;
1878         struct dst_entry *dst = skb_dst(skb);
1879         switch (ipstats_mib_noroutes) {
1880         case IPSTATS_MIB_INNOROUTES:
1881                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1882                 if (type == IPV6_ADDR_ANY) {
1883                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1884                                       IPSTATS_MIB_INADDRERRORS);
1885                         break;
1886                 }
1887                 /* FALLTHROUGH */
1888         case IPSTATS_MIB_OUTNOROUTES:
1889                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1890                               ipstats_mib_noroutes);
1891                 break;
1892         }
1893         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1894         kfree_skb(skb);
1895         return 0;
1896 }
1897
1898 static int ip6_pkt_discard(struct sk_buff *skb)
1899 {
1900         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1901 }
1902
1903 static int ip6_pkt_discard_out(struct sk_buff *skb)
1904 {
1905         skb->dev = skb_dst(skb)->dev;
1906         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1907 }
1908
1909 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1910
1911 static int ip6_pkt_prohibit(struct sk_buff *skb)
1912 {
1913         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1914 }
1915
1916 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1917 {
1918         skb->dev = skb_dst(skb)->dev;
1919         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1920 }
1921
1922 #endif
1923
1924 /*
1925  *      Allocate a dst for local (unicast / anycast) address.
1926  */
1927
1928 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1929                                     const struct in6_addr *addr,
1930                                     int anycast)
1931 {
1932         struct net *net = dev_net(idev->dev);
1933         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1934         struct neighbour *neigh;
1935
1936         if (rt == NULL) {
1937                 if (net_ratelimit())
1938                         pr_warning("IPv6:  Maximum number of routes reached,"
1939                                    " consider increasing route/max_size.\n");
1940                 return ERR_PTR(-ENOMEM);
1941         }
1942
1943         dev_hold(net->loopback_dev);
1944         in6_dev_hold(idev);
1945
1946         rt->dst.flags = DST_HOST;
1947         rt->dst.input = ip6_input;
1948         rt->dst.output = ip6_output;
1949         rt->rt6i_dev = net->loopback_dev;
1950         rt->rt6i_idev = idev;
1951         rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1952         rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
1953         rt->dst.metrics[RTAX_HOPLIMIT-1] = -1;
1954         rt->dst.obsolete = -1;
1955
1956         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1957         if (anycast)
1958                 rt->rt6i_flags |= RTF_ANYCAST;
1959         else
1960                 rt->rt6i_flags |= RTF_LOCAL;
1961         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1962         if (IS_ERR(neigh)) {
1963                 dst_free(&rt->dst);
1964
1965                 /* We are casting this because that is the return
1966                  * value type.  But an errno encoded pointer is the
1967                  * same regardless of the underlying pointer type,
1968                  * and that's what we are returning.  So this is OK.
1969                  */
1970                 return (struct rt6_info *) neigh;
1971         }
1972         rt->rt6i_nexthop = neigh;
1973
1974         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1975         rt->rt6i_dst.plen = 128;
1976         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1977
1978         atomic_set(&rt->dst.__refcnt, 1);
1979
1980         return rt;
1981 }
1982
/* Argument bundle that rt6_ifdown() passes to the fib6_ifdown() callback. */
struct arg_dev_net {
	struct net_device *dev;	/* device to match; NULL matches every device */
	struct net *net;	/* namespace whose ip6_null_entry is spared */
};
1987
1988 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1989 {
1990         struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1991         struct net *net = ((struct arg_dev_net *)arg)->net;
1992
1993         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
1994             rt != net->ipv6.ip6_null_entry) {
1995                 RT6_TRACE("deleted by ifdown %p\n", rt);
1996                 return -1;
1997         }
1998         return 0;
1999 }
2000
2001 void rt6_ifdown(struct net *net, struct net_device *dev)
2002 {
2003         struct arg_dev_net adn = {
2004                 .dev = dev,
2005                 .net = net,
2006         };
2007
2008         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2009         icmp6_clean_all(fib6_ifdown, &adn);
2010 }
2011
/* Argument bundle that rt6_mtu_change() passes to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* the new MTU */
};
2017
2018 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2019 {
2020         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2021         struct inet6_dev *idev;
2022         struct net *net = dev_net(arg->dev);
2023
2024         /* In IPv6 pmtu discovery is not optional,
2025            so that RTAX_MTU lock cannot disable it.
2026            We still use this lock to block changes
2027            caused by addrconf/ndisc.
2028         */
2029
2030         idev = __in6_dev_get(arg->dev);
2031         if (idev == NULL)
2032                 return 0;
2033
2034         /* For administrative MTU increase, there is no way to discover
2035            IPv6 PMTU increase, so PMTU increase should be updated here.
2036            Since RFC 1981 doesn't include administrative MTU increase
2037            update PMTU increase is a MUST. (i.e. jumbo frame)
2038          */
2039         /*
2040            If new MTU is less than route PMTU, this new MTU will be the
2041            lowest MTU in the path, update the route PMTU to reflect PMTU
2042            decreases; if new MTU is greater than route PMTU, and the
2043            old MTU is the lowest MTU in the path, update the route PMTU
2044            to reflect the increase. In this case if the other nodes' MTU
2045            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2046            PMTU discouvery.
2047          */
2048         if (rt->rt6i_dev == arg->dev &&
2049             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2050             (dst_mtu(&rt->dst) >= arg->mtu ||
2051              (dst_mtu(&rt->dst) < arg->mtu &&
2052               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2053                 rt->dst.metrics[RTAX_MTU-1] = arg->mtu;
2054                 rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
2055         }
2056         return 0;
2057 }
2058
2059 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2060 {
2061         struct rt6_mtu_change_arg arg = {
2062                 .dev = dev,
2063                 .mtu = mtu,
2064         };
2065
2066         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2067 }
2068
/*
 * Attribute policy passed to nlmsg_parse() by rtm_to_fib6_config() and
 * inet6_rtm_getroute().  Attributes without an entry here (for example
 * RTA_DST, RTA_SRC and RTA_TABLE) have no policy and are length-checked,
 * where needed, by the code that consumes them.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]               = { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
};
2076
2077 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2078                               struct fib6_config *cfg)
2079 {
2080         struct rtmsg *rtm;
2081         struct nlattr *tb[RTA_MAX+1];
2082         int err;
2083
2084         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2085         if (err < 0)
2086                 goto errout;
2087
2088         err = -EINVAL;
2089         rtm = nlmsg_data(nlh);
2090         memset(cfg, 0, sizeof(*cfg));
2091
2092         cfg->fc_table = rtm->rtm_table;
2093         cfg->fc_dst_len = rtm->rtm_dst_len;
2094         cfg->fc_src_len = rtm->rtm_src_len;
2095         cfg->fc_flags = RTF_UP;
2096         cfg->fc_protocol = rtm->rtm_protocol;
2097
2098         if (rtm->rtm_type == RTN_UNREACHABLE)
2099                 cfg->fc_flags |= RTF_REJECT;
2100
2101         if (rtm->rtm_type == RTN_LOCAL)
2102                 cfg->fc_flags |= RTF_LOCAL;
2103
2104         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2105         cfg->fc_nlinfo.nlh = nlh;
2106         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2107
2108         if (tb[RTA_GATEWAY]) {
2109                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2110                 cfg->fc_flags |= RTF_GATEWAY;
2111         }
2112
2113         if (tb[RTA_DST]) {
2114                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2115
2116                 if (nla_len(tb[RTA_DST]) < plen)
2117                         goto errout;
2118
2119                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2120         }
2121
2122         if (tb[RTA_SRC]) {
2123                 int plen = (rtm->rtm_src_len + 7) >> 3;
2124
2125                 if (nla_len(tb[RTA_SRC]) < plen)
2126                         goto errout;
2127
2128                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2129         }
2130
2131         if (tb[RTA_OIF])
2132                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2133
2134         if (tb[RTA_PRIORITY])
2135                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2136
2137         if (tb[RTA_METRICS]) {
2138                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2139                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2140         }
2141
2142         if (tb[RTA_TABLE])
2143                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2144
2145         err = 0;
2146 errout:
2147         return err;
2148 }
2149
2150 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2151 {
2152         struct fib6_config cfg;
2153         int err;
2154
2155         err = rtm_to_fib6_config(skb, nlh, &cfg);
2156         if (err < 0)
2157                 return err;
2158
2159         return ip6_route_del(&cfg);
2160 }
2161
2162 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2163 {
2164         struct fib6_config cfg;
2165         int err;
2166
2167         err = rtm_to_fib6_config(skb, nlh, &cfg);
2168         if (err < 0)
2169                 return err;
2170
2171         return ip6_route_add(&cfg);
2172 }
2173
2174 static inline size_t rt6_nlmsg_size(void)
2175 {
2176         return NLMSG_ALIGN(sizeof(struct rtmsg))
2177                + nla_total_size(16) /* RTA_SRC */
2178                + nla_total_size(16) /* RTA_DST */
2179                + nla_total_size(16) /* RTA_GATEWAY */
2180                + nla_total_size(16) /* RTA_PREFSRC */
2181                + nla_total_size(4) /* RTA_TABLE */
2182                + nla_total_size(4) /* RTA_IIF */
2183                + nla_total_size(4) /* RTA_OIF */
2184                + nla_total_size(4) /* RTA_PRIORITY */
2185                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2186                + nla_total_size(sizeof(struct rta_cacheinfo));
2187 }
2188
2189 static int rt6_fill_node(struct net *net,
2190                          struct sk_buff *skb, struct rt6_info *rt,
2191                          struct in6_addr *dst, struct in6_addr *src,
2192                          int iif, int type, u32 pid, u32 seq,
2193                          int prefix, int nowait, unsigned int flags)
2194 {
2195         struct rtmsg *rtm;
2196         struct nlmsghdr *nlh;
2197         long expires;
2198         u32 table;
2199
2200         if (prefix) {   /* user wants prefix routes only */
2201                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2202                         /* success since this is not a prefix route */
2203                         return 1;
2204                 }
2205         }
2206
2207         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2208         if (nlh == NULL)
2209                 return -EMSGSIZE;
2210
2211         rtm = nlmsg_data(nlh);
2212         rtm->rtm_family = AF_INET6;
2213         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2214         rtm->rtm_src_len = rt->rt6i_src.plen;
2215         rtm->rtm_tos = 0;
2216         if (rt->rt6i_table)
2217                 table = rt->rt6i_table->tb6_id;
2218         else
2219                 table = RT6_TABLE_UNSPEC;
2220         rtm->rtm_table = table;
2221         NLA_PUT_U32(skb, RTA_TABLE, table);
2222         if (rt->rt6i_flags&RTF_REJECT)
2223                 rtm->rtm_type = RTN_UNREACHABLE;
2224         else if (rt->rt6i_flags&RTF_LOCAL)
2225                 rtm->rtm_type = RTN_LOCAL;
2226         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2227                 rtm->rtm_type = RTN_LOCAL;
2228         else
2229                 rtm->rtm_type = RTN_UNICAST;
2230         rtm->rtm_flags = 0;
2231         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2232         rtm->rtm_protocol = rt->rt6i_protocol;
2233         if (rt->rt6i_flags&RTF_DYNAMIC)
2234                 rtm->rtm_protocol = RTPROT_REDIRECT;
2235         else if (rt->rt6i_flags & RTF_ADDRCONF)
2236                 rtm->rtm_protocol = RTPROT_KERNEL;
2237         else if (rt->rt6i_flags&RTF_DEFAULT)
2238                 rtm->rtm_protocol = RTPROT_RA;
2239
2240         if (rt->rt6i_flags&RTF_CACHE)
2241                 rtm->rtm_flags |= RTM_F_CLONED;
2242
2243         if (dst) {
2244                 NLA_PUT(skb, RTA_DST, 16, dst);
2245                 rtm->rtm_dst_len = 128;
2246         } else if (rtm->rtm_dst_len)
2247                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2248 #ifdef CONFIG_IPV6_SUBTREES
2249         if (src) {
2250                 NLA_PUT(skb, RTA_SRC, 16, src);
2251                 rtm->rtm_src_len = 128;
2252         } else if (rtm->rtm_src_len)
2253                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2254 #endif
2255         if (iif) {
2256 #ifdef CONFIG_IPV6_MROUTE
2257                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2258                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2259                         if (err <= 0) {
2260                                 if (!nowait) {
2261                                         if (err == 0)
2262                                                 return 0;
2263                                         goto nla_put_failure;
2264                                 } else {
2265                                         if (err == -EMSGSIZE)
2266                                                 goto nla_put_failure;
2267                                 }
2268                         }
2269                 } else
2270 #endif
2271                         NLA_PUT_U32(skb, RTA_IIF, iif);
2272         } else if (dst) {
2273                 struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
2274                 struct in6_addr saddr_buf;
2275                 if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2276                                        dst, 0, &saddr_buf) == 0)
2277                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2278         }
2279
2280         if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
2281                 goto nla_put_failure;
2282
2283         if (rt->dst.neighbour)
2284                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2285
2286         if (rt->dst.dev)
2287                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2288
2289         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2290
2291         if (!(rt->rt6i_flags & RTF_EXPIRES))
2292                 expires = 0;
2293         else if (rt->rt6i_expires - jiffies < INT_MAX)
2294                 expires = rt->rt6i_expires - jiffies;
2295         else
2296                 expires = INT_MAX;
2297
2298         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2299                                expires, rt->dst.error) < 0)
2300                 goto nla_put_failure;
2301
2302         return nlmsg_end(skb, nlh);
2303
2304 nla_put_failure:
2305         nlmsg_cancel(skb, nlh);
2306         return -EMSGSIZE;
2307 }
2308
2309 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2310 {
2311         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2312         int prefix;
2313
2314         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2315                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2316                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2317         } else
2318                 prefix = 0;
2319
2320         return rt6_fill_node(arg->net,
2321                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2322                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2323                      prefix, 0, NLM_F_MULTI);
2324 }
2325
2326 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2327 {
2328         struct net *net = sock_net(in_skb->sk);
2329         struct nlattr *tb[RTA_MAX+1];
2330         struct rt6_info *rt;
2331         struct sk_buff *skb;
2332         struct rtmsg *rtm;
2333         struct flowi fl;
2334         int err, iif = 0;
2335
2336         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2337         if (err < 0)
2338                 goto errout;
2339
2340         err = -EINVAL;
2341         memset(&fl, 0, sizeof(fl));
2342
2343         if (tb[RTA_SRC]) {
2344                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2345                         goto errout;
2346
2347                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2348         }
2349
2350         if (tb[RTA_DST]) {
2351                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2352                         goto errout;
2353
2354                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2355         }
2356
2357         if (tb[RTA_IIF])
2358                 iif = nla_get_u32(tb[RTA_IIF]);
2359
2360         if (tb[RTA_OIF])
2361                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2362
2363         if (iif) {
2364                 struct net_device *dev;
2365                 dev = __dev_get_by_index(net, iif);
2366                 if (!dev) {
2367                         err = -ENODEV;
2368                         goto errout;
2369                 }
2370         }
2371
2372         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2373         if (skb == NULL) {
2374                 err = -ENOBUFS;
2375                 goto errout;
2376         }
2377
2378         /* Reserve room for dummy headers, this skb can pass
2379            through good chunk of routing engine.
2380          */
2381         skb_reset_mac_header(skb);
2382         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2383
2384         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2385         skb_dst_set(skb, &rt->dst);
2386
2387         err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2388                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2389                             nlh->nlmsg_seq, 0, 0, 0);
2390         if (err < 0) {
2391                 kfree_skb(skb);
2392                 goto errout;
2393         }
2394
2395         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2396 errout:
2397         return err;
2398 }
2399
2400 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2401 {
2402         struct sk_buff *skb;
2403         struct net *net = info->nl_net;
2404         u32 seq;
2405         int err;
2406
2407         err = -ENOBUFS;
2408         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2409
2410         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2411         if (skb == NULL)
2412                 goto errout;
2413
2414         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2415                                 event, info->pid, seq, 0, 0, 0);
2416         if (err < 0) {
2417                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2418                 WARN_ON(err == -EMSGSIZE);
2419                 kfree_skb(skb);
2420                 goto errout;
2421         }
2422         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2423                     info->nlh, gfp_any());
2424         return;
2425 errout:
2426         if (err < 0)
2427                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2428 }
2429
2430 static int ip6_route_dev_notify(struct notifier_block *this,
2431                                 unsigned long event, void *data)
2432 {
2433         struct net_device *dev = (struct net_device *)data;
2434         struct net *net = dev_net(dev);
2435
2436         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2437                 net->ipv6.ip6_null_entry->dst.dev = dev;
2438                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2439 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2440                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2441                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2442                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2443                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2444 #endif
2445         }
2446
2447         return NOTIFY_OK;
2448 }
2449
2450 /*
2451  *      /proc
2452  */
2453
2454 #ifdef CONFIG_PROC_FS
2455
/*
 * NOTE(review): neither RT6_INFO_LEN nor struct rt6_proc_arg is referenced
 * by the seq_file based /proc code below; they look like leftovers from an
 * older buffer-based implementation - confirm before removing.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

struct rt6_proc_arg
{
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2466
2467 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2468 {
2469         struct seq_file *m = p_arg;
2470
2471         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2472
2473 #ifdef CONFIG_IPV6_SUBTREES
2474         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2475 #else
2476         seq_puts(m, "00000000000000000000000000000000 00 ");
2477 #endif
2478
2479         if (rt->rt6i_nexthop) {
2480                 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2481         } else {
2482                 seq_puts(m, "00000000000000000000000000000000");
2483         }
2484         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2485                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2486                    rt->dst.__use, rt->rt6i_flags,
2487                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2488         return 0;
2489 }
2490
2491 static int ipv6_route_show(struct seq_file *m, void *v)
2492 {
2493         struct net *net = (struct net *)m->private;
2494         fib6_clean_all(net, rt6_info_route, 0, m);
2495         return 0;
2496 }
2497
2498 static int ipv6_route_open(struct inode *inode, struct file *file)
2499 {
2500         return single_open_net(inode, file, ipv6_route_show);
2501 }
2502
/*
 * File operations for the route listing: opened through ipv6_route_open(),
 * read and seeked through the generic seq_file helpers, and released with
 * single_release_net() to match single_open_net().
 */
static const struct file_operations ipv6_route_proc_fops = {
	.owner          = THIS_MODULE,
	.open           = ipv6_route_open,
	.read           = seq_read,
	.llseek         = seq_lseek,
	.release        = single_release_net,
};
2510
2511 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2512 {
2513         struct net *net = (struct net *)seq->private;
2514         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2515                    net->ipv6.rt6_stats->fib_nodes,
2516                    net->ipv6.rt6_stats->fib_route_nodes,
2517                    net->ipv6.rt6_stats->fib_rt_alloc,
2518                    net->ipv6.rt6_stats->fib_rt_entries,
2519                    net->ipv6.rt6_stats->fib_rt_cache,
2520                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2521                    net->ipv6.rt6_stats->fib_discarded_routes);
2522
2523         return 0;
2524 }
2525
2526 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2527 {
2528         return single_open_net(inode, file, rt6_stats_seq_show);
2529 }
2530
/*
 * File operations for the routing statistics file: opened through
 * rt6_stats_seq_open(), read and seeked through the generic seq_file
 * helpers, and released with single_release_net() to match
 * single_open_net().
 */
static const struct file_operations rt6_stats_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = rt6_stats_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release_net,
};
2538 #endif  /* CONFIG_PROC_FS */
2539
2540 #ifdef CONFIG_SYSCTL
2541
2542 static
2543 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2544                               void __user *buffer, size_t *lenp, loff_t *ppos)
2545 {
2546         struct net *net = current->nsproxy->net_ns;
2547         int delay = net->ipv6.sysctl.flush_delay;
2548         if (write) {
2549                 proc_dointvec(ctl, write, buffer, lenp, ppos);
2550                 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2551                 return 0;
2552         } else
2553                 return -EINVAL;
2554 }
2555
2556 ctl_table ipv6_route_table_template[] = {
2557         {
2558                 .procname       =       "flush",
2559                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2560                 .maxlen         =       sizeof(int),
2561                 .mode           =       0200,
2562                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2563         },
2564         {
2565                 .procname       =       "gc_thresh",
2566                 .data           =       &ip6_dst_ops_template.gc_thresh,
2567                 .maxlen         =       sizeof(int),
2568                 .mode           =       0644,
2569                 .proc_handler   =       proc_dointvec,
2570         },
2571         {
2572                 .procname       =       "max_size",
2573                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2574                 .maxlen         =       sizeof(int),
2575                 .mode           =       0644,
2576                 .proc_handler   =       proc_dointvec,
2577         },
2578         {
2579                 .procname       =       "gc_min_interval",
2580                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2581                 .maxlen         =       sizeof(int),
2582                 .mode           =       0644,
2583                 .proc_handler   =       proc_dointvec_jiffies,
2584         },
2585         {
2586                 .procname       =       "gc_timeout",
2587                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2588                 .maxlen         =       sizeof(int),
2589                 .mode           =       0644,
2590                 .proc_handler   =       proc_dointvec_jiffies,
2591         },
2592         {
2593                 .procname       =       "gc_interval",
2594                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2595                 .maxlen         =       sizeof(int),
2596                 .mode           =       0644,
2597                 .proc_handler   =       proc_dointvec_jiffies,
2598         },
2599         {
2600                 .procname       =       "gc_elasticity",
2601                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2602                 .maxlen         =       sizeof(int),
2603                 .mode           =       0644,
2604                 .proc_handler   =       proc_dointvec,
2605         },
2606         {
2607                 .procname       =       "mtu_expires",
2608                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2609                 .maxlen         =       sizeof(int),
2610                 .mode           =       0644,
2611                 .proc_handler   =       proc_dointvec_jiffies,
2612         },
2613         {
2614                 .procname       =       "min_adv_mss",
2615                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2616                 .maxlen         =       sizeof(int),
2617                 .mode           =       0644,
2618                 .proc_handler   =       proc_dointvec,
2619         },
2620         {
2621                 .procname       =       "gc_min_interval_ms",
2622                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2623                 .maxlen         =       sizeof(int),
2624                 .mode           =       0644,
2625                 .proc_handler   =       proc_dointvec_ms_jiffies,
2626         },
2627         { }
2628 };
2629
2630 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2631 {
2632         struct ctl_table *table;
2633
2634         table = kmemdup(ipv6_route_table_template,
2635                         sizeof(ipv6_route_table_template),
2636                         GFP_KERNEL);
2637
2638         if (table) {
2639                 table[0].data = &net->ipv6.sysctl.flush_delay;
2640                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2641                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2642                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2643                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2644                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2645                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2646                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2647                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2648                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2649         }
2650
2651         return table;
2652 }
2653 #endif
2654
2655 static int __net_init ip6_route_net_init(struct net *net)
2656 {
2657         int ret = -ENOMEM;
2658
2659         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2660                sizeof(net->ipv6.ip6_dst_ops));
2661
2662         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2663                 goto out_ip6_dst_ops;
2664
2665         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2666                                            sizeof(*net->ipv6.ip6_null_entry),
2667                                            GFP_KERNEL);
2668         if (!net->ipv6.ip6_null_entry)
2669                 goto out_ip6_dst_entries;
2670         net->ipv6.ip6_null_entry->dst.path =
2671                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2672         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2673
2674 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2675         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2676                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2677                                                GFP_KERNEL);
2678         if (!net->ipv6.ip6_prohibit_entry)
2679                 goto out_ip6_null_entry;
2680         net->ipv6.ip6_prohibit_entry->dst.path =
2681                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2682         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2683
2684         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2685                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2686                                                GFP_KERNEL);
2687         if (!net->ipv6.ip6_blk_hole_entry)
2688                 goto out_ip6_prohibit_entry;
2689         net->ipv6.ip6_blk_hole_entry->dst.path =
2690                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2691         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2692 #endif
2693
2694         net->ipv6.sysctl.flush_delay = 0;
2695         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2696         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2697         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2698         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2699         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2700         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2701         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2702
2703 #ifdef CONFIG_PROC_FS
2704         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2705         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2706 #endif
2707         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2708
2709         ret = 0;
2710 out:
2711         return ret;
2712
2713 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2714 out_ip6_prohibit_entry:
2715         kfree(net->ipv6.ip6_prohibit_entry);
2716 out_ip6_null_entry:
2717         kfree(net->ipv6.ip6_null_entry);
2718 #endif
2719 out_ip6_dst_entries:
2720         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2721 out_ip6_dst_ops:
2722         goto out;
2723 }
2724
2725 static void __net_exit ip6_route_net_exit(struct net *net)
2726 {
2727 #ifdef CONFIG_PROC_FS
2728         proc_net_remove(net, "ipv6_route");
2729         proc_net_remove(net, "rt6_stats");
2730 #endif
2731         kfree(net->ipv6.ip6_null_entry);
2732 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2733         kfree(net->ipv6.ip6_prohibit_entry);
2734         kfree(net->ipv6.ip6_blk_hole_entry);
2735 #endif
2736         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2737 }
2738
2739 static struct pernet_operations ip6_route_net_ops = {
2740         .init = ip6_route_net_init,
2741         .exit = ip6_route_net_exit,
2742 };
2743
2744 static struct notifier_block ip6_route_dev_notifier = {
2745         .notifier_call = ip6_route_dev_notify,
2746         .priority = 0,
2747 };
2748
2749 int __init ip6_route_init(void)
2750 {
2751         int ret;
2752
2753         ret = -ENOMEM;
2754         ip6_dst_ops_template.kmem_cachep =
2755                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2756                                   SLAB_HWCACHE_ALIGN, NULL);
2757         if (!ip6_dst_ops_template.kmem_cachep)
2758                 goto out;
2759
2760         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2761         if (ret)
2762                 goto out_kmem_cache;
2763
2764         ret = register_pernet_subsys(&ip6_route_net_ops);
2765         if (ret)
2766                 goto out_dst_entries;
2767
2768         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2769
2770         /* Registering of the loopback is done before this portion of code,
2771          * the loopback reference in rt6_info will not be taken, do it
2772          * manually for init_net */
2773         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2774         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2775   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2776         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2777         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2778         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2779         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2780   #endif
2781         ret = fib6_init();
2782         if (ret)
2783                 goto out_register_subsys;
2784
2785         ret = xfrm6_init();
2786         if (ret)
2787                 goto out_fib6_init;
2788
2789         ret = fib6_rules_init();
2790         if (ret)
2791                 goto xfrm6_init;
2792
2793         ret = -ENOBUFS;
2794         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2795             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2796             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2797                 goto fib6_rules_init;
2798
2799         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2800         if (ret)
2801                 goto fib6_rules_init;
2802
2803 out:
2804         return ret;
2805
2806 fib6_rules_init:
2807         fib6_rules_cleanup();
2808 xfrm6_init:
2809         xfrm6_fini();
2810 out_fib6_init:
2811         fib6_gc_cleanup();
2812 out_register_subsys:
2813         unregister_pernet_subsys(&ip6_route_net_ops);
2814 out_dst_entries:
2815         dst_entries_destroy(&ip6_dst_blackhole_ops);
2816 out_kmem_cache:
2817         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2818         goto out;
2819 }
2820
2821 void ip6_route_cleanup(void)
2822 {
2823         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2824         fib6_rules_cleanup();
2825         xfrm6_fini();
2826         fib6_gc_cleanup();
2827         unregister_pernet_subsys(&ip6_route_net_ops);
2828         dst_entries_destroy(&ip6_dst_blackhole_ops);
2829         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2830 }