]> bbs.cooldavid.org Git - net-next-2.6.git/blob - net/ipv6/route.c
Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
[net-next-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug verbosity for this file.  With RT6_DEBUG at 3 or above, RDBG() and
 * RT6_TRACE() forward to printk(); below that RDBG() expands to nothing and
 * RT6_TRACE() to an empty statement.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/* Compile-time switch tested with #if inside ip6_pol_route(). */
#define CLONE_OFFLINK_ROUTE 0
76
77 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void             ip6_dst_destroy(struct dst_entry *);
81 static void             ip6_dst_ifdown(struct dst_entry *,
82                                        struct net_device *dev, int how);
83 static int               ip6_dst_gc(struct dst_ops *ops);
84
85 static int              ip6_pkt_discard(struct sk_buff *skb);
86 static int              ip6_pkt_discard_out(struct sk_buff *skb);
87 static void             ip6_link_failure(struct sk_buff *skb);
88 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92                                            struct in6_addr *prefix, int prefixlen,
93                                            struct in6_addr *gwaddr, int ifindex,
94                                            unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96                                            struct in6_addr *prefix, int prefixlen,
97                                            struct in6_addr *gwaddr, int ifindex);
98 #endif
99
100 static struct dst_ops ip6_dst_ops_template = {
101         .family                 =       AF_INET6,
102         .protocol               =       cpu_to_be16(ETH_P_IPV6),
103         .gc                     =       ip6_dst_gc,
104         .gc_thresh              =       1024,
105         .check                  =       ip6_dst_check,
106         .destroy                =       ip6_dst_destroy,
107         .ifdown                 =       ip6_dst_ifdown,
108         .negative_advice        =       ip6_negative_advice,
109         .link_failure           =       ip6_link_failure,
110         .update_pmtu            =       ip6_rt_update_pmtu,
111         .local_out              =       __ip6_local_out,
112         .entries                =       ATOMIC_INIT(0),
113 };
114
115 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
116 {
117 }
118
119 static struct dst_ops ip6_dst_blackhole_ops = {
120         .family                 =       AF_INET6,
121         .protocol               =       cpu_to_be16(ETH_P_IPV6),
122         .destroy                =       ip6_dst_destroy,
123         .check                  =       ip6_dst_check,
124         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
125         .entries                =       ATOMIC_INIT(0),
126 };
127
128 static struct rt6_info ip6_null_entry_template = {
129         .dst = {
130                 .__refcnt       = ATOMIC_INIT(1),
131                 .__use          = 1,
132                 .obsolete       = -1,
133                 .error          = -ENETUNREACH,
134                 .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
135                 .input          = ip6_pkt_discard,
136                 .output         = ip6_pkt_discard_out,
137         },
138         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
139         .rt6i_protocol  = RTPROT_KERNEL,
140         .rt6i_metric    = ~(u32) 0,
141         .rt6i_ref       = ATOMIC_INIT(1),
142 };
143
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" route entry: a reject route whose dst reports
 * -EACCES and hands packets to ip6_pkt_prohibit() / ip6_pkt_prohibit_out().
 */
static struct rt6_info ip6_prohibit_entry_template = {
	.rt6i_protocol = RTPROT_KERNEL,
	.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric = ~(u32) 0,
	.rt6i_ref = ATOMIC_INIT(1),
	.dst = {
		.__refcnt = ATOMIC_INIT(1),
		.__use = 1,
		.obsolete = -1,
		.error = -EACCES,
		.input = ip6_pkt_prohibit,
		.output = ip6_pkt_prohibit_out,
		.metrics = { [RTAX_HOPLIMIT - 1] = 255, },
	},
};

/*
 * Template for the "blackhole" route entry: a reject route whose dst reports
 * -EINVAL and uses dst_discard() for both input and output.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
	.rt6i_protocol = RTPROT_KERNEL,
	.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric = ~(u32) 0,
	.rt6i_ref = ATOMIC_INIT(1),
	.dst = {
		.__refcnt = ATOMIC_INIT(1),
		.__use = 1,
		.obsolete = -1,
		.error = -EINVAL,
		.input = dst_discard,
		.output = dst_discard,
		.metrics = { [RTAX_HOPLIMIT - 1] = 255, },
	},
};

#endif
182
/*
 * Allocate a dst entry through dst_alloc() with the given ops and hand it
 * back typed as an rt6_info.  The result of dst_alloc() is returned as-is,
 * so callers must check it before use.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	struct rt6_info *rt = (struct rt6_info *)dst_alloc(ops);

	return rt;
}
188
189 static void ip6_dst_destroy(struct dst_entry *dst)
190 {
191         struct rt6_info *rt = (struct rt6_info *)dst;
192         struct inet6_dev *idev = rt->rt6i_idev;
193
194         if (idev != NULL) {
195                 rt->rt6i_idev = NULL;
196                 in6_dev_put(idev);
197         }
198 }
199
200 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
201                            int how)
202 {
203         struct rt6_info *rt = (struct rt6_info *)dst;
204         struct inet6_dev *idev = rt->rt6i_idev;
205         struct net_device *loopback_dev =
206                 dev_net(dev)->loopback_dev;
207
208         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
209                 struct inet6_dev *loopback_idev =
210                         in6_dev_get(loopback_dev);
211                 if (loopback_idev != NULL) {
212                         rt->rt6i_idev = loopback_idev;
213                         in6_dev_put(idev);
214                 }
215         }
216 }
217
218 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
219 {
220         return (rt->rt6i_flags & RTF_EXPIRES) &&
221                 time_after(jiffies, rt->rt6i_expires);
222 }
223
224 static inline int rt6_need_strict(struct in6_addr *daddr)
225 {
226         return ipv6_addr_type(daddr) &
227                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
228 }
229
230 /*
231  *      Route lookup. Any table->tb6_lock is implied.
232  */
233
234 static inline struct rt6_info *rt6_device_match(struct net *net,
235                                                     struct rt6_info *rt,
236                                                     struct in6_addr *saddr,
237                                                     int oif,
238                                                     int flags)
239 {
240         struct rt6_info *local = NULL;
241         struct rt6_info *sprt;
242
243         if (!oif && ipv6_addr_any(saddr))
244                 goto out;
245
246         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
247                 struct net_device *dev = sprt->rt6i_dev;
248
249                 if (oif) {
250                         if (dev->ifindex == oif)
251                                 return sprt;
252                         if (dev->flags & IFF_LOOPBACK) {
253                                 if (sprt->rt6i_idev == NULL ||
254                                     sprt->rt6i_idev->dev->ifindex != oif) {
255                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
256                                                 continue;
257                                         if (local && (!oif ||
258                                                       local->rt6i_idev->dev->ifindex == oif))
259                                                 continue;
260                                 }
261                                 local = sprt;
262                         }
263                 } else {
264                         if (ipv6_chk_addr(net, saddr, dev,
265                                           flags & RT6_LOOKUP_F_IFACE))
266                                 return sprt;
267                 }
268         }
269
270         if (oif) {
271                 if (local)
272                         return local;
273
274                 if (flags & RT6_LOOKUP_F_IFACE)
275                         return net->ipv6.ip6_null_entry;
276         }
277 out:
278         return rt;
279 }
280
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * rt6_probe - Router Reachability Probing for the next hop of @rt.
 * @rt: route whose rt6i_nexthop should be probed; may be NULL
 *
 * Does nothing when there is no neighbour entry or it is already in a
 * NUD_VALID state.  Otherwise, if more than the interface's
 * rtr_probe_interval has elapsed since neigh->updated, the timestamp is
 * refreshed and a Neighbour Solicitation is sent to the solicited-node
 * multicast address of the neighbour's key.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	/*
	 * neigh->updated is written below, so the writer side of the lock
	 * is required; the reader side would let two CPUs pass the
	 * rate-limit test and update the timestamp concurrently.
	 */
	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		neigh->updated = jiffies;
		/* Drop the lock before transmitting. */
		write_unlock_bh(&neigh->lock);

		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
	} else
		write_unlock_bh(&neigh->lock);
}
#else
/* Router preference support disabled: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
315
316 /*
317  * Default Router Selection (RFC 2461 6.3.6)
318  */
319 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
320 {
321         struct net_device *dev = rt->rt6i_dev;
322         if (!oif || dev->ifindex == oif)
323                 return 2;
324         if ((dev->flags & IFF_LOOPBACK) &&
325             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
326                 return 1;
327         return 0;
328 }
329
330 static inline int rt6_check_neigh(struct rt6_info *rt)
331 {
332         struct neighbour *neigh = rt->rt6i_nexthop;
333         int m;
334         if (rt->rt6i_flags & RTF_NONEXTHOP ||
335             !(rt->rt6i_flags & RTF_GATEWAY))
336                 m = 1;
337         else if (neigh) {
338                 read_lock_bh(&neigh->lock);
339                 if (neigh->nud_state & NUD_VALID)
340                         m = 2;
341 #ifdef CONFIG_IPV6_ROUTER_PREF
342                 else if (neigh->nud_state & NUD_FAILED)
343                         m = 0;
344 #endif
345                 else
346                         m = 1;
347                 read_unlock_bh(&neigh->lock);
348         } else
349                 m = 0;
350         return m;
351 }
352
353 static int rt6_score_route(struct rt6_info *rt, int oif,
354                            int strict)
355 {
356         int m, n;
357
358         m = rt6_check_dev(rt, oif);
359         if (!m && (strict & RT6_LOOKUP_F_IFACE))
360                 return -1;
361 #ifdef CONFIG_IPV6_ROUTER_PREF
362         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
363 #endif
364         n = rt6_check_neigh(rt);
365         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
366                 return -1;
367         return m;
368 }
369
370 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
371                                    int *mpri, struct rt6_info *match)
372 {
373         int m;
374
375         if (rt6_check_expired(rt))
376                 goto out;
377
378         m = rt6_score_route(rt, oif, strict);
379         if (m < 0)
380                 goto out;
381
382         if (m > *mpri) {
383                 if (strict & RT6_LOOKUP_F_REACHABLE)
384                         rt6_probe(match);
385                 *mpri = m;
386                 match = rt;
387         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
388                 rt6_probe(rt);
389         }
390
391 out:
392         return match;
393 }
394
395 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
396                                      struct rt6_info *rr_head,
397                                      u32 metric, int oif, int strict)
398 {
399         struct rt6_info *rt, *match;
400         int mpri = -1;
401
402         match = NULL;
403         for (rt = rr_head; rt && rt->rt6i_metric == metric;
404              rt = rt->dst.rt6_next)
405                 match = find_match(rt, oif, strict, &mpri, match);
406         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
407              rt = rt->dst.rt6_next)
408                 match = find_match(rt, oif, strict, &mpri, match);
409
410         return match;
411 }
412
413 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
414 {
415         struct rt6_info *match, *rt0;
416         struct net *net;
417
418         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
419                   __func__, fn->leaf, oif);
420
421         rt0 = fn->rr_ptr;
422         if (!rt0)
423                 fn->rr_ptr = rt0 = fn->leaf;
424
425         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
426
427         if (!match &&
428             (strict & RT6_LOOKUP_F_REACHABLE)) {
429                 struct rt6_info *next = rt0->dst.rt6_next;
430
431                 /* no entries matched; do round-robin */
432                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
433                         next = fn->leaf;
434
435                 if (next != rt0)
436                         fn->rr_ptr = next;
437         }
438
439         RT6_TRACE("%s() => %p\n",
440                   __func__, match);
441
442         net = dev_net(rt0->rt6i_dev);
443         return match ? match : net->ipv6.ip6_null_entry;
444 }
445
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_route_rcv - process a received Route Information option.
 * @dev:    device the option arrived on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router
 *
 * Validates the option, then looks up the matching route-info route.
 * A zero lifetime deletes an existing route; a non-zero lifetime creates
 * the route if missing or refreshes the preference bits of an existing one.
 * RTF_EXPIRES / rt6i_expires are then set from the lifetime.
 *
 * Returns 0 on success and -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/*
	 * Sanity check for prefix_len and length: the option length must be
	 * large enough to hold the advertised prefix.
	 */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length != 3) {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	} else {
		/* A full 128-bit prefix is present; use it in place. */
		prefix = (struct in6_addr *)rinfo->prefix;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* Zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (rt) {
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
	} else if (lifetime) {
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev->ifindex, pref);
	}

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}
	dst_release(&rt->dst);

	return 0;
}
#endif
519
/*
 * BACKTRACK(net, saddr) - climb the FIB tree after a failed selection.
 *
 * This macro is not self-contained.  The enclosing function must provide:
 *   - a variable `rt` (struct rt6_info *) holding the selection result,
 *   - a variable `fn` (struct fib6_node *) holding the current node,
 *   - a label `restart:` that redoes the selection on `fn`,
 *   - a label `out:` that finishes the lookup.
 *
 * If `rt` is the namespace's ip6_null_entry, walk towards the root: at each
 * step either descend into the parent's source subtree (looked up with
 * @saddr) or move to the parent itself.  Jump to `restart` at the first
 * node flagged RTN_RTINFO, or to `out` when the top-level root is reached.
 * If `rt` is anything else the macro does nothing.
 *
 * Arguments are parenthesized so that arbitrary expressions may be passed.
 */
#define BACKTRACK(__net, saddr)			\
do { \
	if (rt == (__net)->ipv6.ip6_null_entry) { \
		struct fib6_node *pn; \
		while (1) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, (saddr)); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
537
538 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
539                                              struct fib6_table *table,
540                                              struct flowi *fl, int flags)
541 {
542         struct fib6_node *fn;
543         struct rt6_info *rt;
544
545         read_lock_bh(&table->tb6_lock);
546         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
547 restart:
548         rt = fn->leaf;
549         rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
550         BACKTRACK(net, &fl->fl6_src);
551 out:
552         dst_use(&rt->dst, jiffies);
553         read_unlock_bh(&table->tb6_lock);
554         return rt;
555
556 }
557
558 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
559                             const struct in6_addr *saddr, int oif, int strict)
560 {
561         struct flowi fl = {
562                 .oif = oif,
563                 .nl_u = {
564                         .ip6_u = {
565                                 .daddr = *daddr,
566                         },
567                 },
568         };
569         struct dst_entry *dst;
570         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
571
572         if (saddr) {
573                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
574                 flags |= RT6_LOOKUP_F_HAS_SADDR;
575         }
576
577         dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
578         if (dst->error == 0)
579                 return (struct rt6_info *) dst;
580
581         dst_release(dst);
582
583         return NULL;
584 }
585
586 EXPORT_SYMBOL(rt6_lookup);
587
588 /* ip6_ins_rt is called with FREE table->tb6_lock.
589    It takes new route entry, the addition fails by any reason the
590    route is freed. In any case, if caller does not hold it, it may
591    be destroyed.
592  */
593
594 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
595 {
596         int err;
597         struct fib6_table *table;
598
599         table = rt->rt6i_table;
600         write_lock_bh(&table->tb6_lock);
601         err = fib6_add(&table->tb6_root, rt, info);
602         write_unlock_bh(&table->tb6_lock);
603
604         return err;
605 }
606
607 int ip6_ins_rt(struct rt6_info *rt)
608 {
609         struct nl_info info = {
610                 .nl_net = dev_net(rt->rt6i_dev),
611         };
612         return __ip6_ins_rt(rt, &info);
613 }
614
615 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
616                                       struct in6_addr *saddr)
617 {
618         struct rt6_info *rt;
619
620         /*
621          *      Clone the route.
622          */
623
624         rt = ip6_rt_copy(ort);
625
626         if (rt) {
627                 struct neighbour *neigh;
628                 int attempts = !in_softirq();
629
630                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
631                         if (rt->rt6i_dst.plen != 128 &&
632                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
633                                 rt->rt6i_flags |= RTF_ANYCAST;
634                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
635                 }
636
637                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
638                 rt->rt6i_dst.plen = 128;
639                 rt->rt6i_flags |= RTF_CACHE;
640                 rt->dst.flags |= DST_HOST;
641
642 #ifdef CONFIG_IPV6_SUBTREES
643                 if (rt->rt6i_src.plen && saddr) {
644                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
645                         rt->rt6i_src.plen = 128;
646                 }
647 #endif
648
649         retry:
650                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
651                 if (IS_ERR(neigh)) {
652                         struct net *net = dev_net(rt->rt6i_dev);
653                         int saved_rt_min_interval =
654                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
655                         int saved_rt_elasticity =
656                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
657
658                         if (attempts-- > 0) {
659                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
660                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
661
662                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
663
664                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
665                                         saved_rt_elasticity;
666                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
667                                         saved_rt_min_interval;
668                                 goto retry;
669                         }
670
671                         if (net_ratelimit())
672                                 printk(KERN_WARNING
673                                        "ipv6: Neighbour table overflow.\n");
674                         dst_free(&rt->dst);
675                         return NULL;
676                 }
677                 rt->rt6i_nexthop = neigh;
678
679         }
680
681         return rt;
682 }
683
684 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
685 {
686         struct rt6_info *rt = ip6_rt_copy(ort);
687         if (rt) {
688                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
689                 rt->rt6i_dst.plen = 128;
690                 rt->rt6i_flags |= RTF_CACHE;
691                 rt->dst.flags |= DST_HOST;
692                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
693         }
694         return rt;
695 }
696
697 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
698                                       struct flowi *fl, int flags)
699 {
700         struct fib6_node *fn;
701         struct rt6_info *rt, *nrt;
702         int strict = 0;
703         int attempts = 3;
704         int err;
705         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
706
707         strict |= flags & RT6_LOOKUP_F_IFACE;
708
709 relookup:
710         read_lock_bh(&table->tb6_lock);
711
712 restart_2:
713         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
714
715 restart:
716         rt = rt6_select(fn, oif, strict | reachable);
717
718         BACKTRACK(net, &fl->fl6_src);
719         if (rt == net->ipv6.ip6_null_entry ||
720             rt->rt6i_flags & RTF_CACHE)
721                 goto out;
722
723         dst_hold(&rt->dst);
724         read_unlock_bh(&table->tb6_lock);
725
726         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
727                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
728         else {
729 #if CLONE_OFFLINK_ROUTE
730                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
731 #else
732                 goto out2;
733 #endif
734         }
735
736         dst_release(&rt->dst);
737         rt = nrt ? : net->ipv6.ip6_null_entry;
738
739         dst_hold(&rt->dst);
740         if (nrt) {
741                 err = ip6_ins_rt(nrt);
742                 if (!err)
743                         goto out2;
744         }
745
746         if (--attempts <= 0)
747                 goto out2;
748
749         /*
750          * Race condition! In the gap, when table->tb6_lock was
751          * released someone could insert this route.  Relookup.
752          */
753         dst_release(&rt->dst);
754         goto relookup;
755
756 out:
757         if (reachable) {
758                 reachable = 0;
759                 goto restart_2;
760         }
761         dst_hold(&rt->dst);
762         read_unlock_bh(&table->tb6_lock);
763 out2:
764         rt->dst.lastuse = jiffies;
765         rt->dst.__use++;
766
767         return rt;
768 }
769
770 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
771                                             struct flowi *fl, int flags)
772 {
773         return ip6_pol_route(net, table, fl->iif, fl, flags);
774 }
775
776 void ip6_route_input(struct sk_buff *skb)
777 {
778         struct ipv6hdr *iph = ipv6_hdr(skb);
779         struct net *net = dev_net(skb->dev);
780         int flags = RT6_LOOKUP_F_HAS_SADDR;
781         struct flowi fl = {
782                 .iif = skb->dev->ifindex,
783                 .nl_u = {
784                         .ip6_u = {
785                                 .daddr = iph->daddr,
786                                 .saddr = iph->saddr,
787                                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
788                         },
789                 },
790                 .mark = skb->mark,
791                 .proto = iph->nexthdr,
792         };
793
794         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
795                 flags |= RT6_LOOKUP_F_IFACE;
796
797         skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
798 }
799
800 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
801                                              struct flowi *fl, int flags)
802 {
803         return ip6_pol_route(net, table, fl->oif, fl, flags);
804 }
805
806 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
807                                     struct flowi *fl)
808 {
809         int flags = 0;
810
811         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
812                 flags |= RT6_LOOKUP_F_IFACE;
813
814         if (!ipv6_addr_any(&fl->fl6_src))
815                 flags |= RT6_LOOKUP_F_HAS_SADDR;
816         else if (sk)
817                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
818
819         return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
820 }
821
822 EXPORT_SYMBOL(ip6_route_output);
823
824 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
825 {
826         struct rt6_info *ort = (struct rt6_info *) *dstp;
827         struct rt6_info *rt = (struct rt6_info *)
828                 dst_alloc(&ip6_dst_blackhole_ops);
829         struct dst_entry *new = NULL;
830
831         if (rt) {
832                 new = &rt->dst;
833
834                 atomic_set(&new->__refcnt, 1);
835                 new->__use = 1;
836                 new->input = dst_discard;
837                 new->output = dst_discard;
838
839                 memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
840                 new->dev = ort->dst.dev;
841                 if (new->dev)
842                         dev_hold(new->dev);
843                 rt->rt6i_idev = ort->rt6i_idev;
844                 if (rt->rt6i_idev)
845                         in6_dev_hold(rt->rt6i_idev);
846                 rt->rt6i_expires = 0;
847
848                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
849                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
850                 rt->rt6i_metric = 0;
851
852                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
853 #ifdef CONFIG_IPV6_SUBTREES
854                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
855 #endif
856
857                 dst_free(new);
858         }
859
860         dst_release(*dstp);
861         *dstp = new;
862         return new ? 0 : -ENOMEM;
863 }
864 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
865
866 /*
867  *      Destination cache support functions
868  */
869
870 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
871 {
872         struct rt6_info *rt;
873
874         rt = (struct rt6_info *) dst;
875
876         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
877                 return dst;
878
879         return NULL;
880 }
881
882 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
883 {
884         struct rt6_info *rt = (struct rt6_info *) dst;
885
886         if (rt) {
887                 if (rt->rt6i_flags & RTF_CACHE) {
888                         if (rt6_check_expired(rt)) {
889                                 ip6_del_rt(rt);
890                                 dst = NULL;
891                         }
892                 } else {
893                         dst_release(dst);
894                         dst = NULL;
895                 }
896         }
897         return dst;
898 }
899
900 static void ip6_link_failure(struct sk_buff *skb)
901 {
902         struct rt6_info *rt;
903
904         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
905
906         rt = (struct rt6_info *) skb_dst(skb);
907         if (rt) {
908                 if (rt->rt6i_flags&RTF_CACHE) {
909                         dst_set_expires(&rt->dst, 0);
910                         rt->rt6i_flags |= RTF_EXPIRES;
911                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
912                         rt->rt6i_node->fn_sernum = -1;
913         }
914 }
915
916 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
917 {
918         struct rt6_info *rt6 = (struct rt6_info*)dst;
919
920         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
921                 rt6->rt6i_flags |= RTF_MODIFIED;
922                 if (mtu < IPV6_MIN_MTU) {
923                         mtu = IPV6_MIN_MTU;
924                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
925                 }
926                 dst->metrics[RTAX_MTU-1] = mtu;
927                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
928         }
929 }
930
931 static int ipv6_get_mtu(struct net_device *dev);
932
933 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
934 {
935         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
936
937         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
938                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
939
940         /*
941          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
942          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
943          * IPV6_MAXPLEN is also valid and means: "any MSS,
944          * rely only on pmtu discovery"
945          */
946         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
947                 mtu = IPV6_MAXPLEN;
948         return mtu;
949 }
950
951 static struct dst_entry *icmp6_dst_gc_list;
952 static DEFINE_SPINLOCK(icmp6_dst_lock);
953
954 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
955                                   struct neighbour *neigh,
956                                   const struct in6_addr *addr)
957 {
958         struct rt6_info *rt;
959         struct inet6_dev *idev = in6_dev_get(dev);
960         struct net *net = dev_net(dev);
961
962         if (unlikely(idev == NULL))
963                 return NULL;
964
965         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
966         if (unlikely(rt == NULL)) {
967                 in6_dev_put(idev);
968                 goto out;
969         }
970
971         dev_hold(dev);
972         if (neigh)
973                 neigh_hold(neigh);
974         else {
975                 neigh = ndisc_get_neigh(dev, addr);
976                 if (IS_ERR(neigh))
977                         neigh = NULL;
978         }
979
980         rt->rt6i_dev      = dev;
981         rt->rt6i_idev     = idev;
982         rt->rt6i_nexthop  = neigh;
983         atomic_set(&rt->dst.__refcnt, 1);
984         rt->dst.metrics[RTAX_HOPLIMIT-1] = 255;
985         rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
986         rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
987         rt->dst.output  = ip6_output;
988
989 #if 0   /* there's no chance to use these for ndisc */
990         rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
991                                 ? DST_HOST
992                                 : 0;
993         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
994         rt->rt6i_dst.plen = 128;
995 #endif
996
997         spin_lock_bh(&icmp6_dst_lock);
998         rt->dst.next = icmp6_dst_gc_list;
999         icmp6_dst_gc_list = &rt->dst;
1000         spin_unlock_bh(&icmp6_dst_lock);
1001
1002         fib6_force_start_gc(net);
1003
1004 out:
1005         return &rt->dst;
1006 }
1007
1008 int icmp6_dst_gc(void)
1009 {
1010         struct dst_entry *dst, *next, **pprev;
1011         int more = 0;
1012
1013         next = NULL;
1014
1015         spin_lock_bh(&icmp6_dst_lock);
1016         pprev = &icmp6_dst_gc_list;
1017
1018         while ((dst = *pprev) != NULL) {
1019                 if (!atomic_read(&dst->__refcnt)) {
1020                         *pprev = dst->next;
1021                         dst_free(dst);
1022                 } else {
1023                         pprev = &dst->next;
1024                         ++more;
1025                 }
1026         }
1027
1028         spin_unlock_bh(&icmp6_dst_lock);
1029
1030         return more;
1031 }
1032
1033 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1034                             void *arg)
1035 {
1036         struct dst_entry *dst, **pprev;
1037
1038         spin_lock_bh(&icmp6_dst_lock);
1039         pprev = &icmp6_dst_gc_list;
1040         while ((dst = *pprev) != NULL) {
1041                 struct rt6_info *rt = (struct rt6_info *) dst;
1042                 if (func(rt, arg)) {
1043                         *pprev = dst->next;
1044                         dst_free(dst);
1045                 } else {
1046                         pprev = &dst->next;
1047                 }
1048         }
1049         spin_unlock_bh(&icmp6_dst_lock);
1050 }
1051
1052 static int ip6_dst_gc(struct dst_ops *ops)
1053 {
1054         unsigned long now = jiffies;
1055         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1056         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1057         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1058         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1059         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1060         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1061
1062         if (time_after(rt_last_gc + rt_min_interval, now) &&
1063             atomic_read(&ops->entries) <= rt_max_size)
1064                 goto out;
1065
1066         net->ipv6.ip6_rt_gc_expire++;
1067         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1068         net->ipv6.ip6_rt_last_gc = now;
1069         if (atomic_read(&ops->entries) < ops->gc_thresh)
1070                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1071 out:
1072         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1073         return atomic_read(&ops->entries) > rt_max_size;
1074 }
1075
1076 /* Clean host part of a prefix. Not necessary in radix tree,
1077    but results in cleaner routing tables.
1078
1079    Remove it only when all the things will work!
1080  */
1081
1082 static int ipv6_get_mtu(struct net_device *dev)
1083 {
1084         int mtu = IPV6_MIN_MTU;
1085         struct inet6_dev *idev;
1086
1087         rcu_read_lock();
1088         idev = __in6_dev_get(dev);
1089         if (idev)
1090                 mtu = idev->cnf.mtu6;
1091         rcu_read_unlock();
1092         return mtu;
1093 }
1094
1095 int ip6_dst_hoplimit(struct dst_entry *dst)
1096 {
1097         int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1098         if (hoplimit < 0) {
1099                 struct net_device *dev = dst->dev;
1100                 struct inet6_dev *idev;
1101
1102                 rcu_read_lock();
1103                 idev = __in6_dev_get(dev);
1104                 if (idev)
1105                         hoplimit = idev->cnf.hop_limit;
1106                 else
1107                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1108                 rcu_read_unlock();
1109         }
1110         return hoplimit;
1111 }
1112
1113 /*
1114  *
1115  */
1116
/*
 * ip6_route_add - build a route from @cfg and insert it into its table.
 *
 * Steps, in order:
 *  - reject prefix lengths above 128 (and any source prefix unless
 *    CONFIG_IPV6_SUBTREES is set);
 *  - if fc_ifindex is given, take a reference on that device and on
 *    its inet6_dev;
 *  - default fc_metric to IP6_RT_PRIO_USER and fc_protocol to
 *    RTPROT_BOOT when unset (note: @cfg itself is modified);
 *  - allocate the rt6_info and pick input/output handlers;
 *  - RTF_REJECT routes, and non-local routes through a loopback
 *    device to a non-loopback destination, become reject routes bound
 *    to the namespace loopback device;
 *  - for RTF_GATEWAY routes validate the gateway and, when no device
 *    was given, borrow the device of the route leading to the gateway;
 *  - apply metrics from fc_mx, fill in default hop limit / MTU / advmss;
 *  - hand the route to __ip6_ins_rt().
 *
 * Returns the result of __ip6_ins_rt() when the route was fully built,
 * or a negative errno (-EINVAL, -ENODEV, -ENOBUFS, -ENOMEM,
 * -EHOSTUNREACH, or the neighbour lookup error) after dropping every
 * reference taken so far and freeing the half-built route.
 */
1117 int ip6_route_add(struct fib6_config *cfg)
1118 {
1119         int err;
1120         struct net *net = cfg->fc_nlinfo.nl_net;
1121         struct rt6_info *rt = NULL;
1122         struct net_device *dev = NULL;
1123         struct inet6_dev *idev = NULL;
1124         struct fib6_table *table;
1125         int addr_type;
1126
1127         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1128                 return -EINVAL;
1129 #ifndef CONFIG_IPV6_SUBTREES
1130         if (cfg->fc_src_len)
1131                 return -EINVAL;
1132 #endif
             /* Pin the requested device and its inet6_dev; the out: path drops both. */
1133         if (cfg->fc_ifindex) {
1134                 err = -ENODEV;
1135                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1136                 if (!dev)
1137                         goto out;
1138                 idev = in6_dev_get(dev);
1139                 if (!idev)
1140                         goto out;
1141         }
1142
1143         if (cfg->fc_metric == 0)
1144                 cfg->fc_metric = IP6_RT_PRIO_USER;
1145
1146         table = fib6_new_table(net, cfg->fc_table);
1147         if (table == NULL) {
1148                 err = -ENOBUFS;
1149                 goto out;
1150         }
1151
1152         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1153
1154         if (rt == NULL) {
1155                 err = -ENOMEM;
1156                 goto out;
1157         }
1158
1159         rt->dst.obsolete = -1;
             /* fc_expires is in clock_t units; 0 is stored when RTF_EXPIRES is not set. */
1160         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1161                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1162                                 0;
1163
1164         if (cfg->fc_protocol == RTPROT_UNSPEC)
1165                 cfg->fc_protocol = RTPROT_BOOT;
1166         rt->rt6i_protocol = cfg->fc_protocol;
1167
1168         addr_type = ipv6_addr_type(&cfg->fc_dst);
1169
             /* Input handler: multicast, local delivery, or forwarding. */
1170         if (addr_type & IPV6_ADDR_MULTICAST)
1171                 rt->dst.input = ip6_mc_input;
1172         else if (cfg->fc_flags & RTF_LOCAL)
1173                 rt->dst.input = ip6_input;
1174         else
1175                 rt->dst.input = ip6_forward;
1176
1177         rt->dst.output = ip6_output;
1178
1179         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1180         rt->rt6i_dst.plen = cfg->fc_dst_len;
1181         if (rt->rt6i_dst.plen == 128)
1182                rt->dst.flags = DST_HOST;
1183
1184 #ifdef CONFIG_IPV6_SUBTREES
1185         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1186         rt->rt6i_src.plen = cfg->fc_src_len;
1187 #endif
1188
1189         rt->rt6i_metric = cfg->fc_metric;
1190
1191         /* We cannot add true routes via loopback here,
1192            they would result in kernel looping; promote them to reject routes
1193          */
1194         if ((cfg->fc_flags & RTF_REJECT) ||
1195             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1196                                               && !(cfg->fc_flags&RTF_LOCAL))) {
1197                 /* hold loopback dev/idev if we haven't done so. */
1198                 if (dev != net->loopback_dev) {
1199                         if (dev) {
1200                                 dev_put(dev);
1201                                 in6_dev_put(idev);
1202                         }
1203                         dev = net->loopback_dev;
1204                         dev_hold(dev);
1205                         idev = in6_dev_get(dev);
1206                         if (!idev) {
1207                                 err = -ENODEV;
1208                                 goto out;
1209                         }
1210                 }
1211                 rt->dst.output = ip6_pkt_discard_out;
1212                 rt->dst.input = ip6_pkt_discard;
1213                 rt->dst.error = -ENETUNREACH;
                     /* The caller's fc_flags are not copied for reject routes. */
1214                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1215                 goto install_route;
1216         }
1217
1218         if (cfg->fc_flags & RTF_GATEWAY) {
1219                 struct in6_addr *gw_addr;
1220                 int gwa_type;
1221
1222                 gw_addr = &cfg->fc_gateway;
1223                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1224                 gwa_type = ipv6_addr_type(gw_addr);
1225
1226                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1227                         struct rt6_info *grt;
1228
1229                         /* IPv6 strictly inhibits using not link-local
1230                            addresses as nexthop address.
1231                            Otherwise, router will not able to send redirects.
1232                            It is very good, but in some (rare!) circumstances
1233                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1234                            some exceptions. --ANK
1235                          */
1236                         err = -EINVAL;
1237                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1238                                 goto out;
1239
1240                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1241
                             /*
                              * err stays -EHOSTUNREACH from here on unless the
                              * route to the gateway turns out not to be a
                              * gateway route itself (checked below).
                              */
1242                         err = -EHOSTUNREACH;
1243                         if (grt == NULL)
1244                                 goto out;
1245                         if (dev) {
1246                                 if (dev != grt->rt6i_dev) {
1247                                         dst_release(&grt->dst);
1248                                         goto out;
1249                                 }
1250                         } else {
                                     /* No device requested: adopt the one the gateway is reached through. */
1251                                 dev = grt->rt6i_dev;
1252                                 idev = grt->rt6i_idev;
1253                                 dev_hold(dev);
1254                                 in6_dev_hold(grt->rt6i_idev);
1255                         }
1256                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1257                                 err = 0;
1258                         dst_release(&grt->dst);
1259
1260                         if (err)
1261                                 goto out;
1262                 }
1263                 err = -EINVAL;
1264                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1265                         goto out;
1266         }
1267
1268         err = -ENODEV;
1269         if (dev == NULL)
1270                 goto out;
1271
1272         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1273                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1274                 if (IS_ERR(rt->rt6i_nexthop)) {
1275                         err = PTR_ERR(rt->rt6i_nexthop);
                             /* Do not leave an ERR_PTR in the route that is about to be freed. */
1276                         rt->rt6i_nexthop = NULL;
1277                         goto out;
1278                 }
1279         }
1280
1281         rt->rt6i_flags = cfg->fc_flags;
1282
1283 install_route:
             /* Copy metrics from the netlink attributes; attribute type N maps to metrics[N-1]. */
1284         if (cfg->fc_mx) {
1285                 struct nlattr *nla;
1286                 int remaining;
1287
1288                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1289                         int type = nla_type(nla);
1290
1291                         if (type) {
1292                                 if (type > RTAX_MAX) {
1293                                         err = -EINVAL;
1294                                         goto out;
1295                                 }
1296
1297                                 rt->dst.metrics[type - 1] = nla_get_u32(nla);
1298                         }
1299                 }
1300         }
1301
             /*
              * Defaults for metrics left at 0: hop limit becomes -1 (the
              * value ip6_dst_hoplimit() treats as "take it from the device"),
              * MTU comes from the device, advmss is derived from the MTU.
              */
1302         if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
1303                 rt->dst.metrics[RTAX_HOPLIMIT-1] = -1;
1304         if (!dst_mtu(&rt->dst))
1305                 rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1306         if (!dst_metric(&rt->dst, RTAX_ADVMSS))
1307                 rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
             /* The dev/idev references taken above are stored in rt and not dropped here. */
1308         rt->dst.dev = dev;
1309         rt->rt6i_idev = idev;
1310         rt->rt6i_table = table;
1311
1312         cfg->fc_nlinfo.nl_net = dev_net(dev);
1313
1314         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1315
             /* Error exit: release whatever was acquired before the failure. */
1316 out:
1317         if (dev)
1318                 dev_put(dev);
1319         if (idev)
1320                 in6_dev_put(idev);
1321         if (rt)
1322                 dst_free(&rt->dst);
1323         return err;
1324 }
1325
1326 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1327 {
1328         int err;
1329         struct fib6_table *table;
1330         struct net *net = dev_net(rt->rt6i_dev);
1331
1332         if (rt == net->ipv6.ip6_null_entry)
1333                 return -ENOENT;
1334
1335         table = rt->rt6i_table;
1336         write_lock_bh(&table->tb6_lock);
1337
1338         err = fib6_del(rt, info);
1339         dst_release(&rt->dst);
1340
1341         write_unlock_bh(&table->tb6_lock);
1342
1343         return err;
1344 }
1345
1346 int ip6_del_rt(struct rt6_info *rt)
1347 {
1348         struct nl_info info = {
1349                 .nl_net = dev_net(rt->rt6i_dev),
1350         };
1351         return __ip6_del_rt(rt, &info);
1352 }
1353
1354 static int ip6_route_del(struct fib6_config *cfg)
1355 {
1356         struct fib6_table *table;
1357         struct fib6_node *fn;
1358         struct rt6_info *rt;
1359         int err = -ESRCH;
1360
1361         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1362         if (table == NULL)
1363                 return err;
1364
1365         read_lock_bh(&table->tb6_lock);
1366
1367         fn = fib6_locate(&table->tb6_root,
1368                          &cfg->fc_dst, cfg->fc_dst_len,
1369                          &cfg->fc_src, cfg->fc_src_len);
1370
1371         if (fn) {
1372                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1373                         if (cfg->fc_ifindex &&
1374                             (rt->rt6i_dev == NULL ||
1375                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1376                                 continue;
1377                         if (cfg->fc_flags & RTF_GATEWAY &&
1378                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1379                                 continue;
1380                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1381                                 continue;
1382                         dst_hold(&rt->dst);
1383                         read_unlock_bh(&table->tb6_lock);
1384
1385                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1386                 }
1387         }
1388         read_unlock_bh(&table->tb6_lock);
1389
1390         return err;
1391 }
1392
1393 /*
1394  *      Handle redirects
1395  */
/*
 * Flow key used for redirect lookups: a regular flowi extended with
 * the gateway address to match.
 *
 * The lookup code in this file passes a pointer to this struct cast to
 * struct flowi * and casts it back in the callback, so @fl has to
 * remain the first member.
 */
1396 struct ip6rd_flowi {
             /* Generic flow key; must be first (see above). */
1397         struct flowi fl;
             /* Gateway a candidate route's rt6i_gateway is compared against. */
1398         struct in6_addr gateway;
1399 };
1400
1401 static struct rt6_info *__ip6_route_redirect(struct net *net,
1402                                              struct fib6_table *table,
1403                                              struct flowi *fl,
1404                                              int flags)
1405 {
1406         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1407         struct rt6_info *rt;
1408         struct fib6_node *fn;
1409
1410         /*
1411          * Get the "current" route for this destination and
1412          * check if the redirect has come from approriate router.
1413          *
1414          * RFC 2461 specifies that redirects should only be
1415          * accepted if they come from the nexthop to the target.
1416          * Due to the way the routes are chosen, this notion
1417          * is a bit fuzzy and one might need to check all possible
1418          * routes.
1419          */
1420
1421         read_lock_bh(&table->tb6_lock);
1422         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1423 restart:
1424         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1425                 /*
1426                  * Current route is on-link; redirect is always invalid.
1427                  *
1428                  * Seems, previous statement is not true. It could
1429                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1430                  * But then router serving it might decide, that we should
1431                  * know truth 8)8) --ANK (980726).
1432                  */
1433                 if (rt6_check_expired(rt))
1434                         continue;
1435                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1436                         continue;
1437                 if (fl->oif != rt->rt6i_dev->ifindex)
1438                         continue;
1439                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1440                         continue;
1441                 break;
1442         }
1443
1444         if (!rt)
1445                 rt = net->ipv6.ip6_null_entry;
1446         BACKTRACK(net, &fl->fl6_src);
1447 out:
1448         dst_hold(&rt->dst);
1449
1450         read_unlock_bh(&table->tb6_lock);
1451
1452         return rt;
1453 };
1454
1455 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1456                                            struct in6_addr *src,
1457                                            struct in6_addr *gateway,
1458                                            struct net_device *dev)
1459 {
1460         int flags = RT6_LOOKUP_F_HAS_SADDR;
1461         struct net *net = dev_net(dev);
1462         struct ip6rd_flowi rdfl = {
1463                 .fl = {
1464                         .oif = dev->ifindex,
1465                         .nl_u = {
1466                                 .ip6_u = {
1467                                         .daddr = *dest,
1468                                         .saddr = *src,
1469                                 },
1470                         },
1471                 },
1472         };
1473
1474         ipv6_addr_copy(&rdfl.gateway, gateway);
1475
1476         if (rt6_need_strict(dest))
1477                 flags |= RT6_LOOKUP_F_IFACE;
1478
1479         return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1480                                                    flags, __ip6_route_redirect);
1481 }
1482
1483 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1484                   struct in6_addr *saddr,
1485                   struct neighbour *neigh, u8 *lladdr, int on_link)
1486 {
1487         struct rt6_info *rt, *nrt = NULL;
1488         struct netevent_redirect netevent;
1489         struct net *net = dev_net(neigh->dev);
1490
1491         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1492
1493         if (rt == net->ipv6.ip6_null_entry) {
1494                 if (net_ratelimit())
1495                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1496                                "for redirect target\n");
1497                 goto out;
1498         }
1499
1500         /*
1501          *      We have finally decided to accept it.
1502          */
1503
1504         neigh_update(neigh, lladdr, NUD_STALE,
1505                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1506                      NEIGH_UPDATE_F_OVERRIDE|
1507                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1508                                      NEIGH_UPDATE_F_ISROUTER))
1509                      );
1510
1511         /*
1512          * Redirect received -> path was valid.
1513          * Look, redirects are sent only in response to data packets,
1514          * so that this nexthop apparently is reachable. --ANK
1515          */
1516         dst_confirm(&rt->dst);
1517
1518         /* Duplicate redirect: silently ignore. */
1519         if (neigh == rt->dst.neighbour)
1520                 goto out;
1521
1522         nrt = ip6_rt_copy(rt);
1523         if (nrt == NULL)
1524                 goto out;
1525
1526         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1527         if (on_link)
1528                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1529
1530         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1531         nrt->rt6i_dst.plen = 128;
1532         nrt->dst.flags |= DST_HOST;
1533
1534         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1535         nrt->rt6i_nexthop = neigh_clone(neigh);
1536         /* Reset pmtu, it may be better */
1537         nrt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1538         nrt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1539                                                         dst_mtu(&nrt->dst));
1540
1541         if (ip6_ins_rt(nrt))
1542                 goto out;
1543
1544         netevent.old = &rt->dst;
1545         netevent.new = &nrt->dst;
1546         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1547
1548         if (rt->rt6i_flags&RTF_CACHE) {
1549                 ip6_del_rt(rt);
1550                 return;
1551         }
1552
1553 out:
1554         dst_release(&rt->dst);
1555 }
1556
1557 /*
1558  *      Handle ICMP "packet too big" messages
1559  *      i.e. Path MTU discovery
1560  */
1561
1562 static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1563                              struct net *net, u32 pmtu, int ifindex)
1564 {
1565         struct rt6_info *rt, *nrt;
1566         int allfrag = 0;
1567
1568         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1569         if (rt == NULL)
1570                 return;
1571
1572         if (pmtu >= dst_mtu(&rt->dst))
1573                 goto out;
1574
1575         if (pmtu < IPV6_MIN_MTU) {
1576                 /*
1577                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1578                  * MTU (1280) and a fragment header should always be included
1579                  * after a node receiving Too Big message reporting PMTU is
1580                  * less than the IPv6 Minimum Link MTU.
1581                  */
1582                 pmtu = IPV6_MIN_MTU;
1583                 allfrag = 1;
1584         }
1585
1586         /* New mtu received -> path was valid.
1587            They are sent only in response to data packets,
1588            so that this nexthop apparently is reachable. --ANK
1589          */
1590         dst_confirm(&rt->dst);
1591
1592         /* Host route. If it is static, it would be better
1593            not to override it, but add new one, so that
1594            when cache entry will expire old pmtu
1595            would return automatically.
1596          */
1597         if (rt->rt6i_flags & RTF_CACHE) {
1598                 rt->dst.metrics[RTAX_MTU-1] = pmtu;
1599                 if (allfrag)
1600                         rt->dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1601                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1602                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1603                 goto out;
1604         }
1605
1606         /* Network route.
1607            Two cases are possible:
1608            1. It is connected route. Action: COW
1609            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1610          */
1611         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1612                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1613         else
1614                 nrt = rt6_alloc_clone(rt, daddr);
1615
1616         if (nrt) {
1617                 nrt->dst.metrics[RTAX_MTU-1] = pmtu;
1618                 if (allfrag)
1619                         nrt->dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1620
1621                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1622                  * happened within 5 mins, the recommended timer is 10 mins.
1623                  * Here this route expiration time is set to ip6_rt_mtu_expires
1624                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1625                  * and detecting PMTU increase will be automatically happened.
1626                  */
1627                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1628                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1629
1630                 ip6_ins_rt(nrt);
1631         }
1632 out:
1633         dst_release(&rt->dst);
1634 }
1635
1636 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1637                         struct net_device *dev, u32 pmtu)
1638 {
1639         struct net *net = dev_net(dev);
1640
1641         /*
1642          * RFC 1981 states that a node "MUST reduce the size of the packets it
1643          * is sending along the path" that caused the Packet Too Big message.
1644          * Since it's not possible in the general case to determine which
1645          * interface was used to send the original packet, we update the MTU
1646          * on the interface that will be used to send future packets. We also
1647          * update the MTU on the interface that received the Packet Too Big in
1648          * case the original packet was forced out that interface with
1649          * SO_BINDTODEVICE or similar. This is the next best thing to the
1650          * correct behaviour, which would be to update the MTU on all
1651          * interfaces.
1652          */
1653         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1654         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1655 }
1656
1657 /*
1658  *      Misc support functions
1659  */
1660
1661 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1662 {
1663         struct net *net = dev_net(ort->rt6i_dev);
1664         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1665
1666         if (rt) {
1667                 rt->dst.input = ort->dst.input;
1668                 rt->dst.output = ort->dst.output;
1669
1670                 memcpy(rt->dst.metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
1671                 rt->dst.error = ort->dst.error;
1672                 rt->dst.dev = ort->dst.dev;
1673                 if (rt->dst.dev)
1674                         dev_hold(rt->dst.dev);
1675                 rt->rt6i_idev = ort->rt6i_idev;
1676                 if (rt->rt6i_idev)
1677                         in6_dev_hold(rt->rt6i_idev);
1678                 rt->dst.lastuse = jiffies;
1679                 rt->rt6i_expires = 0;
1680
1681                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1682                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1683                 rt->rt6i_metric = 0;
1684
1685                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1686 #ifdef CONFIG_IPV6_SUBTREES
1687                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1688 #endif
1689                 rt->rt6i_table = ort->rt6i_table;
1690         }
1691         return rt;
1692 }
1693
1694 #ifdef CONFIG_IPV6_ROUTE_INFO
1695 static struct rt6_info *rt6_get_route_info(struct net *net,
1696                                            struct in6_addr *prefix, int prefixlen,
1697                                            struct in6_addr *gwaddr, int ifindex)
1698 {
1699         struct fib6_node *fn;
1700         struct rt6_info *rt = NULL;
1701         struct fib6_table *table;
1702
1703         table = fib6_get_table(net, RT6_TABLE_INFO);
1704         if (table == NULL)
1705                 return NULL;
1706
1707         write_lock_bh(&table->tb6_lock);
1708         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1709         if (!fn)
1710                 goto out;
1711
1712         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1713                 if (rt->rt6i_dev->ifindex != ifindex)
1714                         continue;
1715                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1716                         continue;
1717                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1718                         continue;
1719                 dst_hold(&rt->dst);
1720                 break;
1721         }
1722 out:
1723         write_unlock_bh(&table->tb6_lock);
1724         return rt;
1725 }
1726
1727 static struct rt6_info *rt6_add_route_info(struct net *net,
1728                                            struct in6_addr *prefix, int prefixlen,
1729                                            struct in6_addr *gwaddr, int ifindex,
1730                                            unsigned pref)
1731 {
1732         struct fib6_config cfg = {
1733                 .fc_table       = RT6_TABLE_INFO,
1734                 .fc_metric      = IP6_RT_PRIO_USER,
1735                 .fc_ifindex     = ifindex,
1736                 .fc_dst_len     = prefixlen,
1737                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1738                                   RTF_UP | RTF_PREF(pref),
1739                 .fc_nlinfo.pid = 0,
1740                 .fc_nlinfo.nlh = NULL,
1741                 .fc_nlinfo.nl_net = net,
1742         };
1743
1744         ipv6_addr_copy(&cfg.fc_dst, prefix);
1745         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1746
1747         /* We should treat it as a default route if prefix length is 0. */
1748         if (!prefixlen)
1749                 cfg.fc_flags |= RTF_DEFAULT;
1750
1751         ip6_route_add(&cfg);
1752
1753         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1754 }
1755 #endif
1756
1757 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1758 {
1759         struct rt6_info *rt;
1760         struct fib6_table *table;
1761
1762         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1763         if (table == NULL)
1764                 return NULL;
1765
1766         write_lock_bh(&table->tb6_lock);
1767         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1768                 if (dev == rt->rt6i_dev &&
1769                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1770                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1771                         break;
1772         }
1773         if (rt)
1774                 dst_hold(&rt->dst);
1775         write_unlock_bh(&table->tb6_lock);
1776         return rt;
1777 }
1778
1779 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1780                                      struct net_device *dev,
1781                                      unsigned int pref)
1782 {
1783         struct fib6_config cfg = {
1784                 .fc_table       = RT6_TABLE_DFLT,
1785                 .fc_metric      = IP6_RT_PRIO_USER,
1786                 .fc_ifindex     = dev->ifindex,
1787                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1788                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1789                 .fc_nlinfo.pid = 0,
1790                 .fc_nlinfo.nlh = NULL,
1791                 .fc_nlinfo.nl_net = dev_net(dev),
1792         };
1793
1794         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1795
1796         ip6_route_add(&cfg);
1797
1798         return rt6_get_dflt_router(gwaddr, dev);
1799 }
1800
1801 void rt6_purge_dflt_routers(struct net *net)
1802 {
1803         struct rt6_info *rt;
1804         struct fib6_table *table;
1805
1806         /* NOTE: Keep consistent with rt6_get_dflt_router */
1807         table = fib6_get_table(net, RT6_TABLE_DFLT);
1808         if (table == NULL)
1809                 return;
1810
1811 restart:
1812         read_lock_bh(&table->tb6_lock);
1813         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1814                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1815                         dst_hold(&rt->dst);
1816                         read_unlock_bh(&table->tb6_lock);
1817                         ip6_del_rt(rt);
1818                         goto restart;
1819                 }
1820         }
1821         read_unlock_bh(&table->tb6_lock);
1822 }
1823
1824 static void rtmsg_to_fib6_config(struct net *net,
1825                                  struct in6_rtmsg *rtmsg,
1826                                  struct fib6_config *cfg)
1827 {
1828         memset(cfg, 0, sizeof(*cfg));
1829
1830         cfg->fc_table = RT6_TABLE_MAIN;
1831         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1832         cfg->fc_metric = rtmsg->rtmsg_metric;
1833         cfg->fc_expires = rtmsg->rtmsg_info;
1834         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1835         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1836         cfg->fc_flags = rtmsg->rtmsg_flags;
1837
1838         cfg->fc_nlinfo.nl_net = net;
1839
1840         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1841         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1842         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1843 }
1844
1845 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1846 {
1847         struct fib6_config cfg;
1848         struct in6_rtmsg rtmsg;
1849         int err;
1850
1851         switch(cmd) {
1852         case SIOCADDRT:         /* Add a route */
1853         case SIOCDELRT:         /* Delete a route */
1854                 if (!capable(CAP_NET_ADMIN))
1855                         return -EPERM;
1856                 err = copy_from_user(&rtmsg, arg,
1857                                      sizeof(struct in6_rtmsg));
1858                 if (err)
1859                         return -EFAULT;
1860
1861                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1862
1863                 rtnl_lock();
1864                 switch (cmd) {
1865                 case SIOCADDRT:
1866                         err = ip6_route_add(&cfg);
1867                         break;
1868                 case SIOCDELRT:
1869                         err = ip6_route_del(&cfg);
1870                         break;
1871                 default:
1872                         err = -EINVAL;
1873                 }
1874                 rtnl_unlock();
1875
1876                 return err;
1877         }
1878
1879         return -EINVAL;
1880 }
1881
1882 /*
1883  *      Drop the packet on the floor
1884  */
1885
1886 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1887 {
1888         int type;
1889         struct dst_entry *dst = skb_dst(skb);
1890         switch (ipstats_mib_noroutes) {
1891         case IPSTATS_MIB_INNOROUTES:
1892                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1893                 if (type == IPV6_ADDR_ANY) {
1894                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1895                                       IPSTATS_MIB_INADDRERRORS);
1896                         break;
1897                 }
1898                 /* FALLTHROUGH */
1899         case IPSTATS_MIB_OUTNOROUTES:
1900                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1901                               ipstats_mib_noroutes);
1902                 break;
1903         }
1904         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1905         kfree_skb(skb);
1906         return 0;
1907 }
1908
1909 static int ip6_pkt_discard(struct sk_buff *skb)
1910 {
1911         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1912 }
1913
1914 static int ip6_pkt_discard_out(struct sk_buff *skb)
1915 {
1916         skb->dev = skb_dst(skb)->dev;
1917         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1918 }
1919
1920 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1921
1922 static int ip6_pkt_prohibit(struct sk_buff *skb)
1923 {
1924         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1925 }
1926
1927 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1928 {
1929         skb->dev = skb_dst(skb)->dev;
1930         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1931 }
1932
1933 #endif
1934
1935 /*
1936  *      Allocate a dst for local (unicast / anycast) address.
1937  */
1938
1939 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1940                                     const struct in6_addr *addr,
1941                                     int anycast)
1942 {
1943         struct net *net = dev_net(idev->dev);
1944         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1945         struct neighbour *neigh;
1946
1947         if (rt == NULL)
1948                 return ERR_PTR(-ENOMEM);
1949
1950         dev_hold(net->loopback_dev);
1951         in6_dev_hold(idev);
1952
1953         rt->dst.flags = DST_HOST;
1954         rt->dst.input = ip6_input;
1955         rt->dst.output = ip6_output;
1956         rt->rt6i_dev = net->loopback_dev;
1957         rt->rt6i_idev = idev;
1958         rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1959         rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
1960         rt->dst.metrics[RTAX_HOPLIMIT-1] = -1;
1961         rt->dst.obsolete = -1;
1962
1963         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1964         if (anycast)
1965                 rt->rt6i_flags |= RTF_ANYCAST;
1966         else
1967                 rt->rt6i_flags |= RTF_LOCAL;
1968         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1969         if (IS_ERR(neigh)) {
1970                 dst_free(&rt->dst);
1971
1972                 /* We are casting this because that is the return
1973                  * value type.  But an errno encoded pointer is the
1974                  * same regardless of the underlying pointer type,
1975                  * and that's what we are returning.  So this is OK.
1976                  */
1977                 return (struct rt6_info *) neigh;
1978         }
1979         rt->rt6i_nexthop = neigh;
1980
1981         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1982         rt->rt6i_dst.plen = 128;
1983         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1984
1985         atomic_set(&rt->dst.__refcnt, 1);
1986
1987         return rt;
1988 }
1989
1990 struct arg_dev_net {
1991         struct net_device *dev;
1992         struct net *net;
1993 };
1994
1995 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1996 {
1997         struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1998         struct net *net = ((struct arg_dev_net *)arg)->net;
1999
2000         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2001             rt != net->ipv6.ip6_null_entry) {
2002                 RT6_TRACE("deleted by ifdown %p\n", rt);
2003                 return -1;
2004         }
2005         return 0;
2006 }
2007
2008 void rt6_ifdown(struct net *net, struct net_device *dev)
2009 {
2010         struct arg_dev_net adn = {
2011                 .dev = dev,
2012                 .net = net,
2013         };
2014
2015         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2016         icmp6_clean_all(fib6_ifdown, &adn);
2017 }
2018
2019 struct rt6_mtu_change_arg
2020 {
2021         struct net_device *dev;
2022         unsigned mtu;
2023 };
2024
2025 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2026 {
2027         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2028         struct inet6_dev *idev;
2029         struct net *net = dev_net(arg->dev);
2030
2031         /* In IPv6 pmtu discovery is not optional,
2032            so that RTAX_MTU lock cannot disable it.
2033            We still use this lock to block changes
2034            caused by addrconf/ndisc.
2035         */
2036
2037         idev = __in6_dev_get(arg->dev);
2038         if (idev == NULL)
2039                 return 0;
2040
2041         /* For administrative MTU increase, there is no way to discover
2042            IPv6 PMTU increase, so PMTU increase should be updated here.
2043            Since RFC 1981 doesn't include administrative MTU increase
2044            update PMTU increase is a MUST. (i.e. jumbo frame)
2045          */
2046         /*
2047            If new MTU is less than route PMTU, this new MTU will be the
2048            lowest MTU in the path, update the route PMTU to reflect PMTU
2049            decreases; if new MTU is greater than route PMTU, and the
2050            old MTU is the lowest MTU in the path, update the route PMTU
2051            to reflect the increase. In this case if the other nodes' MTU
2052            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2053            PMTU discouvery.
2054          */
2055         if (rt->rt6i_dev == arg->dev &&
2056             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2057             (dst_mtu(&rt->dst) >= arg->mtu ||
2058              (dst_mtu(&rt->dst) < arg->mtu &&
2059               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2060                 rt->dst.metrics[RTAX_MTU-1] = arg->mtu;
2061                 rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
2062         }
2063         return 0;
2064 }
2065
2066 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2067 {
2068         struct rt6_mtu_change_arg arg = {
2069                 .dev = dev,
2070                 .mtu = mtu,
2071         };
2072
2073         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2074 }
2075
2076 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2077         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2078         [RTA_OIF]               = { .type = NLA_U32 },
2079         [RTA_IIF]               = { .type = NLA_U32 },
2080         [RTA_PRIORITY]          = { .type = NLA_U32 },
2081         [RTA_METRICS]           = { .type = NLA_NESTED },
2082 };
2083
2084 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2085                               struct fib6_config *cfg)
2086 {
2087         struct rtmsg *rtm;
2088         struct nlattr *tb[RTA_MAX+1];
2089         int err;
2090
2091         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2092         if (err < 0)
2093                 goto errout;
2094
2095         err = -EINVAL;
2096         rtm = nlmsg_data(nlh);
2097         memset(cfg, 0, sizeof(*cfg));
2098
2099         cfg->fc_table = rtm->rtm_table;
2100         cfg->fc_dst_len = rtm->rtm_dst_len;
2101         cfg->fc_src_len = rtm->rtm_src_len;
2102         cfg->fc_flags = RTF_UP;
2103         cfg->fc_protocol = rtm->rtm_protocol;
2104
2105         if (rtm->rtm_type == RTN_UNREACHABLE)
2106                 cfg->fc_flags |= RTF_REJECT;
2107
2108         if (rtm->rtm_type == RTN_LOCAL)
2109                 cfg->fc_flags |= RTF_LOCAL;
2110
2111         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2112         cfg->fc_nlinfo.nlh = nlh;
2113         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2114
2115         if (tb[RTA_GATEWAY]) {
2116                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2117                 cfg->fc_flags |= RTF_GATEWAY;
2118         }
2119
2120         if (tb[RTA_DST]) {
2121                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2122
2123                 if (nla_len(tb[RTA_DST]) < plen)
2124                         goto errout;
2125
2126                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2127         }
2128
2129         if (tb[RTA_SRC]) {
2130                 int plen = (rtm->rtm_src_len + 7) >> 3;
2131
2132                 if (nla_len(tb[RTA_SRC]) < plen)
2133                         goto errout;
2134
2135                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2136         }
2137
2138         if (tb[RTA_OIF])
2139                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2140
2141         if (tb[RTA_PRIORITY])
2142                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2143
2144         if (tb[RTA_METRICS]) {
2145                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2146                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2147         }
2148
2149         if (tb[RTA_TABLE])
2150                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2151
2152         err = 0;
2153 errout:
2154         return err;
2155 }
2156
2157 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2158 {
2159         struct fib6_config cfg;
2160         int err;
2161
2162         err = rtm_to_fib6_config(skb, nlh, &cfg);
2163         if (err < 0)
2164                 return err;
2165
2166         return ip6_route_del(&cfg);
2167 }
2168
2169 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2170 {
2171         struct fib6_config cfg;
2172         int err;
2173
2174         err = rtm_to_fib6_config(skb, nlh, &cfg);
2175         if (err < 0)
2176                 return err;
2177
2178         return ip6_route_add(&cfg);
2179 }
2180
2181 static inline size_t rt6_nlmsg_size(void)
2182 {
2183         return NLMSG_ALIGN(sizeof(struct rtmsg))
2184                + nla_total_size(16) /* RTA_SRC */
2185                + nla_total_size(16) /* RTA_DST */
2186                + nla_total_size(16) /* RTA_GATEWAY */
2187                + nla_total_size(16) /* RTA_PREFSRC */
2188                + nla_total_size(4) /* RTA_TABLE */
2189                + nla_total_size(4) /* RTA_IIF */
2190                + nla_total_size(4) /* RTA_OIF */
2191                + nla_total_size(4) /* RTA_PRIORITY */
2192                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2193                + nla_total_size(sizeof(struct rta_cacheinfo));
2194 }
2195
2196 static int rt6_fill_node(struct net *net,
2197                          struct sk_buff *skb, struct rt6_info *rt,
2198                          struct in6_addr *dst, struct in6_addr *src,
2199                          int iif, int type, u32 pid, u32 seq,
2200                          int prefix, int nowait, unsigned int flags)
2201 {
2202         struct rtmsg *rtm;
2203         struct nlmsghdr *nlh;
2204         long expires;
2205         u32 table;
2206
2207         if (prefix) {   /* user wants prefix routes only */
2208                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2209                         /* success since this is not a prefix route */
2210                         return 1;
2211                 }
2212         }
2213
2214         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2215         if (nlh == NULL)
2216                 return -EMSGSIZE;
2217
2218         rtm = nlmsg_data(nlh);
2219         rtm->rtm_family = AF_INET6;
2220         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2221         rtm->rtm_src_len = rt->rt6i_src.plen;
2222         rtm->rtm_tos = 0;
2223         if (rt->rt6i_table)
2224                 table = rt->rt6i_table->tb6_id;
2225         else
2226                 table = RT6_TABLE_UNSPEC;
2227         rtm->rtm_table = table;
2228         NLA_PUT_U32(skb, RTA_TABLE, table);
2229         if (rt->rt6i_flags&RTF_REJECT)
2230                 rtm->rtm_type = RTN_UNREACHABLE;
2231         else if (rt->rt6i_flags&RTF_LOCAL)
2232                 rtm->rtm_type = RTN_LOCAL;
2233         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2234                 rtm->rtm_type = RTN_LOCAL;
2235         else
2236                 rtm->rtm_type = RTN_UNICAST;
2237         rtm->rtm_flags = 0;
2238         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2239         rtm->rtm_protocol = rt->rt6i_protocol;
2240         if (rt->rt6i_flags&RTF_DYNAMIC)
2241                 rtm->rtm_protocol = RTPROT_REDIRECT;
2242         else if (rt->rt6i_flags & RTF_ADDRCONF)
2243                 rtm->rtm_protocol = RTPROT_KERNEL;
2244         else if (rt->rt6i_flags&RTF_DEFAULT)
2245                 rtm->rtm_protocol = RTPROT_RA;
2246
2247         if (rt->rt6i_flags&RTF_CACHE)
2248                 rtm->rtm_flags |= RTM_F_CLONED;
2249
2250         if (dst) {
2251                 NLA_PUT(skb, RTA_DST, 16, dst);
2252                 rtm->rtm_dst_len = 128;
2253         } else if (rtm->rtm_dst_len)
2254                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2255 #ifdef CONFIG_IPV6_SUBTREES
2256         if (src) {
2257                 NLA_PUT(skb, RTA_SRC, 16, src);
2258                 rtm->rtm_src_len = 128;
2259         } else if (rtm->rtm_src_len)
2260                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2261 #endif
2262         if (iif) {
2263 #ifdef CONFIG_IPV6_MROUTE
2264                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2265                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2266                         if (err <= 0) {
2267                                 if (!nowait) {
2268                                         if (err == 0)
2269                                                 return 0;
2270                                         goto nla_put_failure;
2271                                 } else {
2272                                         if (err == -EMSGSIZE)
2273                                                 goto nla_put_failure;
2274                                 }
2275                         }
2276                 } else
2277 #endif
2278                         NLA_PUT_U32(skb, RTA_IIF, iif);
2279         } else if (dst) {
2280                 struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
2281                 struct in6_addr saddr_buf;
2282                 if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2283                                        dst, 0, &saddr_buf) == 0)
2284                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2285         }
2286
2287         if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
2288                 goto nla_put_failure;
2289
2290         if (rt->dst.neighbour)
2291                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2292
2293         if (rt->dst.dev)
2294                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2295
2296         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2297
2298         if (!(rt->rt6i_flags & RTF_EXPIRES))
2299                 expires = 0;
2300         else if (rt->rt6i_expires - jiffies < INT_MAX)
2301                 expires = rt->rt6i_expires - jiffies;
2302         else
2303                 expires = INT_MAX;
2304
2305         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2306                                expires, rt->dst.error) < 0)
2307                 goto nla_put_failure;
2308
2309         return nlmsg_end(skb, nlh);
2310
2311 nla_put_failure:
2312         nlmsg_cancel(skb, nlh);
2313         return -EMSGSIZE;
2314 }
2315
2316 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2317 {
2318         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2319         int prefix;
2320
2321         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2322                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2323                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2324         } else
2325                 prefix = 0;
2326
2327         return rt6_fill_node(arg->net,
2328                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2329                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2330                      prefix, 0, NLM_F_MULTI);
2331 }
2332
2333 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2334 {
2335         struct net *net = sock_net(in_skb->sk);
2336         struct nlattr *tb[RTA_MAX+1];
2337         struct rt6_info *rt;
2338         struct sk_buff *skb;
2339         struct rtmsg *rtm;
2340         struct flowi fl;
2341         int err, iif = 0;
2342
2343         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2344         if (err < 0)
2345                 goto errout;
2346
2347         err = -EINVAL;
2348         memset(&fl, 0, sizeof(fl));
2349
2350         if (tb[RTA_SRC]) {
2351                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2352                         goto errout;
2353
2354                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2355         }
2356
2357         if (tb[RTA_DST]) {
2358                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2359                         goto errout;
2360
2361                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2362         }
2363
2364         if (tb[RTA_IIF])
2365                 iif = nla_get_u32(tb[RTA_IIF]);
2366
2367         if (tb[RTA_OIF])
2368                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2369
2370         if (iif) {
2371                 struct net_device *dev;
2372                 dev = __dev_get_by_index(net, iif);
2373                 if (!dev) {
2374                         err = -ENODEV;
2375                         goto errout;
2376                 }
2377         }
2378
2379         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2380         if (skb == NULL) {
2381                 err = -ENOBUFS;
2382                 goto errout;
2383         }
2384
2385         /* Reserve room for dummy headers, this skb can pass
2386            through good chunk of routing engine.
2387          */
2388         skb_reset_mac_header(skb);
2389         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2390
2391         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2392         skb_dst_set(skb, &rt->dst);
2393
2394         err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2395                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2396                             nlh->nlmsg_seq, 0, 0, 0);
2397         if (err < 0) {
2398                 kfree_skb(skb);
2399                 goto errout;
2400         }
2401
2402         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2403 errout:
2404         return err;
2405 }
2406
2407 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2408 {
2409         struct sk_buff *skb;
2410         struct net *net = info->nl_net;
2411         u32 seq;
2412         int err;
2413
2414         err = -ENOBUFS;
2415         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2416
2417         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2418         if (skb == NULL)
2419                 goto errout;
2420
2421         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2422                                 event, info->pid, seq, 0, 0, 0);
2423         if (err < 0) {
2424                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2425                 WARN_ON(err == -EMSGSIZE);
2426                 kfree_skb(skb);
2427                 goto errout;
2428         }
2429         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2430                     info->nlh, gfp_any());
2431         return;
2432 errout:
2433         if (err < 0)
2434                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2435 }
2436
2437 static int ip6_route_dev_notify(struct notifier_block *this,
2438                                 unsigned long event, void *data)
2439 {
2440         struct net_device *dev = (struct net_device *)data;
2441         struct net *net = dev_net(dev);
2442
2443         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2444                 net->ipv6.ip6_null_entry->dst.dev = dev;
2445                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2446 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2447                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2448                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2449                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2450                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2451 #endif
2452         }
2453
2454         return NOTIFY_OK;
2455 }
2456
2457 /*
2458  *      /proc
2459  */
2460
2461 #ifdef CONFIG_PROC_FS
2462
2463 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2464
2465 struct rt6_proc_arg
2466 {
2467         char *buffer;
2468         int offset;
2469         int length;
2470         int skip;
2471         int len;
2472 };
2473
2474 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2475 {
2476         struct seq_file *m = p_arg;
2477
2478         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2479
2480 #ifdef CONFIG_IPV6_SUBTREES
2481         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2482 #else
2483         seq_puts(m, "00000000000000000000000000000000 00 ");
2484 #endif
2485
2486         if (rt->rt6i_nexthop) {
2487                 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2488         } else {
2489                 seq_puts(m, "00000000000000000000000000000000");
2490         }
2491         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2492                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2493                    rt->dst.__use, rt->rt6i_flags,
2494                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2495         return 0;
2496 }
2497
2498 static int ipv6_route_show(struct seq_file *m, void *v)
2499 {
2500         struct net *net = (struct net *)m->private;
2501         fib6_clean_all(net, rt6_info_route, 0, m);
2502         return 0;
2503 }
2504
2505 static int ipv6_route_open(struct inode *inode, struct file *file)
2506 {
2507         return single_open_net(inode, file, ipv6_route_show);
2508 }
2509
2510 static const struct file_operations ipv6_route_proc_fops = {
2511         .owner          = THIS_MODULE,
2512         .open           = ipv6_route_open,
2513         .read           = seq_read,
2514         .llseek         = seq_lseek,
2515         .release        = single_release_net,
2516 };
2517
2518 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2519 {
2520         struct net *net = (struct net *)seq->private;
2521         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2522                    net->ipv6.rt6_stats->fib_nodes,
2523                    net->ipv6.rt6_stats->fib_route_nodes,
2524                    net->ipv6.rt6_stats->fib_rt_alloc,
2525                    net->ipv6.rt6_stats->fib_rt_entries,
2526                    net->ipv6.rt6_stats->fib_rt_cache,
2527                    atomic_read(&net->ipv6.ip6_dst_ops.entries),
2528                    net->ipv6.rt6_stats->fib_discarded_routes);
2529
2530         return 0;
2531 }
2532
2533 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2534 {
2535         return single_open_net(inode, file, rt6_stats_seq_show);
2536 }
2537
2538 static const struct file_operations rt6_stats_seq_fops = {
2539         .owner   = THIS_MODULE,
2540         .open    = rt6_stats_seq_open,
2541         .read    = seq_read,
2542         .llseek  = seq_lseek,
2543         .release = single_release_net,
2544 };
2545 #endif  /* CONFIG_PROC_FS */
2546
2547 #ifdef CONFIG_SYSCTL
2548
2549 static
2550 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2551                               void __user *buffer, size_t *lenp, loff_t *ppos)
2552 {
2553         struct net *net = current->nsproxy->net_ns;
2554         int delay = net->ipv6.sysctl.flush_delay;
2555         if (write) {
2556                 proc_dointvec(ctl, write, buffer, lenp, ppos);
2557                 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2558                 return 0;
2559         } else
2560                 return -EINVAL;
2561 }
2562
2563 ctl_table ipv6_route_table_template[] = {
2564         {
2565                 .procname       =       "flush",
2566                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2567                 .maxlen         =       sizeof(int),
2568                 .mode           =       0200,
2569                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2570         },
2571         {
2572                 .procname       =       "gc_thresh",
2573                 .data           =       &ip6_dst_ops_template.gc_thresh,
2574                 .maxlen         =       sizeof(int),
2575                 .mode           =       0644,
2576                 .proc_handler   =       proc_dointvec,
2577         },
2578         {
2579                 .procname       =       "max_size",
2580                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2581                 .maxlen         =       sizeof(int),
2582                 .mode           =       0644,
2583                 .proc_handler   =       proc_dointvec,
2584         },
2585         {
2586                 .procname       =       "gc_min_interval",
2587                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2588                 .maxlen         =       sizeof(int),
2589                 .mode           =       0644,
2590                 .proc_handler   =       proc_dointvec_jiffies,
2591         },
2592         {
2593                 .procname       =       "gc_timeout",
2594                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2595                 .maxlen         =       sizeof(int),
2596                 .mode           =       0644,
2597                 .proc_handler   =       proc_dointvec_jiffies,
2598         },
2599         {
2600                 .procname       =       "gc_interval",
2601                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2602                 .maxlen         =       sizeof(int),
2603                 .mode           =       0644,
2604                 .proc_handler   =       proc_dointvec_jiffies,
2605         },
2606         {
2607                 .procname       =       "gc_elasticity",
2608                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2609                 .maxlen         =       sizeof(int),
2610                 .mode           =       0644,
2611                 .proc_handler   =       proc_dointvec,
2612         },
2613         {
2614                 .procname       =       "mtu_expires",
2615                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2616                 .maxlen         =       sizeof(int),
2617                 .mode           =       0644,
2618                 .proc_handler   =       proc_dointvec_jiffies,
2619         },
2620         {
2621                 .procname       =       "min_adv_mss",
2622                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2623                 .maxlen         =       sizeof(int),
2624                 .mode           =       0644,
2625                 .proc_handler   =       proc_dointvec,
2626         },
2627         {
2628                 .procname       =       "gc_min_interval_ms",
2629                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2630                 .maxlen         =       sizeof(int),
2631                 .mode           =       0644,
2632                 .proc_handler   =       proc_dointvec_ms_jiffies,
2633         },
2634         { }
2635 };
2636
2637 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2638 {
2639         struct ctl_table *table;
2640
2641         table = kmemdup(ipv6_route_table_template,
2642                         sizeof(ipv6_route_table_template),
2643                         GFP_KERNEL);
2644
2645         if (table) {
2646                 table[0].data = &net->ipv6.sysctl.flush_delay;
2647                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2648                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2649                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2650                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2651                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2652                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2653                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2654                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2655                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2656         }
2657
2658         return table;
2659 }
2660 #endif
2661
2662 static int __net_init ip6_route_net_init(struct net *net)
2663 {
2664         int ret = -ENOMEM;
2665
2666         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2667                sizeof(net->ipv6.ip6_dst_ops));
2668
2669         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2670                                            sizeof(*net->ipv6.ip6_null_entry),
2671                                            GFP_KERNEL);
2672         if (!net->ipv6.ip6_null_entry)
2673                 goto out_ip6_dst_ops;
2674         net->ipv6.ip6_null_entry->dst.path =
2675                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2676         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2677
2678 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2679         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2680                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2681                                                GFP_KERNEL);
2682         if (!net->ipv6.ip6_prohibit_entry)
2683                 goto out_ip6_null_entry;
2684         net->ipv6.ip6_prohibit_entry->dst.path =
2685                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2686         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2687
2688         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2689                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2690                                                GFP_KERNEL);
2691         if (!net->ipv6.ip6_blk_hole_entry)
2692                 goto out_ip6_prohibit_entry;
2693         net->ipv6.ip6_blk_hole_entry->dst.path =
2694                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2695         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2696 #endif
2697
2698         net->ipv6.sysctl.flush_delay = 0;
2699         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2700         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2701         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2702         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2703         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2704         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2705         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2706
2707 #ifdef CONFIG_PROC_FS
2708         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2709         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2710 #endif
2711         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2712
2713         ret = 0;
2714 out:
2715         return ret;
2716
2717 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2718 out_ip6_prohibit_entry:
2719         kfree(net->ipv6.ip6_prohibit_entry);
2720 out_ip6_null_entry:
2721         kfree(net->ipv6.ip6_null_entry);
2722 #endif
2723 out_ip6_dst_ops:
2724         goto out;
2725 }
2726
2727 static void __net_exit ip6_route_net_exit(struct net *net)
2728 {
2729 #ifdef CONFIG_PROC_FS
2730         proc_net_remove(net, "ipv6_route");
2731         proc_net_remove(net, "rt6_stats");
2732 #endif
2733         kfree(net->ipv6.ip6_null_entry);
2734 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2735         kfree(net->ipv6.ip6_prohibit_entry);
2736         kfree(net->ipv6.ip6_blk_hole_entry);
2737 #endif
2738 }
2739
2740 static struct pernet_operations ip6_route_net_ops = {
2741         .init = ip6_route_net_init,
2742         .exit = ip6_route_net_exit,
2743 };
2744
2745 static struct notifier_block ip6_route_dev_notifier = {
2746         .notifier_call = ip6_route_dev_notify,
2747         .priority = 0,
2748 };
2749
2750 int __init ip6_route_init(void)
2751 {
2752         int ret;
2753
2754         ret = -ENOMEM;
2755         ip6_dst_ops_template.kmem_cachep =
2756                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2757                                   SLAB_HWCACHE_ALIGN, NULL);
2758         if (!ip6_dst_ops_template.kmem_cachep)
2759                 goto out;
2760
2761         ret = register_pernet_subsys(&ip6_route_net_ops);
2762         if (ret)
2763                 goto out_kmem_cache;
2764
2765         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2766
2767         /* Registering of the loopback is done before this portion of code,
2768          * the loopback reference in rt6_info will not be taken, do it
2769          * manually for init_net */
2770         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2771         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2772   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2773         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2774         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2775         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2776         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2777   #endif
2778         ret = fib6_init();
2779         if (ret)
2780                 goto out_register_subsys;
2781
2782         ret = xfrm6_init();
2783         if (ret)
2784                 goto out_fib6_init;
2785
2786         ret = fib6_rules_init();
2787         if (ret)
2788                 goto xfrm6_init;
2789
2790         ret = -ENOBUFS;
2791         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2792             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2793             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2794                 goto fib6_rules_init;
2795
2796         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2797         if (ret)
2798                 goto fib6_rules_init;
2799
2800 out:
2801         return ret;
2802
2803 fib6_rules_init:
2804         fib6_rules_cleanup();
2805 xfrm6_init:
2806         xfrm6_fini();
2807 out_fib6_init:
2808         fib6_gc_cleanup();
2809 out_register_subsys:
2810         unregister_pernet_subsys(&ip6_route_net_ops);
2811 out_kmem_cache:
2812         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2813         goto out;
2814 }
2815
2816 void ip6_route_cleanup(void)
2817 {
2818         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2819         fib6_rules_cleanup();
2820         xfrm6_fini();
2821         fib6_gc_cleanup();
2822         unregister_pernet_subsys(&ip6_route_net_ops);
2823         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2824 }