2 * Linux IPv6 multicast routing support for BSD pim6sd
3 * Based on net/ipv4/ipmr.c.
5 * (c) 2004 Mickael Hoerdt, <hoerdt@clarinet.u-strasbg.fr>
6 * LSIIT Laboratory, Strasbourg, France
7 * (c) 2004 Jean-Philippe Andriot, <jean-philippe.andriot@6WIND.com>
9 * Copyright (C)2007,2008 USAGI/WIDE Project
10 * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
19 #include <asm/system.h>
20 #include <asm/uaccess.h>
21 #include <linux/types.h>
22 #include <linux/sched.h>
23 #include <linux/errno.h>
24 #include <linux/timer.h>
26 #include <linux/kernel.h>
27 #include <linux/fcntl.h>
28 #include <linux/stat.h>
29 #include <linux/socket.h>
30 #include <linux/inet.h>
31 #include <linux/netdevice.h>
32 #include <linux/inetdevice.h>
33 #include <linux/proc_fs.h>
34 #include <linux/seq_file.h>
35 #include <linux/init.h>
36 #include <net/protocol.h>
37 #include <linux/skbuff.h>
40 #include <linux/notifier.h>
41 #include <linux/if_arp.h>
42 #include <net/checksum.h>
43 #include <net/netlink.h>
46 #include <net/ip6_route.h>
47 #include <linux/mroute6.h>
48 #include <linux/pim.h>
49 #include <net/addrconf.h>
50 #include <linux/netfilter_ipv6.h>
/*
 * NOTE(review): this capture is elided — the leading numbers are the
 * original file's line numbers and gaps mark missing lines.  The code
 * tokens below are reproduced verbatim.
 */
52 /* Big lock, protecting vif table, mrt cache and mroute socket state.
53 Note that the changes are semaphored via rtnl_lock.
56 static DEFINE_RWLOCK(mrt_lock);
59 * Multicast router control variables
/* True while slot _idx in the per-net vif table is bound to a device. */
62 #define MIF_EXISTS(_net, _idx) ((_net)->ipv6.vif6_table[_idx].dev != NULL)
64 static struct mfc6_cache *mfc_unres_queue; /* Queue of unresolved entries */
66 /* Special spinlock for queue of unresolved entries */
67 static DEFINE_SPINLOCK(mfc_unres_lock);
69 /* We return to original Alan's scheme. Hash table of resolved
70 entries is changed only in process context and protected
71 with weak lock mrt_lock. Queue of unresolved entries is protected
72 with strong spinlock mfc_unres_lock.
74 In this case data path is free of exclusive locks at all.
77 static struct kmem_cache *mrt_cachep __read_mostly;
/* Forward declarations for the forwarding path and netlink fill helper. */
79 static int ip6_mr_forward(struct sk_buff *skb, struct mfc6_cache *cache);
80 static int ip6mr_cache_report(struct sk_buff *pkt, mifi_t mifi, int assert);
81 static int ip6mr_fill_mroute(struct sk_buff *skb, struct mfc6_cache *c, struct rtmsg *rtm);
83 #ifdef CONFIG_IPV6_PIMSM_V2
84 static struct inet6_protocol pim6_protocol;
/* Single timer servicing the whole unresolved-entry queue. */
87 static struct timer_list ipmr_expire_timer;
/* seq_file cursor over the MFC cache; `cache` points either at the
   resolved hash array or at &mfc_unres_queue. */
92 struct ipmr_mfc_iter {
93 struct mfc6_cache **cache;
/*
 * Seek to entry number @pos for the /proc MFC dump: walk the resolved
 * hash buckets under mrt_lock first, then the unresolved queue under
 * mfc_unres_lock.  NOTE(review): the early-return-on-match lines are
 * elided from this capture; on return with a match the matching lock is
 * presumably still held for the subsequent ->show calls — confirm.
 */
98 static struct mfc6_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
100 struct mfc6_cache *mfc;
102 it->cache = init_net.ipv6.mfc6_cache_array;
103 read_lock(&mrt_lock);
104 for (it->ct = 0; it->ct < MFC6_LINES; it->ct++)
105 for (mfc = init_net.ipv6.mfc6_cache_array[it->ct];
106 mfc; mfc = mfc->next)
109 read_unlock(&mrt_lock);
111 it->cache = &mfc_unres_queue;
112 spin_lock_bh(&mfc_unres_lock);
113 for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
116 spin_unlock_bh(&mfc_unres_lock);
126 * The /proc interfaces to multicast routing /proc/ip6_mr_cache /proc/ip6_mr_vif
/* seq_file cursor over the vif table (fields elided in this capture). */
129 struct ipmr_vif_iter {
/* Seek to the pos-th existing vif; skips unbound table slots. */
133 static struct mif_device *ip6mr_vif_seq_idx(struct ipmr_vif_iter *iter,
136 for (iter->ct = 0; iter->ct < init_net.ipv6.maxvif; ++iter->ct) {
137 if (!MIF_EXISTS(&init_net, iter->ct))
140 return &init_net.ipv6.vif6_table[iter->ct];
/* seq_file ->start: take mrt_lock for the whole iteration and position
 * the cursor (SEQ_START_TOKEN for *pos == 0 — elided branch). */
145 static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
148 read_lock(&mrt_lock);
149 return (*pos ? ip6mr_vif_seq_idx(seq->private, *pos - 1)
/* seq_file ->next: advance to the next bound vif slot. */
153 static void *ip6mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
155 struct ipmr_vif_iter *iter = seq->private;
158 if (v == SEQ_START_TOKEN)
159 return ip6mr_vif_seq_idx(iter, 0);
161 while (++iter->ct < init_net.ipv6.maxvif) {
162 if (!MIF_EXISTS(&init_net, iter->ct))
164 return &init_net.ipv6.vif6_table[iter->ct];
/* seq_file ->stop: drop the lock taken in ->start. */
169 static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v)
172 read_unlock(&mrt_lock);
/* Emit one /proc/ip6_mr_vif row: header for SEQ_START_TOKEN, otherwise
 * the vif index, device name (or "none") and its traffic counters. */
175 static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
177 if (v == SEQ_START_TOKEN) {
179 "Interface BytesIn PktsIn BytesOut PktsOut Flags\n");
181 const struct mif_device *vif = v;
182 const char *name = vif->dev ? vif->dev->name : "none";
185 "%2td %-10s %8ld %7ld %8ld %7ld %05X\n",
186 vif - init_net.ipv6.vif6_table,
187 name, vif->bytes_in, vif->pkt_in,
188 vif->bytes_out, vif->pkt_out,
/* seq_file plumbing for /proc/ip6_mr_vif. */
194 static struct seq_operations ip6mr_vif_seq_ops = {
195 .start = ip6mr_vif_seq_start,
196 .next = ip6mr_vif_seq_next,
197 .stop = ip6mr_vif_seq_stop,
198 .show = ip6mr_vif_seq_show,
/* ->open: allocate a private ipmr_vif_iter cursor per reader. */
201 static int ip6mr_vif_open(struct inode *inode, struct file *file)
203 return seq_open_private(file, &ip6mr_vif_seq_ops,
204 sizeof(struct ipmr_vif_iter));
207 static struct file_operations ip6mr_vif_fops = {
208 .owner = THIS_MODULE,
209 .open = ip6mr_vif_open,
212 .release = seq_release_private,
/* seq_file ->start for /proc/ip6_mr_cache; locking happens inside
 * ipmr_mfc_seq_idx rather than here. */
215 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
217 return (*pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
/* ->next: follow the chain, then later hash buckets, then switch from
 * the resolved array (mrt_lock) to the unresolved queue (mfc_unres_lock). */
221 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
223 struct mfc6_cache *mfc = v;
224 struct ipmr_mfc_iter *it = seq->private;
228 if (v == SEQ_START_TOKEN)
229 return ipmr_mfc_seq_idx(seq->private, 0);
234 if (it->cache == &mfc_unres_queue)
/* must still be walking the resolved hash array at this point */
237 BUG_ON(it->cache != init_net.ipv6.mfc6_cache_array);
239 while (++it->ct < MFC6_LINES) {
240 mfc = init_net.ipv6.mfc6_cache_array[it->ct];
245 /* exhausted cache_array, show unresolved */
246 read_unlock(&mrt_lock);
247 it->cache = &mfc_unres_queue;
250 spin_lock_bh(&mfc_unres_lock);
251 mfc = mfc_unres_queue;
256 spin_unlock_bh(&mfc_unres_lock);
/* ->stop: release whichever lock the cursor currently holds, determined
 * by which list it->cache points at. */
262 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
264 struct ipmr_mfc_iter *it = seq->private;
266 if (it->cache == &mfc_unres_queue)
267 spin_unlock_bh(&mfc_unres_lock);
268 else if (it->cache == init_net.ipv6.mfc6_cache_array)
269 read_unlock(&mrt_lock);
/* Emit one /proc/ip6_mr_cache row: group, origin, input interface, then
 * per-oif TTL thresholds for resolved entries; zeros for unresolved. */
272 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
276 if (v == SEQ_START_TOKEN) {
280 "Iif Pkts Bytes Wrong Oifs\n");
282 const struct mfc6_cache *mfc = v;
283 const struct ipmr_mfc_iter *it = seq->private;
285 seq_printf(seq, "%pI6 %pI6 %-3hd",
286 &mfc->mf6c_mcastgrp, &mfc->mf6c_origin,
289 if (it->cache != &mfc_unres_queue) {
290 seq_printf(seq, " %8lu %8lu %8lu",
292 mfc->mfc_un.res.bytes,
293 mfc->mfc_un.res.wrong_if);
/* only interfaces with a real (<255) threshold are listed */
294 for (n = mfc->mfc_un.res.minvif;
295 n < mfc->mfc_un.res.maxvif; n++) {
296 if (MIF_EXISTS(&init_net, n) &&
297 mfc->mfc_un.res.ttls[n] < 255)
300 n, mfc->mfc_un.res.ttls[n]);
303 /* unresolved mfc_caches don't contain
304 * pkt, bytes and wrong_if values
306 seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
/* seq_file plumbing for /proc/ip6_mr_cache. */
313 static struct seq_operations ipmr_mfc_seq_ops = {
314 .start = ipmr_mfc_seq_start,
315 .next = ipmr_mfc_seq_next,
316 .stop = ipmr_mfc_seq_stop,
317 .show = ipmr_mfc_seq_show,
/* ->open: allocate a private ipmr_mfc_iter cursor per reader. */
320 static int ipmr_mfc_open(struct inode *inode, struct file *file)
322 return seq_open_private(file, &ipmr_mfc_seq_ops,
323 sizeof(struct ipmr_mfc_iter));
326 static struct file_operations ip6mr_mfc_fops = {
327 .owner = THIS_MODULE,
328 .open = ipmr_mfc_open,
331 .release = seq_release_private,
335 #ifdef CONFIG_IPV6_PIMSM_V2
/*
 * Receive handler for PIM register messages: validate the register
 * header and checksum, sanity-check the encapsulated packet, then
 * decapsulate it and account it against the register vif device.
 *
 * FIX: the encapsulated payload is an IPv6 packet — `encap` is an
 * ipv6hdr and its daddr is checked with ipv6_addr_is_multicast() — so
 * the decapsulated skb's protocol must be ETH_P_IPV6, not ETH_P_IP.
 *
 * NOTE(review): this capture is elided (error-return lines and the tail
 * of the function are missing); only visible lines are reproduced.
 */
337 static int pim6_rcv(struct sk_buff *skb)
339 struct pimreghdr *pim;
340 struct ipv6hdr *encap;
341 struct net_device *reg_dev = NULL;
342 int reg_vif_num = init_net.ipv6.mroute_reg_vif_num;
344 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
347 pim = (struct pimreghdr *)skb_transport_header(skb);
/* must be a PIM v2 REGISTER, not a null-register, with a valid csum */
348 if (pim->type != ((PIM_VERSION << 4) | PIM_REGISTER) ||
349 (pim->flags & PIM_NULL_REGISTER) ||
350 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
351 csum_fold(skb_checksum(skb, 0, skb->len, 0))))
354 /* check if the inner packet is destined to mcast group */
355 encap = (struct ipv6hdr *)(skb_transport_header(skb) +
358 if (!ipv6_addr_is_multicast(&encap->daddr) ||
359 encap->payload_len == 0 ||
360 ntohs(encap->payload_len) + sizeof(*pim) > skb->len)
363 read_lock(&mrt_lock);
364 if (reg_vif_num >= 0)
365 reg_dev = init_net.ipv6.vif6_table[reg_vif_num].dev;
368 read_unlock(&mrt_lock);
/* strip the outer headers so the inner IPv6 packet starts the skb */
373 skb->mac_header = skb->network_header;
374 skb_pull(skb, (u8 *)encap - skb->data);
375 skb_reset_network_header(skb);
377 skb->protocol = htons(ETH_P_IPV6);
379 skb->pkt_type = PACKET_HOST;
380 dst_release(skb->dst);
381 reg_dev->stats.rx_bytes += skb->len;
382 reg_dev->stats.rx_packets++;
/* inet6 protocol hook for PIM (initializer body elided in this capture). */
393 static struct inet6_protocol pim6_protocol = {
397 /* Service routines creating virtual interfaces: PIMREG */
/* Transmit on the register vif: account the packet and bounce the whole
 * frame up to the userspace daemon as an MRT6MSG_WHOLEPKT report. */
399 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
401 read_lock(&mrt_lock);
402 dev->stats.tx_bytes += skb->len;
403 dev->stats.tx_packets++;
404 ip6mr_cache_report(skb, init_net.ipv6.mroute_reg_vif_num, MRT6MSG_WHOLEPKT);
405 read_unlock(&mrt_lock);
410 static const struct net_device_ops reg_vif_netdev_ops = {
411 .ndo_start_xmit = reg_vif_xmit,
/*
 * Initialize the pim6reg pseudo-device: no ARP, MTU leaves room for the
 * outer IPv6 header plus the 8-byte PIM register header, freed via
 * free_netdev on unregister.
 *
 * FIX: "®_vif_netdev_ops" was mojibake — the "&reg" of
 * "&reg_vif_netdev_ops" had been collapsed into the U+00AE sign by an
 * HTML-entity round-trip; restored the address-of expression.
 */
414 static void reg_vif_setup(struct net_device *dev)
416 dev->type = ARPHRD_PIMREG;
417 dev->mtu = 1500 - sizeof(struct ipv6hdr) - 8;
418 dev->flags = IFF_NOARP;
419 dev->netdev_ops = &reg_vif_netdev_ops;
420 dev->destructor = free_netdev;
/* Create and register the "pim6reg" device; on registration failure the
 * error path (partially elided here) unregisters it again. */
423 static struct net_device *ip6mr_reg_vif(void)
425 struct net_device *dev;
427 dev = alloc_netdev(0, "pim6reg", reg_vif_setup);
431 if (register_netdevice(dev)) {
444 /* allow the register to be completed before unregistering. */
448 unregister_netdevice(dev);
/*
 * Remove vif @vifi from the table.  Clears the register-vif bookkeeping
 * if needed, shrinks maxvif when the last slot is freed, then drops
 * allmulti on the device and unregisters register-type devices.
 * Returns -EADDRNOTAVAIL for an out-of-range or empty slot.
 */
457 static int mif6_delete(int vifi)
459 struct mif_device *v;
460 struct net_device *dev;
461 if (vifi < 0 || vifi >= init_net.ipv6.maxvif)
462 return -EADDRNOTAVAIL;
464 v = &init_net.ipv6.vif6_table[vifi];
466 write_lock_bh(&mrt_lock);
471 write_unlock_bh(&mrt_lock);
472 return -EADDRNOTAVAIL;
475 #ifdef CONFIG_IPV6_PIMSM_V2
476 if (vifi == init_net.ipv6.mroute_reg_vif_num)
477 init_net.ipv6.mroute_reg_vif_num = -1;
/* if the highest slot was removed, scan down for the new maxvif */
480 if (vifi + 1 == init_net.ipv6.maxvif) {
482 for (tmp = vifi - 1; tmp >= 0; tmp--) {
483 if (MIF_EXISTS(&init_net, tmp))
486 init_net.ipv6.maxvif = tmp + 1;
489 write_unlock_bh(&mrt_lock);
491 dev_set_allmulti(dev, -1);
493 if (v->flags & MIFF_REGISTER)
494 unregister_netdevice(dev);
/* Release an MFC cache entry: drop its net reference, return to slab. */
500 static inline void ip6mr_cache_free(struct mfc6_cache *c)
502 release_net(mfc6_net(c));
503 kmem_cache_free(mrt_cachep, c);
506 /* Destroy an unresolved cache entry, killing queued skbs
507 and reporting error to netlink readers.
510 static void ip6mr_destroy_unres(struct mfc6_cache *c)
514 atomic_dec(&init_net.ipv6.cache_resolve_queue_len);
516 while((skb = skb_dequeue(&c->mfc_un.unres.unresolved)) != NULL) {
/* version==0 marks an skb carrying a pending netlink request; answer
 * it with -ETIMEDOUT instead of forwarding */
517 if (ipv6_hdr(skb)->version == 0) {
518 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr));
519 nlh->nlmsg_type = NLMSG_ERROR;
520 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
521 skb_trim(skb, nlh->nlmsg_len);
522 ((struct nlmsgerr *)NLMSG_DATA(nlh))->error = -ETIMEDOUT;
523 rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
532 /* Single timer process for all the unresolved queue. */
/*
 * Walk the unresolved queue (caller holds mfc_unres_lock): destroy
 * entries past their deadline, remember the soonest remaining deadline,
 * and re-arm the timer if anything is still queued.
 */
534 static void ipmr_do_expire_process(unsigned long dummy)
536 unsigned long now = jiffies;
537 unsigned long expires = 10 * HZ;
538 struct mfc6_cache *c, **cp;
540 cp = &mfc_unres_queue;
542 while ((c = *cp) != NULL) {
543 if (time_after(c->mfc_un.unres.expires, now)) {
545 unsigned long interval = c->mfc_un.unres.expires - now;
546 if (interval < expires)
553 ip6mr_destroy_unres(c);
556 if (mfc_unres_queue != NULL)
557 mod_timer(&ipmr_expire_timer, jiffies + expires);
/* Timer callback: retry in one jiffy rather than spin if the queue lock
 * is contended (we run in softirq context). */
560 static void ipmr_expire_process(unsigned long dummy)
562 if (!spin_trylock(&mfc_unres_lock)) {
563 mod_timer(&ipmr_expire_timer, jiffies + 1);
567 if (mfc_unres_queue != NULL)
568 ipmr_do_expire_process(dummy);
570 spin_unlock(&mfc_unres_lock);
573 /* Fill oifs list. It is called under write locked mrt_lock. */
/*
 * Rebuild the per-oif TTL threshold array of @cache from @ttls and
 * recompute the [minvif, maxvif) span of interfaces that actually
 * forward (ttl present and < 255). Unused slots stay at 255.
 */
575 static void ip6mr_update_thresholds(struct mfc6_cache *cache, unsigned char *ttls)
579 cache->mfc_un.res.minvif = MAXMIFS;
580 cache->mfc_un.res.maxvif = 0;
581 memset(cache->mfc_un.res.ttls, 255, MAXMIFS);
583 for (vifi = 0; vifi < init_net.ipv6.maxvif; vifi++) {
584 if (MIF_EXISTS(&init_net, vifi) &&
585 ttls[vifi] && ttls[vifi] < 255) {
586 cache->mfc_un.res.ttls[vifi] = ttls[vifi];
587 if (cache->mfc_un.res.minvif > vifi)
588 cache->mfc_un.res.minvif = vifi;
589 if (cache->mfc_un.res.maxvif <= vifi)
590 cache->mfc_un.res.maxvif = vifi + 1;
/*
 * Install a new vif described by @vifc.  MIFF_REGISTER requests the PIM
 * register pseudo-device; otherwise the vif is bound to an existing
 * device by ifindex.  @mrtsock marks daemon-installed (non-static)
 * vifs.  NOTE(review): several error paths and the counter reset lines
 * are elided from this capture.
 */
595 static int mif6_add(struct mif6ctl *vifc, int mrtsock)
597 int vifi = vifc->mif6c_mifi;
598 struct mif_device *v = &init_net.ipv6.vif6_table[vifi];
599 struct net_device *dev;
/* refuse to overwrite an occupied slot */
603 if (MIF_EXISTS(&init_net, vifi))
606 switch (vifc->mif6c_flags) {
607 #ifdef CONFIG_IPV6_PIMSM_V2
610 * Special Purpose VIF in PIM
611 * All the packets will be sent to the daemon
/* only one register vif may exist at a time */
613 if (init_net.ipv6.mroute_reg_vif_num >= 0)
615 dev = ip6mr_reg_vif();
618 err = dev_set_allmulti(dev, 1);
620 unregister_netdevice(dev);
627 dev = dev_get_by_index(&init_net, vifc->mif6c_pifi);
629 return -EADDRNOTAVAIL;
630 err = dev_set_allmulti(dev, 1);
641 * Fill in the VIF structures
643 v->rate_limit = vifc->vifc_rate_limit;
644 v->flags = vifc->mif6c_flags;
646 v->flags |= VIFF_STATIC;
647 v->threshold = vifc->vifc_threshold;
652 v->link = dev->ifindex;
653 if (v->flags & MIFF_REGISTER)
654 v->link = dev->iflink;
656 /* And finish update writing critical data */
657 write_lock_bh(&mrt_lock);
659 #ifdef CONFIG_IPV6_PIMSM_V2
660 if (v->flags & MIFF_REGISTER)
661 init_net.ipv6.mroute_reg_vif_num = vifi;
663 if (vifi + 1 > init_net.ipv6.maxvif)
664 init_net.ipv6.maxvif = vifi + 1;
665 write_unlock_bh(&mrt_lock);
/* Look up a resolved (origin, group) entry in its hash chain; caller is
 * expected to hold mrt_lock. */
669 static struct mfc6_cache *ip6mr_cache_find(struct in6_addr *origin, struct in6_addr *mcastgrp)
671 int line = MFC6_HASH(mcastgrp, origin);
672 struct mfc6_cache *c;
674 for (c = init_net.ipv6.mfc6_cache_array[line]; c; c = c->next) {
675 if (ipv6_addr_equal(&c->mf6c_origin, origin) &&
676 ipv6_addr_equal(&c->mf6c_mcastgrp, mcastgrp))
683 * Allocate a multicast cache entry
/* GFP_KERNEL allocation for a resolved entry; minvif starts "empty". */
685 static struct mfc6_cache *ip6mr_cache_alloc(struct net *net)
687 struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
690 c->mfc_un.res.minvif = MAXMIFS;
691 mfc6_net_set(c, net);
/* GFP_ATOMIC allocation for an unresolved entry (data-path context);
 * gets a pending-skb queue and a 10 s resolve deadline. */
695 static struct mfc6_cache *ip6mr_cache_alloc_unres(struct net *net)
697 struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
700 skb_queue_head_init(&c->mfc_un.unres.unresolved);
701 c->mfc_un.unres.expires = jiffies + 10 * HZ;
702 mfc6_net_set(c, net);
707 * A cache entry has gone into a resolved state from queued
/*
 * Replay skbs queued on unresolved entry @uc through resolved entry @c:
 * pending netlink requests are answered (route fill or -EMSGSIZE),
 * ordinary packets are forwarded.
 */
710 static void ip6mr_cache_resolve(struct mfc6_cache *uc, struct mfc6_cache *c)
715 * Play the pending entries through our router
718 while((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
/* version==0 marks a parked netlink request, not a real packet */
719 if (ipv6_hdr(skb)->version == 0) {
721 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr));
723 if (ip6mr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
724 nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh;
726 nlh->nlmsg_type = NLMSG_ERROR;
727 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
728 skb_trim(skb, nlh->nlmsg_len);
729 ((struct nlmsgerr *)NLMSG_DATA(nlh))->error = -EMSGSIZE;
731 err = rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
733 ip6_mr_forward(skb, c);
738 * Bounce a cache query up to pim6sd. We could use netlink for this but pim6sd
739 * expects the following bizarre scheme.
741 * Called under mrt_lock.
/*
 * Build an mrt6msg for the daemon.  MRT6MSG_WHOLEPKT wraps the entire
 * original frame; other asserts carry just the inner IPv6 header.  The
 * result is queued on the mroute6 control socket.  NOTE(review):
 * several branch and error-path lines are elided in this capture.
 */
744 static int ip6mr_cache_report(struct sk_buff *pkt, mifi_t mifi, int assert)
750 #ifdef CONFIG_IPV6_PIMSM_V2
751 if (assert == MRT6MSG_WHOLEPKT)
752 skb = skb_realloc_headroom(pkt, -skb_network_offset(pkt)
756 skb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(*msg), GFP_ATOMIC);
761 /* I suppose that internal messages
762 * do not require checksums */
764 skb->ip_summed = CHECKSUM_UNNECESSARY;
766 #ifdef CONFIG_IPV6_PIMSM_V2
767 if (assert == MRT6MSG_WHOLEPKT) {
768 /* Ugly, but we have no choice with this interface.
769 Duplicate old header, fix length etc.
770 And all this only to mangle msg->im6_msgtype and
771 to set msg->im6_mbz to "mbz" :-)
773 skb_push(skb, -skb_network_offset(pkt));
775 skb_push(skb, sizeof(*msg));
776 skb_reset_transport_header(skb);
777 msg = (struct mrt6msg *)skb_transport_header(skb);
779 msg->im6_msgtype = MRT6MSG_WHOLEPKT;
780 msg->im6_mif = init_net.ipv6.mroute_reg_vif_num;
782 ipv6_addr_copy(&msg->im6_src, &ipv6_hdr(pkt)->saddr);
783 ipv6_addr_copy(&msg->im6_dst, &ipv6_hdr(pkt)->daddr);
785 skb->ip_summed = CHECKSUM_UNNECESSARY;
/* non-WHOLEPKT case: copy the inner IPv6 header, then the mrt6msg */
793 skb_put(skb, sizeof(struct ipv6hdr));
794 skb_reset_network_header(skb);
795 skb_copy_to_linear_data(skb, ipv6_hdr(pkt), sizeof(struct ipv6hdr));
800 skb_put(skb, sizeof(*msg));
801 skb_reset_transport_header(skb);
802 msg = (struct mrt6msg *)skb_transport_header(skb);
805 msg->im6_msgtype = assert;
808 ipv6_addr_copy(&msg->im6_src, &ipv6_hdr(pkt)->saddr);
809 ipv6_addr_copy(&msg->im6_dst, &ipv6_hdr(pkt)->daddr);
811 skb->dst = dst_clone(pkt->dst);
812 skb->ip_summed = CHECKSUM_UNNECESSARY;
814 skb_pull(skb, sizeof(struct ipv6hdr));
/* no daemon listening: drop (error path elided) */
817 if (init_net.ipv6.mroute6_sk == NULL) {
823 * Deliver to user space multicast routing algorithms
825 ret = sock_queue_rcv_skb(init_net.ipv6.mroute6_sk, skb);
828 printk(KERN_WARNING "mroute6: pending queue full, dropping entries.\n");
836 * Queue a packet for resolution. It gets locked cache entry!
/*
 * Park @skb on the unresolved entry for its (src, grp), creating the
 * entry and reporting MRT6MSG_NOCACHE to the daemon on first miss.
 * At most ~10 unresolved entries and 3 queued skbs per entry are kept.
 */
840 ip6mr_cache_unresolved(mifi_t mifi, struct sk_buff *skb)
843 struct mfc6_cache *c;
845 spin_lock_bh(&mfc_unres_lock);
846 for (c = mfc_unres_queue; c; c = c->next) {
847 if (net_eq(mfc6_net(c), &init_net) &&
848 ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) &&
849 ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr))
855 * Create a new entry if allowable
858 if (atomic_read(&init_net.ipv6.cache_resolve_queue_len) >= 10 ||
859 (c = ip6mr_cache_alloc_unres(&init_net)) == NULL) {
860 spin_unlock_bh(&mfc_unres_lock);
867 * Fill in the new cache entry
870 c->mf6c_origin = ipv6_hdr(skb)->saddr;
871 c->mf6c_mcastgrp = ipv6_hdr(skb)->daddr;
874 * Reflect first query at pim6sd
876 if ((err = ip6mr_cache_report(skb, mifi, MRT6MSG_NOCACHE)) < 0) {
877 /* If the report failed throw the cache entry
880 spin_unlock_bh(&mfc_unres_lock);
/* report succeeded: enqueue the entry and kick the expiry machinery */
887 atomic_inc(&init_net.ipv6.cache_resolve_queue_len);
888 c->next = mfc_unres_queue;
891 ipmr_do_expire_process(1);
895 * See if we can append the packet
897 if (c->mfc_un.unres.unresolved.qlen > 3) {
901 skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
905 spin_unlock_bh(&mfc_unres_lock);
910 * MFC6 cache manipulation by user space
/* Remove the resolved (origin, group) entry named by @mfc from its hash
 * chain under write-locked mrt_lock (unlink/free lines elided here). */
913 static int ip6mr_mfc_delete(struct mf6cctl *mfc)
916 struct mfc6_cache *c, **cp;
918 line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr);
920 for (cp = &init_net.ipv6.mfc6_cache_array[line];
921 (c = *cp) != NULL; cp = &c->next) {
922 if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) &&
923 ipv6_addr_equal(&c->mf6c_mcastgrp, &mfc->mf6cc_mcastgrp.sin6_addr)) {
924 write_lock_bh(&mrt_lock);
926 write_unlock_bh(&mrt_lock);
/* netdevice notifier: on NETDEV_UNREGISTER, scan the vif table for vifs
 * bound to the dying device (deletion lines elided in this capture). */
935 static int ip6mr_device_event(struct notifier_block *this,
936 unsigned long event, void *ptr)
938 struct net_device *dev = ptr;
939 struct mif_device *v;
942 if (!net_eq(dev_net(dev), &init_net))
945 if (event != NETDEV_UNREGISTER)
948 v = &init_net.ipv6.vif6_table[0];
949 for (ct = 0; ct < init_net.ipv6.maxvif; ct++, v++) {
956 static struct notifier_block ip6_mr_notifier = {
957 .notifier_call = ip6mr_device_event
961 * Setup for IP multicast routing
/* Per-netns init: allocate the vif table and MFC hash array, reset the
 * register-vif index; unwinds allocations on failure. */
964 static int __net_init ip6mr_net_init(struct net *net)
967 net->ipv6.vif6_table = kcalloc(MAXMIFS, sizeof(struct mif_device),
969 if (!net->ipv6.vif6_table) {
974 /* Forwarding cache */
975 net->ipv6.mfc6_cache_array = kcalloc(MFC6_LINES,
976 sizeof(struct mfc6_cache *),
978 if (!net->ipv6.mfc6_cache_array) {
980 goto fail_mfc6_cache;
983 #ifdef CONFIG_IPV6_PIMSM_V2
984 net->ipv6.mroute_reg_vif_num = -1;
989 kfree(net->ipv6.vif6_table);
/* Per-netns teardown: free in reverse allocation order. */
994 static void __net_exit ip6mr_net_exit(struct net *net)
996 kfree(net->ipv6.mfc6_cache_array);
997 kfree(net->ipv6.vif6_table);
1000 static struct pernet_operations ip6mr_net_ops = {
1001 .init = ip6mr_net_init,
1002 .exit = ip6mr_net_exit,
/*
 * Module init: slab cache, pernet subsystem, expiry timer, netdevice
 * notifier and the two /proc entries, with a full unwind chain on any
 * failure (some labels elided in this capture).
 */
1005 int __init ip6_mr_init(void)
1009 mrt_cachep = kmem_cache_create("ip6_mrt_cache",
1010 sizeof(struct mfc6_cache),
1011 0, SLAB_HWCACHE_ALIGN,
1016 err = register_pernet_subsys(&ip6mr_net_ops);
1018 goto reg_pernet_fail;
1020 setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
1021 err = register_netdevice_notifier(&ip6_mr_notifier);
1023 goto reg_notif_fail;
1024 #ifdef CONFIG_PROC_FS
1026 if (!proc_net_fops_create(&init_net, "ip6_mr_vif", 0, &ip6mr_vif_fops))
1028 if (!proc_net_fops_create(&init_net, "ip6_mr_cache",
1029 0, &ip6mr_mfc_fops))
1030 goto proc_cache_fail;
1033 #ifdef CONFIG_PROC_FS
1035 proc_net_remove(&init_net, "ip6_mr_vif");
1037 unregister_netdevice_notifier(&ip6_mr_notifier);
1040 del_timer(&ipmr_expire_timer);
1041 unregister_pernet_subsys(&ip6mr_net_ops);
1043 kmem_cache_destroy(mrt_cachep);
/* Module teardown: exact reverse of ip6_mr_init. */
1047 void ip6_mr_cleanup(void)
1049 #ifdef CONFIG_PROC_FS
1050 proc_net_remove(&init_net, "ip6_mr_cache");
1051 proc_net_remove(&init_net, "ip6_mr_vif");
1053 unregister_netdevice_notifier(&ip6_mr_notifier);
1054 del_timer(&ipmr_expire_timer);
1055 unregister_pernet_subsys(&ip6mr_net_ops);
1056 kmem_cache_destroy(mrt_cachep);
/*
 * Install or update a resolved MFC entry from userspace.  An existing
 * (origin, group) entry is updated in place; otherwise a new one is
 * allocated, hashed in, and any matching unresolved entry is drained
 * through it via ip6mr_cache_resolve.  @mrtsock distinguishes
 * daemon-installed entries from MFC_STATIC ones.
 */
1059 static int ip6mr_mfc_add(struct mf6cctl *mfc, int mrtsock)
1062 struct mfc6_cache *uc, *c, **cp;
1063 unsigned char ttls[MAXMIFS];
1066 memset(ttls, 255, MAXMIFS);
1067 for (i = 0; i < MAXMIFS; i++) {
1068 if (IF_ISSET(i, &mfc->mf6cc_ifset))
1073 line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr);
1075 for (cp = &init_net.ipv6.mfc6_cache_array[line];
1076 (c = *cp) != NULL; cp = &c->next) {
1077 if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) &&
1078 ipv6_addr_equal(&c->mf6c_mcastgrp, &mfc->mf6cc_mcastgrp.sin6_addr))
/* found: update the existing entry in place */
1083 write_lock_bh(&mrt_lock);
1084 c->mf6c_parent = mfc->mf6cc_parent;
1085 ip6mr_update_thresholds(c, ttls);
1087 c->mfc_flags |= MFC_STATIC;
1088 write_unlock_bh(&mrt_lock);
1092 if (!ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr))
1095 c = ip6mr_cache_alloc(&init_net);
1099 c->mf6c_origin = mfc->mf6cc_origin.sin6_addr;
1100 c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr;
1101 c->mf6c_parent = mfc->mf6cc_parent;
1102 ip6mr_update_thresholds(c, ttls);
1104 c->mfc_flags |= MFC_STATIC;
1106 write_lock_bh(&mrt_lock);
1107 c->next = init_net.ipv6.mfc6_cache_array[line];
1108 init_net.ipv6.mfc6_cache_array[line] = c;
1109 write_unlock_bh(&mrt_lock);
1112 * Check to see if we resolved a queued list. If so we
1113 * need to send on the frames and tidy up.
1115 spin_lock_bh(&mfc_unres_lock);
1116 for (cp = &mfc_unres_queue; (uc = *cp) != NULL;
1118 if (net_eq(mfc6_net(uc), &init_net) &&
1119 ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) &&
1120 ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) {
1122 atomic_dec(&init_net.ipv6.cache_resolve_queue_len);
1126 if (mfc_unres_queue == NULL)
1127 del_timer(&ipmr_expire_timer);
1128 spin_unlock_bh(&mfc_unres_lock);
/* replay the parked skbs, then free the now-obsolete unresolved entry */
1131 ip6mr_cache_resolve(uc, c);
1132 ip6mr_cache_free(uc);
1138 * Close the multicast socket, and clear the vif tables etc
/*
 * Flush all non-static state when the mroute socket closes: delete
 * non-static vifs, non-static resolved MFC entries, and this netns's
 * unresolved entries.  (Unlink/free lines elided in this capture.)
 */
1141 static void mroute_clean_tables(struct sock *sk)
1146 * Shut down all active vif entries
1148 for (i = 0; i < init_net.ipv6.maxvif; i++) {
1149 if (!(init_net.ipv6.vif6_table[i].flags & VIFF_STATIC))
1156 for (i = 0; i < MFC6_LINES; i++) {
1157 struct mfc6_cache *c, **cp;
1159 cp = &init_net.ipv6.mfc6_cache_array[i];
1160 while ((c = *cp) != NULL) {
/* static entries survive the daemon going away */
1161 if (c->mfc_flags & MFC_STATIC) {
1165 write_lock_bh(&mrt_lock);
1167 write_unlock_bh(&mrt_lock);
1169 ip6mr_cache_free(c);
1173 if (atomic_read(&init_net.ipv6.cache_resolve_queue_len) != 0) {
1174 struct mfc6_cache *c, **cp;
1176 spin_lock_bh(&mfc_unres_lock);
1177 cp = &mfc_unres_queue;
1178 while ((c = *cp) != NULL) {
1179 if (!net_eq(mfc6_net(c), &init_net)) {
1184 ip6mr_destroy_unres(c);
1186 spin_unlock_bh(&mfc_unres_lock);
/* Register @sk as the single mroute6 control socket (fails if one is
 * already set — error branch elided in this capture). */
1190 static int ip6mr_sk_init(struct sock *sk)
1195 write_lock_bh(&mrt_lock);
1196 if (likely(init_net.ipv6.mroute6_sk == NULL))
1197 init_net.ipv6.mroute6_sk = sk;
1200 write_unlock_bh(&mrt_lock);
/* Unregister the control socket and flush the tables it installed. */
1207 int ip6mr_sk_done(struct sock *sk)
1212 if (sk == init_net.ipv6.mroute6_sk) {
1213 write_lock_bh(&mrt_lock);
1214 init_net.ipv6.mroute6_sk = NULL;
1215 write_unlock_bh(&mrt_lock);
1217 mroute_clean_tables(sk);
1226 * Socket options and virtual interface manipulation. The whole
1227 * virtual interface system is a complete heap, but unfortunately
1228 * that's how BSD mrouted happens to think. Maybe one day with a proper
1229 * MOSPF/PIM router set up we can clean this up.
/*
 * MRT6_* setsockopt dispatcher: MRT6_INIT/DONE manage the control
 * socket; ADD/DEL_MIF and ADD/DEL_MFC edit the tables; MRT6_ASSERT and
 * MRT6_PIM toggle assert/PIM modes.  Only the control socket or
 * CAP_NET_ADMIN may use options other than MRT6_INIT.  NOTE(review):
 * the case labels themselves are largely elided in this capture.
 */
1232 int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int optlen)
1239 if (optname != MRT6_INIT) {
1240 if (sk != init_net.ipv6.mroute6_sk && !capable(CAP_NET_ADMIN))
1246 if (sk->sk_type != SOCK_RAW ||
1247 inet_sk(sk)->num != IPPROTO_ICMPV6)
1249 if (optlen < sizeof(int))
1252 return ip6mr_sk_init(sk);
1255 return ip6mr_sk_done(sk);
1258 if (optlen < sizeof(vif))
1260 if (copy_from_user(&vif, optval, sizeof(vif)))
1262 if (vif.mif6c_mifi >= MAXMIFS)
1265 ret = mif6_add(&vif, sk == init_net.ipv6.mroute6_sk);
1270 if (optlen < sizeof(mifi_t))
1272 if (copy_from_user(&mifi, optval, sizeof(mifi_t)))
1275 ret = mif6_delete(mifi);
1280 * Manipulate the forwarding caches. These live
1281 * in a sort of kernel/user symbiosis.
1285 if (optlen < sizeof(mfc))
1287 if (copy_from_user(&mfc, optval, sizeof(mfc)))
1290 if (optname == MRT6_DEL_MFC)
1291 ret = ip6mr_mfc_delete(&mfc)
1293 ret = ip6mr_mfc_add(&mfc, sk == init_net.ipv6.mroute6_sk);
1298 * Control PIM assert (to activate pim will activate assert)
1303 if (get_user(v, (int __user *)optval))
1305 init_net.ipv6.mroute_do_assert = !!v;
1309 #ifdef CONFIG_IPV6_PIMSM_V2
1313 if (get_user(v, (int __user *)optval))
/* toggling PIM (de)registers the PIM protocol handler */
1318 if (v != init_net.ipv6.mroute_do_pim) {
1319 init_net.ipv6.mroute_do_pim = v;
1320 init_net.ipv6.mroute_do_assert = v;
1321 if (init_net.ipv6.mroute_do_pim)
1322 ret = inet6_add_protocol(&pim6_protocol,
1325 ret = inet6_del_protocol(&pim6_protocol,
1336 * Spurious command, or MRT6_VERSION which you cannot
1340 return -ENOPROTOOPT;
1345 * Getsock opt support for the multicast routing system.
/* Return the current MRT6_PIM / MRT6_ASSERT flags to userspace,
 * clamped to the caller-supplied option length. */
1348 int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval,
1358 #ifdef CONFIG_IPV6_PIMSM_V2
1360 val = init_net.ipv6.mroute_do_pim;
1364 val = init_net.ipv6.mroute_do_assert;
1367 return -ENOPROTOOPT;
1370 if (get_user(olr, optlen))
1373 olr = min_t(int, olr, sizeof(int));
1377 if (put_user(olr, optlen))
1379 if (copy_to_user(optval, &val, olr))
1385 * The IP multicast ioctl support routines.
/*
 * SIOCGETMIFCNT_IN6: copy a vif's packet/byte counters to userspace.
 * SIOCGETSGCNT_IN6: copy an (S,G) entry's counters.  Both read under
 * mrt_lock and return -EADDRNOTAVAIL when the target does not exist.
 */
1388 int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
1390 struct sioc_sg_req6 sr;
1391 struct sioc_mif_req6 vr;
1392 struct mif_device *vif;
1393 struct mfc6_cache *c;
1396 case SIOCGETMIFCNT_IN6:
1397 if (copy_from_user(&vr, arg, sizeof(vr)))
1399 if (vr.mifi >= init_net.ipv6.maxvif)
1401 read_lock(&mrt_lock);
1402 vif = &init_net.ipv6.vif6_table[vr.mifi];
1403 if (MIF_EXISTS(&init_net, vr.mifi)) {
1404 vr.icount = vif->pkt_in;
1405 vr.ocount = vif->pkt_out;
1406 vr.ibytes = vif->bytes_in;
1407 vr.obytes = vif->bytes_out;
1408 read_unlock(&mrt_lock);
1410 if (copy_to_user(arg, &vr, sizeof(vr)))
1414 read_unlock(&mrt_lock);
1415 return -EADDRNOTAVAIL;
1416 case SIOCGETSGCNT_IN6:
1417 if (copy_from_user(&sr, arg, sizeof(sr)))
1420 read_lock(&mrt_lock);
1421 c = ip6mr_cache_find(&sr.src.sin6_addr, &sr.grp.sin6_addr);
1423 sr.pktcnt = c->mfc_un.res.pkt;
1424 sr.bytecnt = c->mfc_un.res.bytes;
1425 sr.wrong_if = c->mfc_un.res.wrong_if;
1426 read_unlock(&mrt_lock);
1428 if (copy_to_user(arg, &sr, sizeof(sr)))
1432 read_unlock(&mrt_lock);
1433 return -EADDRNOTAVAIL;
1435 return -ENOIOCTLCMD;
/* netfilter okfn: bump the forwarded-datagram counter and hand the skb
 * to the output path. */
1440 static inline int ip6mr_forward2_finish(struct sk_buff *skb)
1442 IP6_INC_STATS_BH(dev_net(skb->dst->dev), ip6_dst_idev(skb->dst),
1443 IPSTATS_MIB_OUTFORWDATAGRAMS);
1444 return dst_output(skb);
1448 * Processing handlers for ip6mr_forward
/*
 * Transmit one copy of @skb out vif @vifi.  Register vifs turn into a
 * WHOLEPKT report; real vifs get a fresh route, hop-limit handling
 * (elided lines) and go through NF_INET_FORWARD.
 */
1451 static int ip6mr_forward2(struct sk_buff *skb, struct mfc6_cache *c, int vifi)
1453 struct ipv6hdr *ipv6h;
1454 struct mif_device *vif = &init_net.ipv6.vif6_table[vifi];
1455 struct net_device *dev;
1456 struct dst_entry *dst;
1459 if (vif->dev == NULL)
1462 #ifdef CONFIG_IPV6_PIMSM_V2
1463 if (vif->flags & MIFF_REGISTER) {
1465 vif->bytes_out += skb->len;
1466 vif->dev->stats.tx_bytes += skb->len;
1467 vif->dev->stats.tx_packets++;
1468 ip6mr_cache_report(skb, vifi, MRT6MSG_WHOLEPKT);
1474 ipv6h = ipv6_hdr(skb);
1476 fl = (struct flowi) {
1479 { .daddr = ipv6h->daddr, }
1483 dst = ip6_route_output(&init_net, NULL, &fl);
/* replace the incoming route with the freshly looked-up one */
1487 dst_release(skb->dst);
1491 * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
1492 * not only before forwarding, but after forwarding on all output
1493 * interfaces. It is clear, if mrouter runs a multicasting
1494 * program, it should receive packets not depending to what interface
1495 * program is joined.
1496 * If we will not make it, the program will have to join on all
1497 * interfaces. On the other hand, multihoming host (or router, but
1498 * not mrouter) cannot join to more than one interface - it will
1499 * result in receiving multiple packets.
1504 vif->bytes_out += skb->len;
1506 /* We are about to write */
1507 /* XXX: extension headers? */
1508 if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(dev)))
1511 ipv6h = ipv6_hdr(skb);
1514 IP6CB(skb)->flags |= IP6SKB_FORWARDED;
1516 return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dev,
1517 ip6mr_forward2_finish);
/* Map a net_device back to its vif index (highest index first);
 * the not-found return is elided in this capture. */
1524 static int ip6mr_find_vif(struct net_device *dev)
1527 for (ct = init_net.ipv6.maxvif - 1; ct >= 0; ct--) {
1528 if (init_net.ipv6.vif6_table[ct].dev == dev)
/*
 * Core forwarding decision for a resolved entry: verify the packet
 * arrived on the expected input vif (else count wrong_if and possibly
 * raise a rate-limited WRONGMIF assert), then clone one copy per oif
 * whose TTL threshold the packet's hop limit exceeds.
 */
1534 static int ip6_mr_forward(struct sk_buff *skb, struct mfc6_cache *cache)
1539 vif = cache->mf6c_parent;
1540 cache->mfc_un.res.pkt++;
1541 cache->mfc_un.res.bytes += skb->len;
1544 * Wrong interface: drop packet and (maybe) send PIM assert.
1546 if (init_net.ipv6.vif6_table[vif].dev != skb->dev) {
1549 cache->mfc_un.res.wrong_if++;
1550 true_vifi = ip6mr_find_vif(skb->dev);
1552 if (true_vifi >= 0 && init_net.ipv6.mroute_do_assert &&
1553 /* pimsm uses asserts, when switching from RPT to SPT,
1554 so that we cannot check that packet arrived on an oif.
1555 It is bad, but otherwise we would need to move pretty
1556 large chunk of pimd to kernel. Ough... --ANK
1558 (init_net.ipv6.mroute_do_pim ||
1559 cache->mfc_un.res.ttls[true_vifi] < 255) &&
1561 cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1562 cache->mfc_un.res.last_assert = jiffies;
1563 ip6mr_cache_report(skb, true_vifi, MRT6MSG_WRONGMIF);
1568 init_net.ipv6.vif6_table[vif].pkt_in++;
1569 init_net.ipv6.vif6_table[vif].bytes_in += skb->len;
/* clone for each earlier oif; the last eligible oif gets the original */
1574 for (ct = cache->mfc_un.res.maxvif - 1; ct >= cache->mfc_un.res.minvif; ct--) {
1575 if (ipv6_hdr(skb)->hop_limit > cache->mfc_un.res.ttls[ct]) {
1577 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1579 ip6mr_forward2(skb2, cache, psend);
1585 ip6mr_forward2(skb, cache, psend);
1596 * Multicast packets for forwarding arrive here
/* Data-path entry point: look up the (src, dst) MFC entry under
 * mrt_lock; forward on hit, otherwise park the skb as unresolved. */
1599 int ip6_mr_input(struct sk_buff *skb)
1601 struct mfc6_cache *cache;
1603 read_lock(&mrt_lock);
1604 cache = ip6mr_cache_find(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr);
1607 * No usable cache entry
1609 if (cache == NULL) {
1612 vif = ip6mr_find_vif(skb->dev);
1614 int err = ip6mr_cache_unresolved(vif, skb);
1615 read_unlock(&mrt_lock);
1619 read_unlock(&mrt_lock);
1624 ip6_mr_forward(skb, cache);
1626 read_unlock(&mrt_lock);
/*
 * Append route attributes for MFC entry @c to a netlink skb: RTA_IIF
 * for the parent vif's device, then an RTA_MULTIPATH nest with one
 * rtnexthop per oif whose TTL threshold is set (< 255).
 */
1633 ip6mr_fill_mroute(struct sk_buff *skb, struct mfc6_cache *c, struct rtmsg *rtm)
1636 struct rtnexthop *nhp;
1637 struct net_device *dev = init_net.ipv6.vif6_table[c->mf6c_parent].dev;
1638 u8 *b = skb_tail_pointer(skb);
1639 struct rtattr *mp_head;
1642 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1644 mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
1646 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1647 if (c->mfc_un.res.ttls[ct] < 255) {
1648 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1649 goto rtattr_failure;
1650 nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1651 nhp->rtnh_flags = 0;
1652 nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1653 nhp->rtnh_ifindex = init_net.ipv6.vif6_table[ct].dev->ifindex;
1654 nhp->rtnh_len = sizeof(*nhp);
/* patch up the nest header now that its total length is known */
1657 mp_head->rta_type = RTA_MULTIPATH;
1658 mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1659 rtm->rtm_type = RTN_MULTICAST;
1667 int ip6mr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1670 struct mfc6_cache *cache;
1671 struct rt6_info *rt = (struct rt6_info *)skb->dst;
1673 read_lock(&mrt_lock);
1674 cache = ip6mr_cache_find(&rt->rt6i_src.addr, &rt->rt6i_dst.addr);
1677 struct sk_buff *skb2;
1678 struct ipv6hdr *iph;
1679 struct net_device *dev;
1683 read_unlock(&mrt_lock);
1688 if (dev == NULL || (vif = ip6mr_find_vif(dev)) < 0) {
1689 read_unlock(&mrt_lock);
1693 /* really correct? */
1694 skb2 = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
1696 read_unlock(&mrt_lock);
1700 skb_reset_transport_header(skb2);
1702 skb_put(skb2, sizeof(struct ipv6hdr));
1703 skb_reset_network_header(skb2);
1705 iph = ipv6_hdr(skb2);
1708 iph->flow_lbl[0] = 0;
1709 iph->flow_lbl[1] = 0;
1710 iph->flow_lbl[2] = 0;
1711 iph->payload_len = 0;
1712 iph->nexthdr = IPPROTO_NONE;
1714 ipv6_addr_copy(&iph->saddr, &rt->rt6i_src.addr);
1715 ipv6_addr_copy(&iph->daddr, &rt->rt6i_dst.addr);
1717 err = ip6mr_cache_unresolved(vif, skb2);
1718 read_unlock(&mrt_lock);
1723 if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1724 cache->mfc_flags |= MFC_NOTIFY;
1726 err = ip6mr_fill_mroute(skb, cache, rtm);
1727 read_unlock(&mrt_lock);