net/ipv4/ipmr.c
1/*
2 * IP multicast routing support for mrouted 3.6/3.8
3 *
4 * (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
5 * Linux Consultancy and Custom Driver Development
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Fixes:
13 * Michael Chastain : Incorrect size of copying.
14 * Alan Cox : Added the cache manager code
15 * Alan Cox : Fixed the clone/copy bug and device race.
16 * Mike McLagan : Routing by source
17 * Malcolm Beattie : Buffer handling fixes.
18 * Alexey Kuznetsov : Double buffer free and other fixes.
19 * SVR Anand : Fixed several multicast bugs and problems.
20 * Alexey Kuznetsov : Status, optimisations and more.
21 * Brad Parker : Better behaviour on mrouted upcall
22 * overflow.
23 * Carlos Picoto : PIMv1 Support
24 * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header
 25 * Relax this requirement to work with older peers.
26 *
27 */
28
29#include <asm/system.h>
30#include <asm/uaccess.h>
31#include <linux/types.h>
32#include <linux/capability.h>
33#include <linux/errno.h>
34#include <linux/timer.h>
35#include <linux/mm.h>
36#include <linux/kernel.h>
37#include <linux/fcntl.h>
38#include <linux/stat.h>
39#include <linux/socket.h>
40#include <linux/in.h>
41#include <linux/inet.h>
42#include <linux/netdevice.h>
43#include <linux/inetdevice.h>
44#include <linux/igmp.h>
45#include <linux/proc_fs.h>
46#include <linux/seq_file.h>
47#include <linux/mroute.h>
48#include <linux/init.h>
49#include <linux/if_ether.h>
50#include <net/net_namespace.h>
51#include <net/ip.h>
52#include <net/protocol.h>
53#include <linux/skbuff.h>
54#include <net/route.h>
55#include <net/sock.h>
56#include <net/icmp.h>
57#include <net/udp.h>
58#include <net/raw.h>
59#include <linux/notifier.h>
60#include <linux/if_arp.h>
61#include <linux/netfilter_ipv4.h>
62#include <net/ipip.h>
63#include <net/checksum.h>
64#include <net/netlink.h>
65
66#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
67#define CONFIG_IP_PIMSM 1
68#endif
69
 70/* Big lock, protecting the vif table, mrt cache and mroute socket state.
 71 Note that changes are serialized via rtnl_lock.
 72 */
73
74static DEFINE_RWLOCK(mrt_lock);
75
76/*
77 * Multicast router control variables
78 */
79
80#define VIF_EXISTS(_net, _idx) ((_net)->ipv4.vif_table[_idx].dev != NULL)
81
82static struct mfc_cache *mfc_unres_queue; /* Queue of unresolved entries */
83
84/* Special spinlock for queue of unresolved entries */
85static DEFINE_SPINLOCK(mfc_unres_lock);
86
 87/* We return to Alan's original scheme. The hash table of resolved
 88 entries is changed only in process context and is protected
 89 by the weak lock mrt_lock. The queue of unresolved entries is
 90 protected by the strong spinlock mfc_unres_lock.
 91
 92 This way the data path is entirely free of exclusive locks.
 93 */
94
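/*
 * A minimal illustrative sketch (not part of the build) of how the two
 * locks above are meant to be paired.  Data-path lookups take mrt_lock
 * for reading, control-path updates take it for writing, and the
 * unresolved queue is only ever touched under mfc_unres_lock.  The
 * helpers named here are defined later in this file:
 *
 *	read_lock(&mrt_lock);
 *	c = ipmr_cache_find(net, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
 *	... use c; it cannot go away while mrt_lock is held for reading ...
 *	read_unlock(&mrt_lock);
 *
 *	spin_lock_bh(&mfc_unres_lock);
 *	... walk or modify mfc_unres_queue ...
 *	spin_unlock_bh(&mfc_unres_lock);
 */
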
95static struct kmem_cache *mrt_cachep __read_mostly;
96
97static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
98static int ipmr_cache_report(struct net *net,
99 struct sk_buff *pkt, vifi_t vifi, int assert);
100static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
101
102#ifdef CONFIG_IP_PIMSM_V2
103static struct net_protocol pim_protocol;
104#endif
105
106static struct timer_list ipmr_expire_timer;
107
108/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
109
110static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
111{
112 struct net *net = dev_net(dev);
113
114 dev_close(dev);
115
116 dev = __dev_get_by_name(net, "tunl0");
117 if (dev) {
118 const struct net_device_ops *ops = dev->netdev_ops;
119 struct ifreq ifr;
120 struct ip_tunnel_parm p;
121
122 memset(&p, 0, sizeof(p));
123 p.iph.daddr = v->vifc_rmt_addr.s_addr;
124 p.iph.saddr = v->vifc_lcl_addr.s_addr;
125 p.iph.version = 4;
126 p.iph.ihl = 5;
127 p.iph.protocol = IPPROTO_IPIP;
128 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
129 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
130
131 if (ops->ndo_do_ioctl) {
132 mm_segment_t oldfs = get_fs();
133
134 set_fs(KERNEL_DS);
135 ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
136 set_fs(oldfs);
137 }
138 }
139}
140
141static
142struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
143{
144 struct net_device *dev;
145
146 dev = __dev_get_by_name(net, "tunl0");
147
148 if (dev) {
149 const struct net_device_ops *ops = dev->netdev_ops;
150 int err;
151 struct ifreq ifr;
152 struct ip_tunnel_parm p;
153 struct in_device *in_dev;
154
155 memset(&p, 0, sizeof(p));
156 p.iph.daddr = v->vifc_rmt_addr.s_addr;
157 p.iph.saddr = v->vifc_lcl_addr.s_addr;
158 p.iph.version = 4;
159 p.iph.ihl = 5;
160 p.iph.protocol = IPPROTO_IPIP;
161 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
162 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
163
164 if (ops->ndo_do_ioctl) {
165 mm_segment_t oldfs = get_fs();
166
167 set_fs(KERNEL_DS);
168 err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
169 set_fs(oldfs);
170 } else
171 err = -EOPNOTSUPP;
172
173 dev = NULL;
174
175 if (err == 0 &&
176 (dev = __dev_get_by_name(net, p.name)) != NULL) {
177 dev->flags |= IFF_MULTICAST;
178
179 in_dev = __in_dev_get_rtnl(dev);
180 if (in_dev == NULL)
181 goto failure;
182
183 ipv4_devconf_setall(in_dev);
184 IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
185
186 if (dev_open(dev))
187 goto failure;
188 dev_hold(dev);
189 }
190 }
191 return dev;
192
193failure:
194 /* allow the register to be completed before unregistering. */
195 rtnl_unlock();
196 rtnl_lock();
197
198 unregister_netdevice(dev);
199 return NULL;
200}
201
202#ifdef CONFIG_IP_PIMSM
203
204static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
205{
206 struct net *net = dev_net(dev);
207
208 read_lock(&mrt_lock);
209 dev->stats.tx_bytes += skb->len;
210 dev->stats.tx_packets++;
211 ipmr_cache_report(net, skb, net->ipv4.mroute_reg_vif_num,
212 IGMPMSG_WHOLEPKT);
213 read_unlock(&mrt_lock);
214 kfree_skb(skb);
215 return 0;
216}
217
218static const struct net_device_ops reg_vif_netdev_ops = {
219 .ndo_start_xmit = reg_vif_xmit,
220};
221
222static void reg_vif_setup(struct net_device *dev)
223{
224 dev->type = ARPHRD_PIMREG;
225 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
226 dev->flags = IFF_NOARP;
 227 dev->netdev_ops = &reg_vif_netdev_ops;
228 dev->destructor = free_netdev;
229}
230
231static struct net_device *ipmr_reg_vif(void)
232{
233 struct net_device *dev;
234 struct in_device *in_dev;
235
236 dev = alloc_netdev(0, "pimreg", reg_vif_setup);
237
238 if (dev == NULL)
239 return NULL;
240
241 if (register_netdevice(dev)) {
242 free_netdev(dev);
243 return NULL;
244 }
245 dev->iflink = 0;
246
247 rcu_read_lock();
248 if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
249 rcu_read_unlock();
250 goto failure;
251 }
252
253 ipv4_devconf_setall(in_dev);
254 IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
255 rcu_read_unlock();
256
257 if (dev_open(dev))
258 goto failure;
259
260 dev_hold(dev);
261
262 return dev;
263
264failure:
265 /* allow the register to be completed before unregistering. */
266 rtnl_unlock();
267 rtnl_lock();
268
269 unregister_netdevice(dev);
270 return NULL;
271}
272#endif
273
274/*
275 * Delete a VIF entry
 276 * @notify: Set to 1 if the caller is a notifier_call
277 */
278
279static int vif_delete(struct net *net, int vifi, int notify)
280{
281 struct vif_device *v;
282 struct net_device *dev;
283 struct in_device *in_dev;
284
285 if (vifi < 0 || vifi >= net->ipv4.maxvif)
286 return -EADDRNOTAVAIL;
287
288 v = &net->ipv4.vif_table[vifi];
289
290 write_lock_bh(&mrt_lock);
291 dev = v->dev;
292 v->dev = NULL;
293
294 if (!dev) {
295 write_unlock_bh(&mrt_lock);
296 return -EADDRNOTAVAIL;
297 }
298
299#ifdef CONFIG_IP_PIMSM
300 if (vifi == net->ipv4.mroute_reg_vif_num)
301 net->ipv4.mroute_reg_vif_num = -1;
302#endif
303
304 if (vifi+1 == net->ipv4.maxvif) {
305 int tmp;
306 for (tmp=vifi-1; tmp>=0; tmp--) {
307 if (VIF_EXISTS(net, tmp))
308 break;
309 }
310 net->ipv4.maxvif = tmp+1;
311 }
312
313 write_unlock_bh(&mrt_lock);
314
315 dev_set_allmulti(dev, -1);
316
317 if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
318 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
319 ip_rt_multicast_event(in_dev);
320 }
321
322 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
323 unregister_netdevice(dev);
324
325 dev_put(dev);
326 return 0;
327}
328
329static inline void ipmr_cache_free(struct mfc_cache *c)
330{
331 release_net(mfc_net(c));
332 kmem_cache_free(mrt_cachep, c);
333}
334
335/* Destroy an unresolved cache entry, killing queued skbs
336 and reporting error to netlink readers.
337 */
338
339static void ipmr_destroy_unres(struct mfc_cache *c)
340{
341 struct sk_buff *skb;
342 struct nlmsgerr *e;
343 struct net *net = mfc_net(c);
344
345 atomic_dec(&net->ipv4.cache_resolve_queue_len);
346
347 while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
348 if (ip_hdr(skb)->version == 0) {
349 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
350 nlh->nlmsg_type = NLMSG_ERROR;
351 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
352 skb_trim(skb, nlh->nlmsg_len);
353 e = NLMSG_DATA(nlh);
354 e->error = -ETIMEDOUT;
355 memset(&e->msg, 0, sizeof(e->msg));
356
357 rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
358 } else
359 kfree_skb(skb);
360 }
361
362 ipmr_cache_free(c);
363}
364
365
366/* Single timer process for all the unresolved queue. */
367
368static void ipmr_expire_process(unsigned long dummy)
369{
370 unsigned long now;
371 unsigned long expires;
372 struct mfc_cache *c, **cp;
373
374 if (!spin_trylock(&mfc_unres_lock)) {
375 mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
376 return;
377 }
378
379 if (mfc_unres_queue == NULL)
380 goto out;
381
382 now = jiffies;
383 expires = 10*HZ;
384 cp = &mfc_unres_queue;
385
386 while ((c=*cp) != NULL) {
387 if (time_after(c->mfc_un.unres.expires, now)) {
388 unsigned long interval = c->mfc_un.unres.expires - now;
389 if (interval < expires)
390 expires = interval;
391 cp = &c->next;
392 continue;
393 }
394
395 *cp = c->next;
396
397 ipmr_destroy_unres(c);
398 }
399
400 if (mfc_unres_queue != NULL)
401 mod_timer(&ipmr_expire_timer, jiffies + expires);
402
403out:
404 spin_unlock(&mfc_unres_lock);
405}
406
407/* Fill oifs list. It is called under write locked mrt_lock. */
408
409static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
410{
411 int vifi;
412 struct net *net = mfc_net(cache);
413
414 cache->mfc_un.res.minvif = MAXVIFS;
415 cache->mfc_un.res.maxvif = 0;
416 memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
417
418 for (vifi = 0; vifi < net->ipv4.maxvif; vifi++) {
419 if (VIF_EXISTS(net, vifi) &&
420 ttls[vifi] && ttls[vifi] < 255) {
421 cache->mfc_un.res.ttls[vifi] = ttls[vifi];
422 if (cache->mfc_un.res.minvif > vifi)
423 cache->mfc_un.res.minvif = vifi;
424 if (cache->mfc_un.res.maxvif <= vifi)
425 cache->mfc_un.res.maxvif = vifi + 1;
426 }
427 }
428}
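/*
 * Worked example (illustrative only, assuming vifs 1 and 3 exist): with a
 * user-supplied ttls[] of { 0, 1, 0, 64, 0, ... } the loop above stores
 * ttls[1] = 1 and ttls[3] = 64, leaves every other slot at 255 ("never
 * forward"), and sets minvif = 1, maxvif = 4.  ip_mr_forward() then scans
 * only vifs 1..3 and forwards on a vif when the packet TTL is greater
 * than the stored threshold.
 */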
429
430static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
431{
432 int vifi = vifc->vifc_vifi;
433 struct vif_device *v = &net->ipv4.vif_table[vifi];
434 struct net_device *dev;
435 struct in_device *in_dev;
436 int err;
437
438 /* Is vif busy ? */
439 if (VIF_EXISTS(net, vifi))
440 return -EADDRINUSE;
441
442 switch (vifc->vifc_flags) {
443#ifdef CONFIG_IP_PIMSM
444 case VIFF_REGISTER:
445 /*
446 * Special Purpose VIF in PIM
447 * All the packets will be sent to the daemon
448 */
449 if (net->ipv4.mroute_reg_vif_num >= 0)
450 return -EADDRINUSE;
451 dev = ipmr_reg_vif();
452 if (!dev)
453 return -ENOBUFS;
454 err = dev_set_allmulti(dev, 1);
455 if (err) {
456 unregister_netdevice(dev);
457 dev_put(dev);
458 return err;
459 }
460 break;
461#endif
462 case VIFF_TUNNEL:
463 dev = ipmr_new_tunnel(net, vifc);
464 if (!dev)
465 return -ENOBUFS;
466 err = dev_set_allmulti(dev, 1);
467 if (err) {
468 ipmr_del_tunnel(dev, vifc);
469 dev_put(dev);
470 return err;
471 }
472 break;
473 case 0:
474 dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
475 if (!dev)
476 return -EADDRNOTAVAIL;
477 err = dev_set_allmulti(dev, 1);
478 if (err) {
479 dev_put(dev);
480 return err;
481 }
482 break;
483 default:
484 return -EINVAL;
485 }
486
487 if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
488 return -EADDRNOTAVAIL;
489 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
490 ip_rt_multicast_event(in_dev);
491
492 /*
493 * Fill in the VIF structures
494 */
495 v->rate_limit = vifc->vifc_rate_limit;
496 v->local = vifc->vifc_lcl_addr.s_addr;
497 v->remote = vifc->vifc_rmt_addr.s_addr;
498 v->flags = vifc->vifc_flags;
499 if (!mrtsock)
500 v->flags |= VIFF_STATIC;
501 v->threshold = vifc->vifc_threshold;
502 v->bytes_in = 0;
503 v->bytes_out = 0;
504 v->pkt_in = 0;
505 v->pkt_out = 0;
506 v->link = dev->ifindex;
507 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
508 v->link = dev->iflink;
509
510 /* And finish update writing critical data */
511 write_lock_bh(&mrt_lock);
512 v->dev = dev;
513#ifdef CONFIG_IP_PIMSM
514 if (v->flags&VIFF_REGISTER)
515 net->ipv4.mroute_reg_vif_num = vifi;
516#endif
517 if (vifi+1 > net->ipv4.maxvif)
518 net->ipv4.maxvif = vifi+1;
519 write_unlock_bh(&mrt_lock);
520 return 0;
521}
522
523static struct mfc_cache *ipmr_cache_find(struct net *net,
524 __be32 origin,
525 __be32 mcastgrp)
526{
527 int line = MFC_HASH(mcastgrp, origin);
528 struct mfc_cache *c;
529
530 for (c = net->ipv4.mfc_cache_array[line]; c; c = c->next) {
531 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
532 break;
533 }
534 return c;
535}
536
537/*
538 * Allocate a multicast cache entry
539 */
540static struct mfc_cache *ipmr_cache_alloc(struct net *net)
541{
542 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
543 if (c == NULL)
544 return NULL;
545 c->mfc_un.res.minvif = MAXVIFS;
546 mfc_net_set(c, net);
547 return c;
548}
549
550static struct mfc_cache *ipmr_cache_alloc_unres(struct net *net)
551{
552 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
553 if (c == NULL)
554 return NULL;
555 skb_queue_head_init(&c->mfc_un.unres.unresolved);
556 c->mfc_un.unres.expires = jiffies + 10*HZ;
557 mfc_net_set(c, net);
558 return c;
559}
560
561/*
562 * A cache entry has gone into a resolved state from queued
563 */
564
565static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
566{
567 struct sk_buff *skb;
568 struct nlmsgerr *e;
569
570 /*
571 * Play the pending entries through our router
572 */
573
574 while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
575 if (ip_hdr(skb)->version == 0) {
576 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
577
578 if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
579 nlh->nlmsg_len = (skb_tail_pointer(skb) -
580 (u8 *)nlh);
581 } else {
582 nlh->nlmsg_type = NLMSG_ERROR;
583 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
584 skb_trim(skb, nlh->nlmsg_len);
585 e = NLMSG_DATA(nlh);
586 e->error = -EMSGSIZE;
587 memset(&e->msg, 0, sizeof(e->msg));
588 }
589
590 rtnl_unicast(skb, mfc_net(c), NETLINK_CB(skb).pid);
591 } else
592 ip_mr_forward(skb, c, 0);
593 }
594}
595
596/*
597 * Bounce a cache query up to mrouted. We could use netlink for this but mrouted
598 * expects the following bizarre scheme.
599 *
600 * Called under mrt_lock.
601 */
602
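/*
 * Illustrative sketch (not built here) of the receiving side of the scheme
 * described above: the daemon reads the upcall from its raw IGMP socket
 * (mrouter_fd is a hypothetical name for that descriptor) and interprets
 * the buffer as a struct igmpmsg, whose im_msgtype field distinguishes the
 * cases generated below:
 *
 *	char buf[2048];
 *	ssize_t n = recv(mrouter_fd, buf, sizeof(buf), 0);
 *	struct igmpmsg *im = (struct igmpmsg *)buf;
 *
 *	if (n >= (ssize_t)sizeof(*im) && im->im_mbz == 0) {
 *		switch (im->im_msgtype) {
 *		case IGMPMSG_NOCACHE:	... resolve (im->im_src, im->im_dst)
 *					    and install an MFC entry ...
 *		case IGMPMSG_WRONGVIF:	... PIM assert processing ...
 *		case IGMPMSG_WHOLEPKT:	... PIM register processing ...
 *		}
 *	}
 */
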
603static int ipmr_cache_report(struct net *net,
604 struct sk_buff *pkt, vifi_t vifi, int assert)
605{
606 struct sk_buff *skb;
607 const int ihl = ip_hdrlen(pkt);
608 struct igmphdr *igmp;
609 struct igmpmsg *msg;
610 int ret;
611
612#ifdef CONFIG_IP_PIMSM
613 if (assert == IGMPMSG_WHOLEPKT)
614 skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
615 else
616#endif
617 skb = alloc_skb(128, GFP_ATOMIC);
618
619 if (!skb)
620 return -ENOBUFS;
621
622#ifdef CONFIG_IP_PIMSM
623 if (assert == IGMPMSG_WHOLEPKT) {
624 /* Ugly, but we have no choice with this interface.
625 Duplicate old header, fix ihl, length etc.
626 And all this only to mangle msg->im_msgtype and
627 to set msg->im_mbz to "mbz" :-)
628 */
629 skb_push(skb, sizeof(struct iphdr));
630 skb_reset_network_header(skb);
631 skb_reset_transport_header(skb);
632 msg = (struct igmpmsg *)skb_network_header(skb);
633 memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
634 msg->im_msgtype = IGMPMSG_WHOLEPKT;
635 msg->im_mbz = 0;
636 msg->im_vif = net->ipv4.mroute_reg_vif_num;
637 ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
638 ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
639 sizeof(struct iphdr));
640 } else
641#endif
642 {
643
644 /*
645 * Copy the IP header
646 */
647
648 skb->network_header = skb->tail;
649 skb_put(skb, ihl);
650 skb_copy_to_linear_data(skb, pkt->data, ihl);
 651 ip_hdr(skb)->protocol = 0; /* Flag to the daemon: this is a cache-miss / route add request */
652 msg = (struct igmpmsg *)skb_network_header(skb);
653 msg->im_vif = vifi;
654 skb->dst = dst_clone(pkt->dst);
655
656 /*
657 * Add our header
658 */
659
660 igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
661 igmp->type =
662 msg->im_msgtype = assert;
663 igmp->code = 0;
664 ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
665 skb->transport_header = skb->network_header;
666 }
667
668 if (net->ipv4.mroute_sk == NULL) {
669 kfree_skb(skb);
670 return -EINVAL;
671 }
672
673 /*
674 * Deliver to mrouted
675 */
676 ret = sock_queue_rcv_skb(net->ipv4.mroute_sk, skb);
677 if (ret < 0) {
678 if (net_ratelimit())
679 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
680 kfree_skb(skb);
681 }
682
683 return ret;
684}
685
 686/*
 687 * Queue a packet for resolution. Looks up or creates an unresolved cache entry under mfc_unres_lock.
 688 */
689
690static int
691ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
692{
693 int err;
694 struct mfc_cache *c;
695 const struct iphdr *iph = ip_hdr(skb);
696
697 spin_lock_bh(&mfc_unres_lock);
698 for (c=mfc_unres_queue; c; c=c->next) {
699 if (net_eq(mfc_net(c), net) &&
700 c->mfc_mcastgrp == iph->daddr &&
701 c->mfc_origin == iph->saddr)
702 break;
703 }
704
705 if (c == NULL) {
706 /*
707 * Create a new entry if allowable
708 */
709
710 if (atomic_read(&net->ipv4.cache_resolve_queue_len) >= 10 ||
711 (c = ipmr_cache_alloc_unres(net)) == NULL) {
712 spin_unlock_bh(&mfc_unres_lock);
713
714 kfree_skb(skb);
715 return -ENOBUFS;
716 }
717
718 /*
719 * Fill in the new cache entry
720 */
721 c->mfc_parent = -1;
722 c->mfc_origin = iph->saddr;
723 c->mfc_mcastgrp = iph->daddr;
724
725 /*
726 * Reflect first query at mrouted.
727 */
728 err = ipmr_cache_report(net, skb, vifi, IGMPMSG_NOCACHE);
729 if (err < 0) {
730 /* If the report failed throw the cache entry
731 out - Brad Parker
732 */
733 spin_unlock_bh(&mfc_unres_lock);
734
735 ipmr_cache_free(c);
736 kfree_skb(skb);
737 return err;
738 }
739
740 atomic_inc(&net->ipv4.cache_resolve_queue_len);
741 c->next = mfc_unres_queue;
742 mfc_unres_queue = c;
743
744 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
745 }
746
747 /*
748 * See if we can append the packet
749 */
750 if (c->mfc_un.unres.unresolved.qlen>3) {
751 kfree_skb(skb);
752 err = -ENOBUFS;
753 } else {
754 skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
755 err = 0;
756 }
757
758 spin_unlock_bh(&mfc_unres_lock);
759 return err;
760}
761
762/*
763 * MFC cache manipulation by user space mroute daemon
764 */
765
766static int ipmr_mfc_delete(struct net *net, struct mfcctl *mfc)
767{
768 int line;
769 struct mfc_cache *c, **cp;
770
771 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
772
773 for (cp = &net->ipv4.mfc_cache_array[line];
774 (c = *cp) != NULL; cp = &c->next) {
775 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
776 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
777 write_lock_bh(&mrt_lock);
778 *cp = c->next;
779 write_unlock_bh(&mrt_lock);
780
781 ipmr_cache_free(c);
782 return 0;
783 }
784 }
785 return -ENOENT;
786}
787
788static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
789{
790 int line;
791 struct mfc_cache *uc, *c, **cp;
792
793 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
794
795 for (cp = &net->ipv4.mfc_cache_array[line];
796 (c = *cp) != NULL; cp = &c->next) {
797 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
798 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
799 break;
800 }
801
802 if (c != NULL) {
803 write_lock_bh(&mrt_lock);
804 c->mfc_parent = mfc->mfcc_parent;
805 ipmr_update_thresholds(c, mfc->mfcc_ttls);
806 if (!mrtsock)
807 c->mfc_flags |= MFC_STATIC;
808 write_unlock_bh(&mrt_lock);
809 return 0;
810 }
811
812 if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
813 return -EINVAL;
814
815 c = ipmr_cache_alloc(net);
816 if (c == NULL)
817 return -ENOMEM;
818
819 c->mfc_origin = mfc->mfcc_origin.s_addr;
820 c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
821 c->mfc_parent = mfc->mfcc_parent;
822 ipmr_update_thresholds(c, mfc->mfcc_ttls);
823 if (!mrtsock)
824 c->mfc_flags |= MFC_STATIC;
825
826 write_lock_bh(&mrt_lock);
827 c->next = net->ipv4.mfc_cache_array[line];
828 net->ipv4.mfc_cache_array[line] = c;
829 write_unlock_bh(&mrt_lock);
830
831 /*
832 * Check to see if we resolved a queued list. If so we
833 * need to send on the frames and tidy up.
834 */
835 spin_lock_bh(&mfc_unres_lock);
836 for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
837 cp = &uc->next) {
838 if (net_eq(mfc_net(uc), net) &&
839 uc->mfc_origin == c->mfc_origin &&
840 uc->mfc_mcastgrp == c->mfc_mcastgrp) {
841 *cp = uc->next;
842 atomic_dec(&net->ipv4.cache_resolve_queue_len);
843 break;
844 }
845 }
846 if (mfc_unres_queue == NULL)
847 del_timer(&ipmr_expire_timer);
848 spin_unlock_bh(&mfc_unres_lock);
849
850 if (uc) {
851 ipmr_cache_resolve(uc, c);
852 ipmr_cache_free(uc);
853 }
854 return 0;
855}
856
857/*
858 * Close the multicast socket, and clear the vif tables etc
859 */
860
861static void mroute_clean_tables(struct net *net)
862{
863 int i;
864
865 /*
866 * Shut down all active vif entries
867 */
868 for (i = 0; i < net->ipv4.maxvif; i++) {
869 if (!(net->ipv4.vif_table[i].flags&VIFF_STATIC))
870 vif_delete(net, i, 0);
871 }
872
873 /*
874 * Wipe the cache
875 */
876 for (i=0; i<MFC_LINES; i++) {
877 struct mfc_cache *c, **cp;
878
879 cp = &net->ipv4.mfc_cache_array[i];
880 while ((c = *cp) != NULL) {
881 if (c->mfc_flags&MFC_STATIC) {
882 cp = &c->next;
883 continue;
884 }
885 write_lock_bh(&mrt_lock);
886 *cp = c->next;
887 write_unlock_bh(&mrt_lock);
888
889 ipmr_cache_free(c);
890 }
891 }
892
893 if (atomic_read(&net->ipv4.cache_resolve_queue_len) != 0) {
894 struct mfc_cache *c, **cp;
895
896 spin_lock_bh(&mfc_unres_lock);
897 cp = &mfc_unres_queue;
898 while ((c = *cp) != NULL) {
899 if (!net_eq(mfc_net(c), net)) {
900 cp = &c->next;
901 continue;
902 }
903 *cp = c->next;
904
905 ipmr_destroy_unres(c);
906 }
907 spin_unlock_bh(&mfc_unres_lock);
908 }
909}
910
911static void mrtsock_destruct(struct sock *sk)
912{
913 struct net *net = sock_net(sk);
914
915 rtnl_lock();
916 if (sk == net->ipv4.mroute_sk) {
917 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
918
919 write_lock_bh(&mrt_lock);
920 net->ipv4.mroute_sk = NULL;
921 write_unlock_bh(&mrt_lock);
922
923 mroute_clean_tables(net);
924 }
925 rtnl_unlock();
926}
927
928/*
929 * Socket options and virtual interface manipulation. The whole
930 * virtual interface system is a complete heap, but unfortunately
931 * that's how BSD mrouted happens to think. Maybe one day with a proper
932 * MOSPF/PIM router set up we can clean this up.
933 */
934
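/*
 * A hedged userspace sketch (not built here) of the control sequence a
 * daemon typically drives against this function; the MRT_* options and
 * struct vifctl/mfcctl come from <linux/mroute.h>, while local_if_addr,
 * src and grp are hypothetical values:
 *
 *	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *	int one = 1;
 *	struct vifctl vc = { .vifc_vifi = 0, .vifc_threshold = 1 };
 *	struct mfcctl mc = { .mfcc_parent = 0 };
 *
 *	setsockopt(fd, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
 *
 *	vc.vifc_lcl_addr.s_addr = local_if_addr;
 *	setsockopt(fd, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 *	... a second MRT_ADD_VIF for the outgoing vif 1 is omitted ...
 *
 *	mc.mfcc_origin.s_addr = src;
 *	mc.mfcc_mcastgrp.s_addr = grp;
 *	mc.mfcc_ttls[1] = 1;		(forward on vif 1, TTL threshold 1)
 *	setsockopt(fd, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));
 *
 *	setsockopt(fd, IPPROTO_IP, MRT_DONE, &one, sizeof(one));
 */
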
935int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int optlen)
936{
937 int ret;
938 struct vifctl vif;
939 struct mfcctl mfc;
940 struct net *net = sock_net(sk);
941
942 if (optname != MRT_INIT) {
943 if (sk != net->ipv4.mroute_sk && !capable(CAP_NET_ADMIN))
944 return -EACCES;
945 }
946
947 switch (optname) {
948 case MRT_INIT:
949 if (sk->sk_type != SOCK_RAW ||
950 inet_sk(sk)->num != IPPROTO_IGMP)
951 return -EOPNOTSUPP;
952 if (optlen != sizeof(int))
953 return -ENOPROTOOPT;
954
955 rtnl_lock();
956 if (net->ipv4.mroute_sk) {
957 rtnl_unlock();
958 return -EADDRINUSE;
959 }
960
961 ret = ip_ra_control(sk, 1, mrtsock_destruct);
962 if (ret == 0) {
963 write_lock_bh(&mrt_lock);
964 net->ipv4.mroute_sk = sk;
965 write_unlock_bh(&mrt_lock);
966
967 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
968 }
969 rtnl_unlock();
970 return ret;
971 case MRT_DONE:
972 if (sk != net->ipv4.mroute_sk)
973 return -EACCES;
974 return ip_ra_control(sk, 0, NULL);
975 case MRT_ADD_VIF:
976 case MRT_DEL_VIF:
977 if (optlen != sizeof(vif))
978 return -EINVAL;
979 if (copy_from_user(&vif, optval, sizeof(vif)))
980 return -EFAULT;
981 if (vif.vifc_vifi >= MAXVIFS)
982 return -ENFILE;
983 rtnl_lock();
984 if (optname == MRT_ADD_VIF) {
985 ret = vif_add(net, &vif, sk == net->ipv4.mroute_sk);
986 } else {
987 ret = vif_delete(net, vif.vifc_vifi, 0);
988 }
989 rtnl_unlock();
990 return ret;
991
992 /*
993 * Manipulate the forwarding caches. These live
994 * in a sort of kernel/user symbiosis.
995 */
996 case MRT_ADD_MFC:
997 case MRT_DEL_MFC:
998 if (optlen != sizeof(mfc))
999 return -EINVAL;
1000 if (copy_from_user(&mfc, optval, sizeof(mfc)))
1001 return -EFAULT;
1002 rtnl_lock();
1003 if (optname == MRT_DEL_MFC)
1004 ret = ipmr_mfc_delete(net, &mfc);
1005 else
1006 ret = ipmr_mfc_add(net, &mfc, sk == net->ipv4.mroute_sk);
1007 rtnl_unlock();
1008 return ret;
1009 /*
1010 * Control PIM assert.
1011 */
1012 case MRT_ASSERT:
1013 {
1014 int v;
1015 if (get_user(v,(int __user *)optval))
1016 return -EFAULT;
1017 net->ipv4.mroute_do_assert = (v) ? 1 : 0;
1018 return 0;
1019 }
1020#ifdef CONFIG_IP_PIMSM
1021 case MRT_PIM:
1022 {
1023 int v;
1024
1025 if (get_user(v,(int __user *)optval))
1026 return -EFAULT;
1027 v = (v) ? 1 : 0;
1028
1029 rtnl_lock();
1030 ret = 0;
1031 if (v != net->ipv4.mroute_do_pim) {
1032 net->ipv4.mroute_do_pim = v;
1033 net->ipv4.mroute_do_assert = v;
1034#ifdef CONFIG_IP_PIMSM_V2
1035 if (net->ipv4.mroute_do_pim)
1036 ret = inet_add_protocol(&pim_protocol,
1037 IPPROTO_PIM);
1038 else
1039 ret = inet_del_protocol(&pim_protocol,
1040 IPPROTO_PIM);
1041 if (ret < 0)
1042 ret = -EAGAIN;
1043#endif
1044 }
1045 rtnl_unlock();
1046 return ret;
1047 }
1048#endif
1049 /*
1050 * Spurious command, or MRT_VERSION which you cannot
1051 * set.
1052 */
1053 default:
1054 return -ENOPROTOOPT;
1055 }
1056}
1057
1058/*
1059 * Getsock opt support for the multicast routing system.
1060 */
1061
1062int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
1063{
1064 int olr;
1065 int val;
1066 struct net *net = sock_net(sk);
1067
1068 if (optname != MRT_VERSION &&
1069#ifdef CONFIG_IP_PIMSM
1070 optname!=MRT_PIM &&
1071#endif
1072 optname!=MRT_ASSERT)
1073 return -ENOPROTOOPT;
1074
1075 if (get_user(olr, optlen))
1076 return -EFAULT;
1077
1078 olr = min_t(unsigned int, olr, sizeof(int));
1079 if (olr < 0)
1080 return -EINVAL;
1081
1082 if (put_user(olr, optlen))
1083 return -EFAULT;
1084 if (optname == MRT_VERSION)
1085 val = 0x0305;
1086#ifdef CONFIG_IP_PIMSM
1087 else if (optname == MRT_PIM)
1088 val = net->ipv4.mroute_do_pim;
1089#endif
1090 else
1091 val = net->ipv4.mroute_do_assert;
1092 if (copy_to_user(optval, &val, olr))
1093 return -EFAULT;
1094 return 0;
1095}
1096
1097/*
1098 * The IP multicast ioctl support routines.
1099 */
1100
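/*
 * Illustrative sketch (not built here): querying the per-vif counters
 * maintained below from userspace; "fd" stands for the daemon's raw IGMP
 * socket (a hypothetical name):
 *
 *	struct sioc_vif_req vr = { .vifi = 0 };
 *
 *	if (ioctl(fd, SIOCGETVIFCNT, &vr) == 0)
 *		printf("vif0: %lu pkts in, %lu pkts out\n",
 *		       vr.icount, vr.ocount);
 */
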
1101int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1102{
1103 struct sioc_sg_req sr;
1104 struct sioc_vif_req vr;
1105 struct vif_device *vif;
1106 struct mfc_cache *c;
1107 struct net *net = sock_net(sk);
1108
1109 switch (cmd) {
1110 case SIOCGETVIFCNT:
1111 if (copy_from_user(&vr, arg, sizeof(vr)))
1112 return -EFAULT;
1113 if (vr.vifi >= net->ipv4.maxvif)
1114 return -EINVAL;
1115 read_lock(&mrt_lock);
1116 vif = &net->ipv4.vif_table[vr.vifi];
1117 if (VIF_EXISTS(net, vr.vifi)) {
1118 vr.icount = vif->pkt_in;
1119 vr.ocount = vif->pkt_out;
1120 vr.ibytes = vif->bytes_in;
1121 vr.obytes = vif->bytes_out;
1122 read_unlock(&mrt_lock);
1123
1124 if (copy_to_user(arg, &vr, sizeof(vr)))
1125 return -EFAULT;
1126 return 0;
1127 }
1128 read_unlock(&mrt_lock);
1129 return -EADDRNOTAVAIL;
1130 case SIOCGETSGCNT:
1131 if (copy_from_user(&sr, arg, sizeof(sr)))
1132 return -EFAULT;
1133
1134 read_lock(&mrt_lock);
1135 c = ipmr_cache_find(net, sr.src.s_addr, sr.grp.s_addr);
1136 if (c) {
1137 sr.pktcnt = c->mfc_un.res.pkt;
1138 sr.bytecnt = c->mfc_un.res.bytes;
1139 sr.wrong_if = c->mfc_un.res.wrong_if;
1140 read_unlock(&mrt_lock);
1141
1142 if (copy_to_user(arg, &sr, sizeof(sr)))
1143 return -EFAULT;
1144 return 0;
1145 }
1146 read_unlock(&mrt_lock);
1147 return -EADDRNOTAVAIL;
1148 default:
1149 return -ENOIOCTLCMD;
1150 }
1151}
1152
1153
1154static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1155{
1156 struct net_device *dev = ptr;
1157 struct net *net = dev_net(dev);
1158 struct vif_device *v;
1159 int ct;
1160
1161 if (!net_eq(dev_net(dev), net))
1162 return NOTIFY_DONE;
1163
1164 if (event != NETDEV_UNREGISTER)
1165 return NOTIFY_DONE;
1166 v = &net->ipv4.vif_table[0];
1167 for (ct = 0; ct < net->ipv4.maxvif; ct++, v++) {
1168 if (v->dev == dev)
1169 vif_delete(net, ct, 1);
1170 }
1171 return NOTIFY_DONE;
1172}
1173
1174
1175static struct notifier_block ip_mr_notifier = {
1176 .notifier_call = ipmr_device_event,
1177};
1178
1179/*
1180 * Encapsulate a packet by attaching a valid IPIP header to it.
1181 * This avoids tunnel drivers and other mess and gives us the speed so
1182 * important for multicast video.
1183 */
1184
1185static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1186{
1187 struct iphdr *iph;
1188 struct iphdr *old_iph = ip_hdr(skb);
1189
1190 skb_push(skb, sizeof(struct iphdr));
1191 skb->transport_header = skb->network_header;
1192 skb_reset_network_header(skb);
1193 iph = ip_hdr(skb);
1194
1195 iph->version = 4;
1196 iph->tos = old_iph->tos;
1197 iph->ttl = old_iph->ttl;
1198 iph->frag_off = 0;
1199 iph->daddr = daddr;
1200 iph->saddr = saddr;
1201 iph->protocol = IPPROTO_IPIP;
1202 iph->ihl = 5;
1203 iph->tot_len = htons(skb->len);
1204 ip_select_ident(iph, skb->dst, NULL);
1205 ip_send_check(iph);
1206
1207 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1208 nf_reset(skb);
1209}
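/*
 * Resulting layout (sketch): [ outer IP header, protocol IPPROTO_IPIP |
 * original multicast packet ], with the outer saddr/daddr supplied by the
 * caller (the vif's local and remote tunnel endpoints in ipmr_queue_xmit).
 */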
1210
1211static inline int ipmr_forward_finish(struct sk_buff *skb)
1212{
1213 struct ip_options * opt = &(IPCB(skb)->opt);
1214
1215 IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1216
1217 if (unlikely(opt->optlen))
1218 ip_forward_options(skb);
1219
1220 return dst_output(skb);
1221}
1222
1223/*
1224 * Processing handlers for ipmr_forward
1225 */
1226
1227static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1228{
1229 struct net *net = mfc_net(c);
1230 const struct iphdr *iph = ip_hdr(skb);
1231 struct vif_device *vif = &net->ipv4.vif_table[vifi];
1232 struct net_device *dev;
1233 struct rtable *rt;
1234 int encap = 0;
1235
1236 if (vif->dev == NULL)
1237 goto out_free;
1238
1239#ifdef CONFIG_IP_PIMSM
1240 if (vif->flags & VIFF_REGISTER) {
1241 vif->pkt_out++;
1242 vif->bytes_out += skb->len;
1243 vif->dev->stats.tx_bytes += skb->len;
1244 vif->dev->stats.tx_packets++;
1245 ipmr_cache_report(net, skb, vifi, IGMPMSG_WHOLEPKT);
1246 kfree_skb(skb);
1247 return;
1248 }
1249#endif
1250
1251 if (vif->flags&VIFF_TUNNEL) {
1252 struct flowi fl = { .oif = vif->link,
1253 .nl_u = { .ip4_u =
1254 { .daddr = vif->remote,
1255 .saddr = vif->local,
1256 .tos = RT_TOS(iph->tos) } },
1257 .proto = IPPROTO_IPIP };
1258 if (ip_route_output_key(net, &rt, &fl))
1259 goto out_free;
1260 encap = sizeof(struct iphdr);
1261 } else {
1262 struct flowi fl = { .oif = vif->link,
1263 .nl_u = { .ip4_u =
1264 { .daddr = iph->daddr,
1265 .tos = RT_TOS(iph->tos) } },
1266 .proto = IPPROTO_IPIP };
1267 if (ip_route_output_key(net, &rt, &fl))
1268 goto out_free;
1269 }
1270
1271 dev = rt->u.dst.dev;
1272
1273 if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
 1274 /* Do not fragment multicasts. Alas, IPv4 does not
 1275 allow us to send ICMP here, so such packets will
 1276 simply disappear into a black hole.
 1277 */
1278
1279 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
1280 ip_rt_put(rt);
1281 goto out_free;
1282 }
1283
1284 encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1285
1286 if (skb_cow(skb, encap)) {
1287 ip_rt_put(rt);
1288 goto out_free;
1289 }
1290
1291 vif->pkt_out++;
1292 vif->bytes_out += skb->len;
1293
1294 dst_release(skb->dst);
1295 skb->dst = &rt->u.dst;
1296 ip_decrease_ttl(ip_hdr(skb));
1297
1298 /* FIXME: forward and output firewalls used to be called here.
1299 * What do we do with netfilter? -- RR */
1300 if (vif->flags & VIFF_TUNNEL) {
1301 ip_encap(skb, vif->local, vif->remote);
1302 /* FIXME: extra output firewall step used to be here. --RR */
1303 vif->dev->stats.tx_packets++;
1304 vif->dev->stats.tx_bytes += skb->len;
1305 }
1306
1307 IPCB(skb)->flags |= IPSKB_FORWARDED;
1308
 1309 /*
 1310 * RFC 1584 teaches that a DVMRP/PIM router must deliver packets locally
 1311 * not only before forwarding, but also after forwarding on all output
 1312 * interfaces. Clearly, if the mrouter runs a multicasting
 1313 * program, that program should receive packets regardless of which
 1314 * interface it has joined on.
 1315 * If we do not do this, the program would have to join on all
 1316 * interfaces. On the other hand, a multihomed host (or router, but
 1317 * not mrouter) cannot join on more than one interface - it would
 1318 * result in receiving multiple copies of each packet.
 1319 */
1320 NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev,
1321 ipmr_forward_finish);
1322 return;
1323
1324out_free:
1325 kfree_skb(skb);
1326 return;
1327}
1328
1329static int ipmr_find_vif(struct net_device *dev)
1330{
1331 struct net *net = dev_net(dev);
1332 int ct;
1333 for (ct = net->ipv4.maxvif-1; ct >= 0; ct--) {
1334 if (net->ipv4.vif_table[ct].dev == dev)
1335 break;
1336 }
1337 return ct;
1338}
1339
1340/* "local" means that we should preserve one skb (for local delivery) */
1341
1342static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1343{
1344 int psend = -1;
1345 int vif, ct;
1346 struct net *net = mfc_net(cache);
1347
1348 vif = cache->mfc_parent;
1349 cache->mfc_un.res.pkt++;
1350 cache->mfc_un.res.bytes += skb->len;
1351
1352 /*
1353 * Wrong interface: drop packet and (maybe) send PIM assert.
1354 */
1355 if (net->ipv4.vif_table[vif].dev != skb->dev) {
1356 int true_vifi;
1357
1358 if (skb->rtable->fl.iif == 0) {
 1359 /* It is our own packet, looped back.
 1360 Very complicated situation...
 1361
 1362 The best workaround until the routing daemons are
 1363 fixed is not to redistribute a packet if it was
 1364 sent through the wrong interface. It means that
 1365 multicast applications WILL NOT work for
 1366 (S,G) entries whose default multicast route points
 1367 to the wrong oif. In any case, it is not a good
 1368 idea to run multicasting applications on a router.
 1369 */
1370 goto dont_forward;
1371 }
1372
1373 cache->mfc_un.res.wrong_if++;
1374 true_vifi = ipmr_find_vif(skb->dev);
1375
1376 if (true_vifi >= 0 && net->ipv4.mroute_do_assert &&
 1377 /* PIM-SM uses asserts when switching from RPT to SPT,
 1378 so we cannot check that the packet arrived on an oif.
 1379 It is bad, but otherwise we would need to move a pretty
 1380 large chunk of pimd into the kernel. Ough... --ANK
 1381 */
1382 (net->ipv4.mroute_do_pim ||
1383 cache->mfc_un.res.ttls[true_vifi] < 255) &&
1384 time_after(jiffies,
1385 cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1386 cache->mfc_un.res.last_assert = jiffies;
1387 ipmr_cache_report(net, skb, true_vifi, IGMPMSG_WRONGVIF);
1388 }
1389 goto dont_forward;
1390 }
1391
1392 net->ipv4.vif_table[vif].pkt_in++;
1393 net->ipv4.vif_table[vif].bytes_in += skb->len;
1394
1395 /*
1396 * Forward the frame
1397 */
1398 for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1399 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1400 if (psend != -1) {
1401 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1402 if (skb2)
1403 ipmr_queue_xmit(skb2, cache, psend);
1404 }
1405 psend = ct;
1406 }
1407 }
1408 if (psend != -1) {
1409 if (local) {
1410 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1411 if (skb2)
1412 ipmr_queue_xmit(skb2, cache, psend);
1413 } else {
1414 ipmr_queue_xmit(skb, cache, psend);
1415 return 0;
1416 }
1417 }
1418
1419dont_forward:
1420 if (!local)
1421 kfree_skb(skb);
1422 return 0;
1423}
1424
1425
1426/*
1427 * Multicast packets for forwarding arrive here
1428 */
1429
1430int ip_mr_input(struct sk_buff *skb)
1431{
1432 struct mfc_cache *cache;
1433 struct net *net = dev_net(skb->dev);
1434 int local = skb->rtable->rt_flags&RTCF_LOCAL;
1435
 1436 /* Packet is looped back after forwarding; it should not be
 1437 forwarded a second time, but it can still be delivered locally.
 1438 */
1439 if (IPCB(skb)->flags&IPSKB_FORWARDED)
1440 goto dont_forward;
1441
1442 if (!local) {
1443 if (IPCB(skb)->opt.router_alert) {
1444 if (ip_call_ra_chain(skb))
1445 return 0;
1446 } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
 1447 /* IGMPv1 (and broken IGMPv2 implementations such as
 1448 Cisco IOS <= 11.2(8)) do not put the router alert
 1449 option in IGMP packets destined to routable
 1450 groups. It is very bad, because it means
 1451 that we can forward NO IGMP messages.
 1452 */
1453 read_lock(&mrt_lock);
1454 if (net->ipv4.mroute_sk) {
1455 nf_reset(skb);
1456 raw_rcv(net->ipv4.mroute_sk, skb);
1457 read_unlock(&mrt_lock);
1458 return 0;
1459 }
1460 read_unlock(&mrt_lock);
1461 }
1462 }
1463
1464 read_lock(&mrt_lock);
1465 cache = ipmr_cache_find(net, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1466
1467 /*
1468 * No usable cache entry
1469 */
1470 if (cache == NULL) {
1471 int vif;
1472
1473 if (local) {
1474 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1475 ip_local_deliver(skb);
1476 if (skb2 == NULL) {
1477 read_unlock(&mrt_lock);
1478 return -ENOBUFS;
1479 }
1480 skb = skb2;
1481 }
1482
1483 vif = ipmr_find_vif(skb->dev);
1484 if (vif >= 0) {
1485 int err = ipmr_cache_unresolved(net, vif, skb);
1486 read_unlock(&mrt_lock);
1487
1488 return err;
1489 }
1490 read_unlock(&mrt_lock);
1491 kfree_skb(skb);
1492 return -ENODEV;
1493 }
1494
1495 ip_mr_forward(skb, cache, local);
1496
1497 read_unlock(&mrt_lock);
1498
1499 if (local)
1500 return ip_local_deliver(skb);
1501
1502 return 0;
1503
1504dont_forward:
1505 if (local)
1506 return ip_local_deliver(skb);
1507 kfree_skb(skb);
1508 return 0;
1509}
1510
1511#ifdef CONFIG_IP_PIMSM
1512static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
1513{
1514 struct net_device *reg_dev = NULL;
1515 struct iphdr *encap;
1516 struct net *net = dev_net(skb->dev);
1517
1518 encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
 1519 /*
 1520 Check that:
 1521 a. packet is really destined to a multicast group
 1522 b. packet is not a NULL-REGISTER
 1523 c. packet is not truncated
 1524 */
1525 if (!ipv4_is_multicast(encap->daddr) ||
1526 encap->tot_len == 0 ||
1527 ntohs(encap->tot_len) + pimlen > skb->len)
1528 return 1;
1529
1530 read_lock(&mrt_lock);
1531 if (net->ipv4.mroute_reg_vif_num >= 0)
1532 reg_dev = net->ipv4.vif_table[net->ipv4.mroute_reg_vif_num].dev;
1533 if (reg_dev)
1534 dev_hold(reg_dev);
1535 read_unlock(&mrt_lock);
1536
1537 if (reg_dev == NULL)
1538 return 1;
1539
1540 skb->mac_header = skb->network_header;
1541 skb_pull(skb, (u8*)encap - skb->data);
1542 skb_reset_network_header(skb);
1543 skb->dev = reg_dev;
1544 skb->protocol = htons(ETH_P_IP);
1545 skb->ip_summed = 0;
1546 skb->pkt_type = PACKET_HOST;
1547 dst_release(skb->dst);
1548 skb->dst = NULL;
1549 reg_dev->stats.rx_bytes += skb->len;
1550 reg_dev->stats.rx_packets++;
1551 nf_reset(skb);
1552 netif_rx(skb);
1553 dev_put(reg_dev);
1554
1555 return 0;
1556}
1557#endif
1558
1559#ifdef CONFIG_IP_PIMSM_V1
1560/*
1561 * Handle IGMP messages of PIMv1
1562 */
1563
1564int pim_rcv_v1(struct sk_buff * skb)
1565{
1566 struct igmphdr *pim;
1567 struct net *net = dev_net(skb->dev);
1568
1569 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1570 goto drop;
1571
1572 pim = igmp_hdr(skb);
1573
1574 if (!net->ipv4.mroute_do_pim ||
1575 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1576 goto drop;
1577
1578 if (__pim_rcv(skb, sizeof(*pim))) {
1579drop:
1580 kfree_skb(skb);
1581 }
1582 return 0;
1583}
1584#endif
1585
1586#ifdef CONFIG_IP_PIMSM_V2
1587static int pim_rcv(struct sk_buff * skb)
1588{
1589 struct pimreghdr *pim;
1590
1591 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1592 goto drop;
1593
1594 pim = (struct pimreghdr *)skb_transport_header(skb);
1595 if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1596 (pim->flags&PIM_NULL_REGISTER) ||
1597 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1598 csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1599 goto drop;
1600
1601 if (__pim_rcv(skb, sizeof(*pim))) {
1602drop:
1603 kfree_skb(skb);
1604 }
1605 return 0;
1606}
1607#endif
1608
1609static int
1610ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1611{
1612 int ct;
1613 struct rtnexthop *nhp;
1614 struct net *net = mfc_net(c);
1615 struct net_device *dev = net->ipv4.vif_table[c->mfc_parent].dev;
1616 u8 *b = skb_tail_pointer(skb);
1617 struct rtattr *mp_head;
1618
1619 if (dev)
1620 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1621
1622 mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
1623
1624 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1625 if (c->mfc_un.res.ttls[ct] < 255) {
1626 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1627 goto rtattr_failure;
1628 nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1629 nhp->rtnh_flags = 0;
1630 nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1631 nhp->rtnh_ifindex = net->ipv4.vif_table[ct].dev->ifindex;
1632 nhp->rtnh_len = sizeof(*nhp);
1633 }
1634 }
1635 mp_head->rta_type = RTA_MULTIPATH;
1636 mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1637 rtm->rtm_type = RTN_MULTICAST;
1638 return 1;
1639
1640rtattr_failure:
1641 nlmsg_trim(skb, b);
1642 return -EMSGSIZE;
1643}
1644
1645int ipmr_get_route(struct net *net,
1646 struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1647{
1648 int err;
1649 struct mfc_cache *cache;
1650 struct rtable *rt = skb->rtable;
1651
1652 read_lock(&mrt_lock);
1653 cache = ipmr_cache_find(net, rt->rt_src, rt->rt_dst);
1654
1655 if (cache == NULL) {
1656 struct sk_buff *skb2;
1657 struct iphdr *iph;
1658 struct net_device *dev;
1659 int vif;
1660
1661 if (nowait) {
1662 read_unlock(&mrt_lock);
1663 return -EAGAIN;
1664 }
1665
1666 dev = skb->dev;
1667 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1668 read_unlock(&mrt_lock);
1669 return -ENODEV;
1670 }
1671 skb2 = skb_clone(skb, GFP_ATOMIC);
1672 if (!skb2) {
1673 read_unlock(&mrt_lock);
1674 return -ENOMEM;
1675 }
1676
1677 skb_push(skb2, sizeof(struct iphdr));
1678 skb_reset_network_header(skb2);
1679 iph = ip_hdr(skb2);
1680 iph->ihl = sizeof(struct iphdr) >> 2;
1681 iph->saddr = rt->rt_src;
1682 iph->daddr = rt->rt_dst;
1683 iph->version = 0;
1684 err = ipmr_cache_unresolved(net, vif, skb2);
1685 read_unlock(&mrt_lock);
1686 return err;
1687 }
1688
1689 if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1690 cache->mfc_flags |= MFC_NOTIFY;
1691 err = ipmr_fill_mroute(skb, cache, rtm);
1692 read_unlock(&mrt_lock);
1693 return err;
1694}
1695
1696#ifdef CONFIG_PROC_FS
 1697/*
 1698 * The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
 1699 */
1700struct ipmr_vif_iter {
1701 struct seq_net_private p;
1702 int ct;
1703};
1704
1705static struct vif_device *ipmr_vif_seq_idx(struct net *net,
1706 struct ipmr_vif_iter *iter,
1707 loff_t pos)
1708{
1709 for (iter->ct = 0; iter->ct < net->ipv4.maxvif; ++iter->ct) {
1710 if (!VIF_EXISTS(net, iter->ct))
1711 continue;
1712 if (pos-- == 0)
1713 return &net->ipv4.vif_table[iter->ct];
1714 }
1715 return NULL;
1716}
1717
1718static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1719 __acquires(mrt_lock)
1720{
1721 struct net *net = seq_file_net(seq);
1722
1723 read_lock(&mrt_lock);
1724 return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
1725 : SEQ_START_TOKEN;
1726}
1727
1728static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1729{
1730 struct ipmr_vif_iter *iter = seq->private;
1731 struct net *net = seq_file_net(seq);
1732
1733 ++*pos;
1734 if (v == SEQ_START_TOKEN)
1735 return ipmr_vif_seq_idx(net, iter, 0);
1736
1737 while (++iter->ct < net->ipv4.maxvif) {
1738 if (!VIF_EXISTS(net, iter->ct))
1739 continue;
1740 return &net->ipv4.vif_table[iter->ct];
1741 }
1742 return NULL;
1743}
1744
1745static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1746 __releases(mrt_lock)
1747{
1748 read_unlock(&mrt_lock);
1749}
1750
1751static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1752{
1753 struct net *net = seq_file_net(seq);
1754
1755 if (v == SEQ_START_TOKEN) {
1756 seq_puts(seq,
1757 "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n");
1758 } else {
1759 const struct vif_device *vif = v;
1760 const char *name = vif->dev ? vif->dev->name : "none";
1761
1762 seq_printf(seq,
1763 "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
1764 vif - net->ipv4.vif_table,
1765 name, vif->bytes_in, vif->pkt_in,
1766 vif->bytes_out, vif->pkt_out,
1767 vif->flags, vif->local, vif->remote);
1768 }
1769 return 0;
1770}
1771
1772static const struct seq_operations ipmr_vif_seq_ops = {
1773 .start = ipmr_vif_seq_start,
1774 .next = ipmr_vif_seq_next,
1775 .stop = ipmr_vif_seq_stop,
1776 .show = ipmr_vif_seq_show,
1777};
1778
1779static int ipmr_vif_open(struct inode *inode, struct file *file)
1780{
1781 return seq_open_net(inode, file, &ipmr_vif_seq_ops,
1782 sizeof(struct ipmr_vif_iter));
1783}
1784
1785static const struct file_operations ipmr_vif_fops = {
1786 .owner = THIS_MODULE,
1787 .open = ipmr_vif_open,
1788 .read = seq_read,
1789 .llseek = seq_lseek,
1790 .release = seq_release_net,
1791};
1792
1793struct ipmr_mfc_iter {
1794 struct seq_net_private p;
1795 struct mfc_cache **cache;
1796 int ct;
1797};
1798
1799
1800static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
1801 struct ipmr_mfc_iter *it, loff_t pos)
1802{
1803 struct mfc_cache *mfc;
1804
1805 it->cache = net->ipv4.mfc_cache_array;
1806 read_lock(&mrt_lock);
1807 for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1808 for (mfc = net->ipv4.mfc_cache_array[it->ct];
1809 mfc; mfc = mfc->next)
1810 if (pos-- == 0)
1811 return mfc;
1812 read_unlock(&mrt_lock);
1813
1814 it->cache = &mfc_unres_queue;
1815 spin_lock_bh(&mfc_unres_lock);
1816 for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1817 if (net_eq(mfc_net(mfc), net) &&
1818 pos-- == 0)
1819 return mfc;
1820 spin_unlock_bh(&mfc_unres_lock);
1821
1822 it->cache = NULL;
1823 return NULL;
1824}
1825
1826
1827static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1828{
1829 struct ipmr_mfc_iter *it = seq->private;
1830 struct net *net = seq_file_net(seq);
1831
1832 it->cache = NULL;
1833 it->ct = 0;
1834 return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
1835 : SEQ_START_TOKEN;
1836}
1837
1838static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1839{
1840 struct mfc_cache *mfc = v;
1841 struct ipmr_mfc_iter *it = seq->private;
1842 struct net *net = seq_file_net(seq);
1843
1844 ++*pos;
1845
1846 if (v == SEQ_START_TOKEN)
1847 return ipmr_mfc_seq_idx(net, seq->private, 0);
1848
1849 if (mfc->next)
1850 return mfc->next;
1851
1852 if (it->cache == &mfc_unres_queue)
1853 goto end_of_list;
1854
1855 BUG_ON(it->cache != net->ipv4.mfc_cache_array);
1856
1857 while (++it->ct < MFC_LINES) {
1858 mfc = net->ipv4.mfc_cache_array[it->ct];
1859 if (mfc)
1860 return mfc;
1861 }
1862
1863 /* exhausted cache_array, show unresolved */
1864 read_unlock(&mrt_lock);
1865 it->cache = &mfc_unres_queue;
1866 it->ct = 0;
1867
1868 spin_lock_bh(&mfc_unres_lock);
1869 mfc = mfc_unres_queue;
1870 while (mfc && !net_eq(mfc_net(mfc), net))
1871 mfc = mfc->next;
1872 if (mfc)
1873 return mfc;
1874
1875 end_of_list:
1876 spin_unlock_bh(&mfc_unres_lock);
1877 it->cache = NULL;
1878
1879 return NULL;
1880}
1881
1882static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1883{
1884 struct ipmr_mfc_iter *it = seq->private;
1885 struct net *net = seq_file_net(seq);
1886
1887 if (it->cache == &mfc_unres_queue)
1888 spin_unlock_bh(&mfc_unres_lock);
1889 else if (it->cache == net->ipv4.mfc_cache_array)
1890 read_unlock(&mrt_lock);
1891}
1892
1893static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1894{
1895 int n;
1896 struct net *net = seq_file_net(seq);
1897
1898 if (v == SEQ_START_TOKEN) {
1899 seq_puts(seq,
1900 "Group Origin Iif Pkts Bytes Wrong Oifs\n");
1901 } else {
1902 const struct mfc_cache *mfc = v;
1903 const struct ipmr_mfc_iter *it = seq->private;
1904
1905 seq_printf(seq, "%08lX %08lX %-3hd",
1906 (unsigned long) mfc->mfc_mcastgrp,
1907 (unsigned long) mfc->mfc_origin,
1908 mfc->mfc_parent);
1909
1910 if (it->cache != &mfc_unres_queue) {
1911 seq_printf(seq, " %8lu %8lu %8lu",
1912 mfc->mfc_un.res.pkt,
1913 mfc->mfc_un.res.bytes,
1914 mfc->mfc_un.res.wrong_if);
1915 for (n = mfc->mfc_un.res.minvif;
1916 n < mfc->mfc_un.res.maxvif; n++ ) {
1917 if (VIF_EXISTS(net, n) &&
1918 mfc->mfc_un.res.ttls[n] < 255)
1919 seq_printf(seq,
1920 " %2d:%-3d",
1921 n, mfc->mfc_un.res.ttls[n]);
1922 }
1923 } else {
1924 /* unresolved mfc_caches don't contain
1925 * pkt, bytes and wrong_if values
1926 */
1927 seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
1928 }
1929 seq_putc(seq, '\n');
1930 }
1931 return 0;
1932}
1933
1934static const struct seq_operations ipmr_mfc_seq_ops = {
1935 .start = ipmr_mfc_seq_start,
1936 .next = ipmr_mfc_seq_next,
1937 .stop = ipmr_mfc_seq_stop,
1938 .show = ipmr_mfc_seq_show,
1939};
1940
1941static int ipmr_mfc_open(struct inode *inode, struct file *file)
1942{
1943 return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
1944 sizeof(struct ipmr_mfc_iter));
1945}
1946
1947static const struct file_operations ipmr_mfc_fops = {
1948 .owner = THIS_MODULE,
1949 .open = ipmr_mfc_open,
1950 .read = seq_read,
1951 .llseek = seq_lseek,
1952 .release = seq_release_net,
1953};
1954#endif
1955
1956#ifdef CONFIG_IP_PIMSM_V2
1957static struct net_protocol pim_protocol = {
1958 .handler = pim_rcv,
1959};
1960#endif
1961
1962
1963/*
1964 * Setup for IP multicast routing
1965 */
1966static int __net_init ipmr_net_init(struct net *net)
1967{
1968 int err = 0;
1969
1970 net->ipv4.vif_table = kcalloc(MAXVIFS, sizeof(struct vif_device),
1971 GFP_KERNEL);
1972 if (!net->ipv4.vif_table) {
1973 err = -ENOMEM;
1974 goto fail;
1975 }
1976
1977 /* Forwarding cache */
1978 net->ipv4.mfc_cache_array = kcalloc(MFC_LINES,
1979 sizeof(struct mfc_cache *),
1980 GFP_KERNEL);
1981 if (!net->ipv4.mfc_cache_array) {
1982 err = -ENOMEM;
1983 goto fail_mfc_cache;
1984 }
1985
1986#ifdef CONFIG_IP_PIMSM
1987 net->ipv4.mroute_reg_vif_num = -1;
1988#endif
1989
1990#ifdef CONFIG_PROC_FS
1991 err = -ENOMEM;
1992 if (!proc_net_fops_create(net, "ip_mr_vif", 0, &ipmr_vif_fops))
1993 goto proc_vif_fail;
1994 if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops))
1995 goto proc_cache_fail;
1996#endif
1997 return 0;
1998
1999#ifdef CONFIG_PROC_FS
2000proc_cache_fail:
2001 proc_net_remove(net, "ip_mr_vif");
2002proc_vif_fail:
2003 kfree(net->ipv4.mfc_cache_array);
2004#endif
2005fail_mfc_cache:
2006 kfree(net->ipv4.vif_table);
2007fail:
2008 return err;
2009}
2010
2011static void __net_exit ipmr_net_exit(struct net *net)
2012{
2013#ifdef CONFIG_PROC_FS
2014 proc_net_remove(net, "ip_mr_cache");
2015 proc_net_remove(net, "ip_mr_vif");
2016#endif
2017 kfree(net->ipv4.mfc_cache_array);
2018 kfree(net->ipv4.vif_table);
2019}
2020
2021static struct pernet_operations ipmr_net_ops = {
2022 .init = ipmr_net_init,
2023 .exit = ipmr_net_exit,
2024};
2025
2026int __init ip_mr_init(void)
2027{
2028 int err;
2029
2030 mrt_cachep = kmem_cache_create("ip_mrt_cache",
2031 sizeof(struct mfc_cache),
2032 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
2033 NULL);
2034 if (!mrt_cachep)
2035 return -ENOMEM;
2036
2037 err = register_pernet_subsys(&ipmr_net_ops);
2038 if (err)
2039 goto reg_pernet_fail;
2040
2041 setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
2042 err = register_netdevice_notifier(&ip_mr_notifier);
2043 if (err)
2044 goto reg_notif_fail;
2045 return 0;
2046
2047reg_notif_fail:
2048 del_timer(&ipmr_expire_timer);
2049 unregister_pernet_subsys(&ipmr_net_ops);
2050reg_pernet_fail:
2051 kmem_cache_destroy(mrt_cachep);
2052 return err;
2053}