]> bbs.cooldavid.org Git - net-next-2.6.git/blob - net/ipv4/ipmr.c
netdev: remove pathetic compile-command lines
[net-next-2.6.git] / net / ipv4 / ipmr.c
1 /*
2  *      IP multicast routing support for mrouted 3.6/3.8
3  *
4  *              (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
5  *        Linux Consultancy and Custom Driver Development
6  *
7  *      This program is free software; you can redistribute it and/or
8  *      modify it under the terms of the GNU General Public License
9  *      as published by the Free Software Foundation; either version
10  *      2 of the License, or (at your option) any later version.
11  *
12  *      Fixes:
13  *      Michael Chastain        :       Incorrect size of copying.
14  *      Alan Cox                :       Added the cache manager code
15  *      Alan Cox                :       Fixed the clone/copy bug and device race.
16  *      Mike McLagan            :       Routing by source
17  *      Malcolm Beattie         :       Buffer handling fixes.
18  *      Alexey Kuznetsov        :       Double buffer free and other fixes.
19  *      SVR Anand               :       Fixed several multicast bugs and problems.
20  *      Alexey Kuznetsov        :       Status, optimisations and more.
21  *      Brad Parker             :       Better behaviour on mrouted upcall
22  *                                      overflow.
23  *      Carlos Picoto           :       PIMv1 Support
24  *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
25  *                                      Relax this requirement to work with older peers.
26  *
27  */
28
29 #include <asm/system.h>
30 #include <asm/uaccess.h>
31 #include <linux/types.h>
32 #include <linux/capability.h>
33 #include <linux/errno.h>
34 #include <linux/timer.h>
35 #include <linux/mm.h>
36 #include <linux/kernel.h>
37 #include <linux/fcntl.h>
38 #include <linux/stat.h>
39 #include <linux/socket.h>
40 #include <linux/in.h>
41 #include <linux/inet.h>
42 #include <linux/netdevice.h>
43 #include <linux/inetdevice.h>
44 #include <linux/igmp.h>
45 #include <linux/proc_fs.h>
46 #include <linux/seq_file.h>
47 #include <linux/mroute.h>
48 #include <linux/init.h>
49 #include <linux/if_ether.h>
50 #include <net/net_namespace.h>
51 #include <net/ip.h>
52 #include <net/protocol.h>
53 #include <linux/skbuff.h>
54 #include <net/route.h>
55 #include <net/sock.h>
56 #include <net/icmp.h>
57 #include <net/udp.h>
58 #include <net/raw.h>
59 #include <linux/notifier.h>
60 #include <linux/if_arp.h>
61 #include <linux/netfilter_ipv4.h>
62 #include <net/ipip.h>
63 #include <net/checksum.h>
64 #include <net/netlink.h>
65
66 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
67 #define CONFIG_IP_PIMSM 1
68 #endif
69
70 static struct sock *mroute_socket;
71
72
73 /* Big lock, protecting vif table, mrt cache and mroute socket state.
74    Note that the changes are semaphored via rtnl_lock.
75  */
76
77 static DEFINE_RWLOCK(mrt_lock);
78
79 /*
80  *      Multicast router control variables
81  */
82
83 static struct vif_device vif_table[MAXVIFS];            /* Devices              */
84 static int maxvif;
85
86 #define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
87
88 static int mroute_do_assert;                            /* Set in PIM assert    */
89 static int mroute_do_pim;
90
91 static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */
92
93 static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
94 static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */
95
96 /* Special spinlock for queue of unresolved entries */
97 static DEFINE_SPINLOCK(mfc_unres_lock);
98
99 /* We return to original Alan's scheme. Hash table of resolved
100    entries is changed only in process context and protected
101    with weak lock mrt_lock. Queue of unresolved entries is protected
102    with strong spinlock mfc_unres_lock.
103
104    In this case data path is free of exclusive locks at all.
105  */
106
107 static struct kmem_cache *mrt_cachep __read_mostly;
108
109 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
110 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
111 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
112
113 #ifdef CONFIG_IP_PIMSM_V2
114 static struct net_protocol pim_protocol;
115 #endif
116
117 static struct timer_list ipmr_expire_timer;
118
119 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
120
121 static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
122 {
123         dev_close(dev);
124
125         dev = __dev_get_by_name(&init_net, "tunl0");
126         if (dev) {
127                 const struct net_device_ops *ops = dev->netdev_ops;
128                 struct ifreq ifr;
129                 struct ip_tunnel_parm p;
130
131                 memset(&p, 0, sizeof(p));
132                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
133                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
134                 p.iph.version = 4;
135                 p.iph.ihl = 5;
136                 p.iph.protocol = IPPROTO_IPIP;
137                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
138                 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
139
140                 if (ops->ndo_do_ioctl) {
141                         mm_segment_t oldfs = get_fs();
142
143                         set_fs(KERNEL_DS);
144                         ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
145                         set_fs(oldfs);
146                 }
147         }
148 }
149
150 static
151 struct net_device *ipmr_new_tunnel(struct vifctl *v)
152 {
153         struct net_device  *dev;
154
155         dev = __dev_get_by_name(&init_net, "tunl0");
156
157         if (dev) {
158                 const struct net_device_ops *ops = dev->netdev_ops;
159                 int err;
160                 struct ifreq ifr;
161                 struct ip_tunnel_parm p;
162                 struct in_device  *in_dev;
163
164                 memset(&p, 0, sizeof(p));
165                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
166                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
167                 p.iph.version = 4;
168                 p.iph.ihl = 5;
169                 p.iph.protocol = IPPROTO_IPIP;
170                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
171                 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
172
173                 if (ops->ndo_do_ioctl) {
174                         mm_segment_t oldfs = get_fs();
175
176                         set_fs(KERNEL_DS);
177                         err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
178                         set_fs(oldfs);
179                 } else
180                         err = -EOPNOTSUPP;
181
182                 dev = NULL;
183
184                 if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) {
185                         dev->flags |= IFF_MULTICAST;
186
187                         in_dev = __in_dev_get_rtnl(dev);
188                         if (in_dev == NULL)
189                                 goto failure;
190
191                         ipv4_devconf_setall(in_dev);
192                         IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
193
194                         if (dev_open(dev))
195                                 goto failure;
196                         dev_hold(dev);
197                 }
198         }
199         return dev;
200
201 failure:
202         /* allow the register to be completed before unregistering. */
203         rtnl_unlock();
204         rtnl_lock();
205
206         unregister_netdevice(dev);
207         return NULL;
208 }
209
210 #ifdef CONFIG_IP_PIMSM
211
212 static int reg_vif_num = -1;
213
214 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
215 {
216         read_lock(&mrt_lock);
217         dev->stats.tx_bytes += skb->len;
218         dev->stats.tx_packets++;
219         ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
220         read_unlock(&mrt_lock);
221         kfree_skb(skb);
222         return 0;
223 }
224
225 static const struct net_device_ops reg_vif_netdev_ops = {
226         .ndo_start_xmit = reg_vif_xmit,
227 };
228
229 static void reg_vif_setup(struct net_device *dev)
230 {
231         dev->type               = ARPHRD_PIMREG;
232         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
233         dev->flags              = IFF_NOARP;
234         dev->netdev_ops         = &reg_vif_netdev_ops,
235         dev->destructor         = free_netdev;
236 }
237
238 static struct net_device *ipmr_reg_vif(void)
239 {
240         struct net_device *dev;
241         struct in_device *in_dev;
242
243         dev = alloc_netdev(0, "pimreg", reg_vif_setup);
244
245         if (dev == NULL)
246                 return NULL;
247
248         if (register_netdevice(dev)) {
249                 free_netdev(dev);
250                 return NULL;
251         }
252         dev->iflink = 0;
253
254         rcu_read_lock();
255         if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
256                 rcu_read_unlock();
257                 goto failure;
258         }
259
260         ipv4_devconf_setall(in_dev);
261         IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
262         rcu_read_unlock();
263
264         if (dev_open(dev))
265                 goto failure;
266
267         dev_hold(dev);
268
269         return dev;
270
271 failure:
272         /* allow the register to be completed before unregistering. */
273         rtnl_unlock();
274         rtnl_lock();
275
276         unregister_netdevice(dev);
277         return NULL;
278 }
279 #endif
280
281 /*
282  *      Delete a VIF entry
283  *      @notify: Set to 1, if the caller is a notifier_call
284  */
285
286 static int vif_delete(int vifi, int notify)
287 {
288         struct vif_device *v;
289         struct net_device *dev;
290         struct in_device *in_dev;
291
292         if (vifi < 0 || vifi >= maxvif)
293                 return -EADDRNOTAVAIL;
294
295         v = &vif_table[vifi];
296
297         write_lock_bh(&mrt_lock);
298         dev = v->dev;
299         v->dev = NULL;
300
301         if (!dev) {
302                 write_unlock_bh(&mrt_lock);
303                 return -EADDRNOTAVAIL;
304         }
305
306 #ifdef CONFIG_IP_PIMSM
307         if (vifi == reg_vif_num)
308                 reg_vif_num = -1;
309 #endif
310
311         if (vifi+1 == maxvif) {
312                 int tmp;
313                 for (tmp=vifi-1; tmp>=0; tmp--) {
314                         if (VIF_EXISTS(tmp))
315                                 break;
316                 }
317                 maxvif = tmp+1;
318         }
319
320         write_unlock_bh(&mrt_lock);
321
322         dev_set_allmulti(dev, -1);
323
324         if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
325                 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
326                 ip_rt_multicast_event(in_dev);
327         }
328
329         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
330                 unregister_netdevice(dev);
331
332         dev_put(dev);
333         return 0;
334 }
335
336 /* Destroy an unresolved cache entry, killing queued skbs
337    and reporting error to netlink readers.
338  */
339
340 static void ipmr_destroy_unres(struct mfc_cache *c)
341 {
342         struct sk_buff *skb;
343         struct nlmsgerr *e;
344
345         atomic_dec(&cache_resolve_queue_len);
346
347         while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
348                 if (ip_hdr(skb)->version == 0) {
349                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
350                         nlh->nlmsg_type = NLMSG_ERROR;
351                         nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
352                         skb_trim(skb, nlh->nlmsg_len);
353                         e = NLMSG_DATA(nlh);
354                         e->error = -ETIMEDOUT;
355                         memset(&e->msg, 0, sizeof(e->msg));
356
357                         rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
358                 } else
359                         kfree_skb(skb);
360         }
361
362         kmem_cache_free(mrt_cachep, c);
363 }
364
365
366 /* Single timer process for all the unresolved queue. */
367
368 static void ipmr_expire_process(unsigned long dummy)
369 {
370         unsigned long now;
371         unsigned long expires;
372         struct mfc_cache *c, **cp;
373
374         if (!spin_trylock(&mfc_unres_lock)) {
375                 mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
376                 return;
377         }
378
379         if (atomic_read(&cache_resolve_queue_len) == 0)
380                 goto out;
381
382         now = jiffies;
383         expires = 10*HZ;
384         cp = &mfc_unres_queue;
385
386         while ((c=*cp) != NULL) {
387                 if (time_after(c->mfc_un.unres.expires, now)) {
388                         unsigned long interval = c->mfc_un.unres.expires - now;
389                         if (interval < expires)
390                                 expires = interval;
391                         cp = &c->next;
392                         continue;
393                 }
394
395                 *cp = c->next;
396
397                 ipmr_destroy_unres(c);
398         }
399
400         if (atomic_read(&cache_resolve_queue_len))
401                 mod_timer(&ipmr_expire_timer, jiffies + expires);
402
403 out:
404         spin_unlock(&mfc_unres_lock);
405 }
406
407 /* Fill oifs list. It is called under write locked mrt_lock. */
408
409 static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
410 {
411         int vifi;
412
413         cache->mfc_un.res.minvif = MAXVIFS;
414         cache->mfc_un.res.maxvif = 0;
415         memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
416
417         for (vifi=0; vifi<maxvif; vifi++) {
418                 if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
419                         cache->mfc_un.res.ttls[vifi] = ttls[vifi];
420                         if (cache->mfc_un.res.minvif > vifi)
421                                 cache->mfc_un.res.minvif = vifi;
422                         if (cache->mfc_un.res.maxvif <= vifi)
423                                 cache->mfc_un.res.maxvif = vifi + 1;
424                 }
425         }
426 }
427
428 static int vif_add(struct vifctl *vifc, int mrtsock)
429 {
430         int vifi = vifc->vifc_vifi;
431         struct vif_device *v = &vif_table[vifi];
432         struct net_device *dev;
433         struct in_device *in_dev;
434         int err;
435
436         /* Is vif busy ? */
437         if (VIF_EXISTS(vifi))
438                 return -EADDRINUSE;
439
440         switch (vifc->vifc_flags) {
441 #ifdef CONFIG_IP_PIMSM
442         case VIFF_REGISTER:
443                 /*
444                  * Special Purpose VIF in PIM
445                  * All the packets will be sent to the daemon
446                  */
447                 if (reg_vif_num >= 0)
448                         return -EADDRINUSE;
449                 dev = ipmr_reg_vif();
450                 if (!dev)
451                         return -ENOBUFS;
452                 err = dev_set_allmulti(dev, 1);
453                 if (err) {
454                         unregister_netdevice(dev);
455                         dev_put(dev);
456                         return err;
457                 }
458                 break;
459 #endif
460         case VIFF_TUNNEL:
461                 dev = ipmr_new_tunnel(vifc);
462                 if (!dev)
463                         return -ENOBUFS;
464                 err = dev_set_allmulti(dev, 1);
465                 if (err) {
466                         ipmr_del_tunnel(dev, vifc);
467                         dev_put(dev);
468                         return err;
469                 }
470                 break;
471         case 0:
472                 dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr);
473                 if (!dev)
474                         return -EADDRNOTAVAIL;
475                 err = dev_set_allmulti(dev, 1);
476                 if (err) {
477                         dev_put(dev);
478                         return err;
479                 }
480                 break;
481         default:
482                 return -EINVAL;
483         }
484
485         if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
486                 return -EADDRNOTAVAIL;
487         IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
488         ip_rt_multicast_event(in_dev);
489
490         /*
491          *      Fill in the VIF structures
492          */
493         v->rate_limit = vifc->vifc_rate_limit;
494         v->local = vifc->vifc_lcl_addr.s_addr;
495         v->remote = vifc->vifc_rmt_addr.s_addr;
496         v->flags = vifc->vifc_flags;
497         if (!mrtsock)
498                 v->flags |= VIFF_STATIC;
499         v->threshold = vifc->vifc_threshold;
500         v->bytes_in = 0;
501         v->bytes_out = 0;
502         v->pkt_in = 0;
503         v->pkt_out = 0;
504         v->link = dev->ifindex;
505         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
506                 v->link = dev->iflink;
507
508         /* And finish update writing critical data */
509         write_lock_bh(&mrt_lock);
510         v->dev = dev;
511 #ifdef CONFIG_IP_PIMSM
512         if (v->flags&VIFF_REGISTER)
513                 reg_vif_num = vifi;
514 #endif
515         if (vifi+1 > maxvif)
516                 maxvif = vifi+1;
517         write_unlock_bh(&mrt_lock);
518         return 0;
519 }
520
521 static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
522 {
523         int line = MFC_HASH(mcastgrp, origin);
524         struct mfc_cache *c;
525
526         for (c=mfc_cache_array[line]; c; c = c->next) {
527                 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
528                         break;
529         }
530         return c;
531 }
532
533 /*
534  *      Allocate a multicast cache entry
535  */
536 static struct mfc_cache *ipmr_cache_alloc(void)
537 {
538         struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
539         if (c == NULL)
540                 return NULL;
541         c->mfc_un.res.minvif = MAXVIFS;
542         return c;
543 }
544
545 static struct mfc_cache *ipmr_cache_alloc_unres(void)
546 {
547         struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
548         if (c == NULL)
549                 return NULL;
550         skb_queue_head_init(&c->mfc_un.unres.unresolved);
551         c->mfc_un.unres.expires = jiffies + 10*HZ;
552         return c;
553 }
554
555 /*
556  *      A cache entry has gone into a resolved state from queued
557  */
558
559 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
560 {
561         struct sk_buff *skb;
562         struct nlmsgerr *e;
563
564         /*
565          *      Play the pending entries through our router
566          */
567
568         while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
569                 if (ip_hdr(skb)->version == 0) {
570                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
571
572                         if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
573                                 nlh->nlmsg_len = (skb_tail_pointer(skb) -
574                                                   (u8 *)nlh);
575                         } else {
576                                 nlh->nlmsg_type = NLMSG_ERROR;
577                                 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
578                                 skb_trim(skb, nlh->nlmsg_len);
579                                 e = NLMSG_DATA(nlh);
580                                 e->error = -EMSGSIZE;
581                                 memset(&e->msg, 0, sizeof(e->msg));
582                         }
583
584                         rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
585                 } else
586                         ip_mr_forward(skb, c, 0);
587         }
588 }
589
590 /*
591  *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
592  *      expects the following bizarre scheme.
593  *
594  *      Called under mrt_lock.
595  */
596
597 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
598 {
599         struct sk_buff *skb;
600         const int ihl = ip_hdrlen(pkt);
601         struct igmphdr *igmp;
602         struct igmpmsg *msg;
603         int ret;
604
605 #ifdef CONFIG_IP_PIMSM
606         if (assert == IGMPMSG_WHOLEPKT)
607                 skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
608         else
609 #endif
610                 skb = alloc_skb(128, GFP_ATOMIC);
611
612         if (!skb)
613                 return -ENOBUFS;
614
615 #ifdef CONFIG_IP_PIMSM
616         if (assert == IGMPMSG_WHOLEPKT) {
617                 /* Ugly, but we have no choice with this interface.
618                    Duplicate old header, fix ihl, length etc.
619                    And all this only to mangle msg->im_msgtype and
620                    to set msg->im_mbz to "mbz" :-)
621                  */
622                 skb_push(skb, sizeof(struct iphdr));
623                 skb_reset_network_header(skb);
624                 skb_reset_transport_header(skb);
625                 msg = (struct igmpmsg *)skb_network_header(skb);
626                 memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
627                 msg->im_msgtype = IGMPMSG_WHOLEPKT;
628                 msg->im_mbz = 0;
629                 msg->im_vif = reg_vif_num;
630                 ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
631                 ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
632                                              sizeof(struct iphdr));
633         } else
634 #endif
635         {
636
637         /*
638          *      Copy the IP header
639          */
640
641         skb->network_header = skb->tail;
642         skb_put(skb, ihl);
643         skb_copy_to_linear_data(skb, pkt->data, ihl);
644         ip_hdr(skb)->protocol = 0;                      /* Flag to the kernel this is a route add */
645         msg = (struct igmpmsg *)skb_network_header(skb);
646         msg->im_vif = vifi;
647         skb->dst = dst_clone(pkt->dst);
648
649         /*
650          *      Add our header
651          */
652
653         igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
654         igmp->type      =
655         msg->im_msgtype = assert;
656         igmp->code      =       0;
657         ip_hdr(skb)->tot_len = htons(skb->len);                 /* Fix the length */
658         skb->transport_header = skb->network_header;
659         }
660
661         if (mroute_socket == NULL) {
662                 kfree_skb(skb);
663                 return -EINVAL;
664         }
665
666         /*
667          *      Deliver to mrouted
668          */
669         if ((ret = sock_queue_rcv_skb(mroute_socket, skb))<0) {
670                 if (net_ratelimit())
671                         printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
672                 kfree_skb(skb);
673         }
674
675         return ret;
676 }
677
678 /*
679  *      Queue a packet for resolution. It gets locked cache entry!
680  */
681
682 static int
683 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
684 {
685         int err;
686         struct mfc_cache *c;
687         const struct iphdr *iph = ip_hdr(skb);
688
689         spin_lock_bh(&mfc_unres_lock);
690         for (c=mfc_unres_queue; c; c=c->next) {
691                 if (c->mfc_mcastgrp == iph->daddr &&
692                     c->mfc_origin == iph->saddr)
693                         break;
694         }
695
696         if (c == NULL) {
697                 /*
698                  *      Create a new entry if allowable
699                  */
700
701                 if (atomic_read(&cache_resolve_queue_len) >= 10 ||
702                     (c=ipmr_cache_alloc_unres())==NULL) {
703                         spin_unlock_bh(&mfc_unres_lock);
704
705                         kfree_skb(skb);
706                         return -ENOBUFS;
707                 }
708
709                 /*
710                  *      Fill in the new cache entry
711                  */
712                 c->mfc_parent   = -1;
713                 c->mfc_origin   = iph->saddr;
714                 c->mfc_mcastgrp = iph->daddr;
715
716                 /*
717                  *      Reflect first query at mrouted.
718                  */
719                 if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
720                         /* If the report failed throw the cache entry
721                            out - Brad Parker
722                          */
723                         spin_unlock_bh(&mfc_unres_lock);
724
725                         kmem_cache_free(mrt_cachep, c);
726                         kfree_skb(skb);
727                         return err;
728                 }
729
730                 atomic_inc(&cache_resolve_queue_len);
731                 c->next = mfc_unres_queue;
732                 mfc_unres_queue = c;
733
734                 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
735         }
736
737         /*
738          *      See if we can append the packet
739          */
740         if (c->mfc_un.unres.unresolved.qlen>3) {
741                 kfree_skb(skb);
742                 err = -ENOBUFS;
743         } else {
744                 skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
745                 err = 0;
746         }
747
748         spin_unlock_bh(&mfc_unres_lock);
749         return err;
750 }
751
752 /*
753  *      MFC cache manipulation by user space mroute daemon
754  */
755
756 static int ipmr_mfc_delete(struct mfcctl *mfc)
757 {
758         int line;
759         struct mfc_cache *c, **cp;
760
761         line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
762
763         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
764                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
765                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
766                         write_lock_bh(&mrt_lock);
767                         *cp = c->next;
768                         write_unlock_bh(&mrt_lock);
769
770                         kmem_cache_free(mrt_cachep, c);
771                         return 0;
772                 }
773         }
774         return -ENOENT;
775 }
776
777 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
778 {
779         int line;
780         struct mfc_cache *uc, *c, **cp;
781
782         line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
783
784         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
785                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
786                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
787                         break;
788         }
789
790         if (c != NULL) {
791                 write_lock_bh(&mrt_lock);
792                 c->mfc_parent = mfc->mfcc_parent;
793                 ipmr_update_thresholds(c, mfc->mfcc_ttls);
794                 if (!mrtsock)
795                         c->mfc_flags |= MFC_STATIC;
796                 write_unlock_bh(&mrt_lock);
797                 return 0;
798         }
799
800         if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
801                 return -EINVAL;
802
803         c = ipmr_cache_alloc();
804         if (c == NULL)
805                 return -ENOMEM;
806
807         c->mfc_origin = mfc->mfcc_origin.s_addr;
808         c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
809         c->mfc_parent = mfc->mfcc_parent;
810         ipmr_update_thresholds(c, mfc->mfcc_ttls);
811         if (!mrtsock)
812                 c->mfc_flags |= MFC_STATIC;
813
814         write_lock_bh(&mrt_lock);
815         c->next = mfc_cache_array[line];
816         mfc_cache_array[line] = c;
817         write_unlock_bh(&mrt_lock);
818
819         /*
820          *      Check to see if we resolved a queued list. If so we
821          *      need to send on the frames and tidy up.
822          */
823         spin_lock_bh(&mfc_unres_lock);
824         for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
825              cp = &uc->next) {
826                 if (uc->mfc_origin == c->mfc_origin &&
827                     uc->mfc_mcastgrp == c->mfc_mcastgrp) {
828                         *cp = uc->next;
829                         if (atomic_dec_and_test(&cache_resolve_queue_len))
830                                 del_timer(&ipmr_expire_timer);
831                         break;
832                 }
833         }
834         spin_unlock_bh(&mfc_unres_lock);
835
836         if (uc) {
837                 ipmr_cache_resolve(uc, c);
838                 kmem_cache_free(mrt_cachep, uc);
839         }
840         return 0;
841 }
842
843 /*
844  *      Close the multicast socket, and clear the vif tables etc
845  */
846
847 static void mroute_clean_tables(struct sock *sk)
848 {
849         int i;
850
851         /*
852          *      Shut down all active vif entries
853          */
854         for (i=0; i<maxvif; i++) {
855                 if (!(vif_table[i].flags&VIFF_STATIC))
856                         vif_delete(i, 0);
857         }
858
859         /*
860          *      Wipe the cache
861          */
862         for (i=0; i<MFC_LINES; i++) {
863                 struct mfc_cache *c, **cp;
864
865                 cp = &mfc_cache_array[i];
866                 while ((c = *cp) != NULL) {
867                         if (c->mfc_flags&MFC_STATIC) {
868                                 cp = &c->next;
869                                 continue;
870                         }
871                         write_lock_bh(&mrt_lock);
872                         *cp = c->next;
873                         write_unlock_bh(&mrt_lock);
874
875                         kmem_cache_free(mrt_cachep, c);
876                 }
877         }
878
879         if (atomic_read(&cache_resolve_queue_len) != 0) {
880                 struct mfc_cache *c;
881
882                 spin_lock_bh(&mfc_unres_lock);
883                 while (mfc_unres_queue != NULL) {
884                         c = mfc_unres_queue;
885                         mfc_unres_queue = c->next;
886                         spin_unlock_bh(&mfc_unres_lock);
887
888                         ipmr_destroy_unres(c);
889
890                         spin_lock_bh(&mfc_unres_lock);
891                 }
892                 spin_unlock_bh(&mfc_unres_lock);
893         }
894 }
895
896 static void mrtsock_destruct(struct sock *sk)
897 {
898         rtnl_lock();
899         if (sk == mroute_socket) {
900                 IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)--;
901
902                 write_lock_bh(&mrt_lock);
903                 mroute_socket = NULL;
904                 write_unlock_bh(&mrt_lock);
905
906                 mroute_clean_tables(sk);
907         }
908         rtnl_unlock();
909 }
910
911 /*
912  *      Socket options and virtual interface manipulation. The whole
913  *      virtual interface system is a complete heap, but unfortunately
914  *      that's how BSD mrouted happens to think. Maybe one day with a proper
915  *      MOSPF/PIM router set up we can clean this up.
916  */
917
918 int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int optlen)
919 {
920         int ret;
921         struct vifctl vif;
922         struct mfcctl mfc;
923
924         if (optname != MRT_INIT) {
925                 if (sk != mroute_socket && !capable(CAP_NET_ADMIN))
926                         return -EACCES;
927         }
928
929         switch (optname) {
930         case MRT_INIT:
931                 if (sk->sk_type != SOCK_RAW ||
932                     inet_sk(sk)->num != IPPROTO_IGMP)
933                         return -EOPNOTSUPP;
934                 if (optlen != sizeof(int))
935                         return -ENOPROTOOPT;
936
937                 rtnl_lock();
938                 if (mroute_socket) {
939                         rtnl_unlock();
940                         return -EADDRINUSE;
941                 }
942
943                 ret = ip_ra_control(sk, 1, mrtsock_destruct);
944                 if (ret == 0) {
945                         write_lock_bh(&mrt_lock);
946                         mroute_socket = sk;
947                         write_unlock_bh(&mrt_lock);
948
949                         IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)++;
950                 }
951                 rtnl_unlock();
952                 return ret;
953         case MRT_DONE:
954                 if (sk != mroute_socket)
955                         return -EACCES;
956                 return ip_ra_control(sk, 0, NULL);
957         case MRT_ADD_VIF:
958         case MRT_DEL_VIF:
959                 if (optlen != sizeof(vif))
960                         return -EINVAL;
961                 if (copy_from_user(&vif, optval, sizeof(vif)))
962                         return -EFAULT;
963                 if (vif.vifc_vifi >= MAXVIFS)
964                         return -ENFILE;
965                 rtnl_lock();
966                 if (optname == MRT_ADD_VIF) {
967                         ret = vif_add(&vif, sk==mroute_socket);
968                 } else {
969                         ret = vif_delete(vif.vifc_vifi, 0);
970                 }
971                 rtnl_unlock();
972                 return ret;
973
974                 /*
975                  *      Manipulate the forwarding caches. These live
976                  *      in a sort of kernel/user symbiosis.
977                  */
978         case MRT_ADD_MFC:
979         case MRT_DEL_MFC:
980                 if (optlen != sizeof(mfc))
981                         return -EINVAL;
982                 if (copy_from_user(&mfc, optval, sizeof(mfc)))
983                         return -EFAULT;
984                 rtnl_lock();
985                 if (optname == MRT_DEL_MFC)
986                         ret = ipmr_mfc_delete(&mfc);
987                 else
988                         ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
989                 rtnl_unlock();
990                 return ret;
991                 /*
992                  *      Control PIM assert.
993                  */
994         case MRT_ASSERT:
995         {
996                 int v;
997                 if (get_user(v,(int __user *)optval))
998                         return -EFAULT;
999                 mroute_do_assert=(v)?1:0;
1000                 return 0;
1001         }
1002 #ifdef CONFIG_IP_PIMSM
1003         case MRT_PIM:
1004         {
1005                 int v;
1006
1007                 if (get_user(v,(int __user *)optval))
1008                         return -EFAULT;
1009                 v = (v) ? 1 : 0;
1010
1011                 rtnl_lock();
1012                 ret = 0;
1013                 if (v != mroute_do_pim) {
1014                         mroute_do_pim = v;
1015                         mroute_do_assert = v;
1016 #ifdef CONFIG_IP_PIMSM_V2
1017                         if (mroute_do_pim)
1018                                 ret = inet_add_protocol(&pim_protocol,
1019                                                         IPPROTO_PIM);
1020                         else
1021                                 ret = inet_del_protocol(&pim_protocol,
1022                                                         IPPROTO_PIM);
1023                         if (ret < 0)
1024                                 ret = -EAGAIN;
1025 #endif
1026                 }
1027                 rtnl_unlock();
1028                 return ret;
1029         }
1030 #endif
1031         /*
1032          *      Spurious command, or MRT_VERSION which you cannot
1033          *      set.
1034          */
1035         default:
1036                 return -ENOPROTOOPT;
1037         }
1038 }
1039
1040 /*
1041  *      Getsock opt support for the multicast routing system.
1042  */
1043
1044 int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
1045 {
1046         int olr;
1047         int val;
1048
1049         if (optname != MRT_VERSION &&
1050 #ifdef CONFIG_IP_PIMSM
1051            optname!=MRT_PIM &&
1052 #endif
1053            optname!=MRT_ASSERT)
1054                 return -ENOPROTOOPT;
1055
1056         if (get_user(olr, optlen))
1057                 return -EFAULT;
1058
1059         olr = min_t(unsigned int, olr, sizeof(int));
1060         if (olr < 0)
1061                 return -EINVAL;
1062
1063         if (put_user(olr, optlen))
1064                 return -EFAULT;
1065         if (optname == MRT_VERSION)
1066                 val = 0x0305;
1067 #ifdef CONFIG_IP_PIMSM
1068         else if (optname == MRT_PIM)
1069                 val = mroute_do_pim;
1070 #endif
1071         else
1072                 val = mroute_do_assert;
1073         if (copy_to_user(optval, &val, olr))
1074                 return -EFAULT;
1075         return 0;
1076 }
1077
1078 /*
1079  *      The IP multicast ioctl support routines.
1080  */
1081
1082 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1083 {
1084         struct sioc_sg_req sr;
1085         struct sioc_vif_req vr;
1086         struct vif_device *vif;
1087         struct mfc_cache *c;
1088
1089         switch (cmd) {
1090         case SIOCGETVIFCNT:
1091                 if (copy_from_user(&vr, arg, sizeof(vr)))
1092                         return -EFAULT;
1093                 if (vr.vifi >= maxvif)
1094                         return -EINVAL;
1095                 read_lock(&mrt_lock);
1096                 vif=&vif_table[vr.vifi];
1097                 if (VIF_EXISTS(vr.vifi))        {
1098                         vr.icount = vif->pkt_in;
1099                         vr.ocount = vif->pkt_out;
1100                         vr.ibytes = vif->bytes_in;
1101                         vr.obytes = vif->bytes_out;
1102                         read_unlock(&mrt_lock);
1103
1104                         if (copy_to_user(arg, &vr, sizeof(vr)))
1105                                 return -EFAULT;
1106                         return 0;
1107                 }
1108                 read_unlock(&mrt_lock);
1109                 return -EADDRNOTAVAIL;
1110         case SIOCGETSGCNT:
1111                 if (copy_from_user(&sr, arg, sizeof(sr)))
1112                         return -EFAULT;
1113
1114                 read_lock(&mrt_lock);
1115                 c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1116                 if (c) {
1117                         sr.pktcnt = c->mfc_un.res.pkt;
1118                         sr.bytecnt = c->mfc_un.res.bytes;
1119                         sr.wrong_if = c->mfc_un.res.wrong_if;
1120                         read_unlock(&mrt_lock);
1121
1122                         if (copy_to_user(arg, &sr, sizeof(sr)))
1123                                 return -EFAULT;
1124                         return 0;
1125                 }
1126                 read_unlock(&mrt_lock);
1127                 return -EADDRNOTAVAIL;
1128         default:
1129                 return -ENOIOCTLCMD;
1130         }
1131 }
1132
1133
1134 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1135 {
1136         struct net_device *dev = ptr;
1137         struct vif_device *v;
1138         int ct;
1139
1140         if (!net_eq(dev_net(dev), &init_net))
1141                 return NOTIFY_DONE;
1142
1143         if (event != NETDEV_UNREGISTER)
1144                 return NOTIFY_DONE;
1145         v=&vif_table[0];
1146         for (ct=0; ct<maxvif; ct++,v++) {
1147                 if (v->dev == dev)
1148                         vif_delete(ct, 1);
1149         }
1150         return NOTIFY_DONE;
1151 }
1152
1153
1154 static struct notifier_block ip_mr_notifier = {
1155         .notifier_call = ipmr_device_event,
1156 };
1157
1158 /*
1159  *      Encapsulate a packet by attaching a valid IPIP header to it.
1160  *      This avoids tunnel drivers and other mess and gives us the speed so
1161  *      important for multicast video.
1162  */
1163
1164 static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1165 {
1166         struct iphdr *iph;
1167         struct iphdr *old_iph = ip_hdr(skb);
1168
1169         skb_push(skb, sizeof(struct iphdr));
1170         skb->transport_header = skb->network_header;
1171         skb_reset_network_header(skb);
1172         iph = ip_hdr(skb);
1173
1174         iph->version    =       4;
1175         iph->tos        =       old_iph->tos;
1176         iph->ttl        =       old_iph->ttl;
1177         iph->frag_off   =       0;
1178         iph->daddr      =       daddr;
1179         iph->saddr      =       saddr;
1180         iph->protocol   =       IPPROTO_IPIP;
1181         iph->ihl        =       5;
1182         iph->tot_len    =       htons(skb->len);
1183         ip_select_ident(iph, skb->dst, NULL);
1184         ip_send_check(iph);
1185
1186         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1187         nf_reset(skb);
1188 }
1189
1190 static inline int ipmr_forward_finish(struct sk_buff *skb)
1191 {
1192         struct ip_options * opt = &(IPCB(skb)->opt);
1193
1194         IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1195
1196         if (unlikely(opt->optlen))
1197                 ip_forward_options(skb);
1198
1199         return dst_output(skb);
1200 }
1201
1202 /*
1203  *      Processing handlers for ipmr_forward
1204  */
1205
1206 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1207 {
1208         const struct iphdr *iph = ip_hdr(skb);
1209         struct vif_device *vif = &vif_table[vifi];
1210         struct net_device *dev;
1211         struct rtable *rt;
1212         int    encap = 0;
1213
1214         if (vif->dev == NULL)
1215                 goto out_free;
1216
1217 #ifdef CONFIG_IP_PIMSM
1218         if (vif->flags & VIFF_REGISTER) {
1219                 vif->pkt_out++;
1220                 vif->bytes_out += skb->len;
1221                 vif->dev->stats.tx_bytes += skb->len;
1222                 vif->dev->stats.tx_packets++;
1223                 ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1224                 kfree_skb(skb);
1225                 return;
1226         }
1227 #endif
1228
1229         if (vif->flags&VIFF_TUNNEL) {
1230                 struct flowi fl = { .oif = vif->link,
1231                                     .nl_u = { .ip4_u =
1232                                               { .daddr = vif->remote,
1233                                                 .saddr = vif->local,
1234                                                 .tos = RT_TOS(iph->tos) } },
1235                                     .proto = IPPROTO_IPIP };
1236                 if (ip_route_output_key(&init_net, &rt, &fl))
1237                         goto out_free;
1238                 encap = sizeof(struct iphdr);
1239         } else {
1240                 struct flowi fl = { .oif = vif->link,
1241                                     .nl_u = { .ip4_u =
1242                                               { .daddr = iph->daddr,
1243                                                 .tos = RT_TOS(iph->tos) } },
1244                                     .proto = IPPROTO_IPIP };
1245                 if (ip_route_output_key(&init_net, &rt, &fl))
1246                         goto out_free;
1247         }
1248
1249         dev = rt->u.dst.dev;
1250
1251         if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1252                 /* Do not fragment multicasts. Alas, IPv4 does not
1253                    allow to send ICMP, so that packets will disappear
1254                    to blackhole.
1255                  */
1256
1257                 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
1258                 ip_rt_put(rt);
1259                 goto out_free;
1260         }
1261
1262         encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1263
1264         if (skb_cow(skb, encap)) {
1265                 ip_rt_put(rt);
1266                 goto out_free;
1267         }
1268
1269         vif->pkt_out++;
1270         vif->bytes_out += skb->len;
1271
1272         dst_release(skb->dst);
1273         skb->dst = &rt->u.dst;
1274         ip_decrease_ttl(ip_hdr(skb));
1275
1276         /* FIXME: forward and output firewalls used to be called here.
1277          * What do we do with netfilter? -- RR */
1278         if (vif->flags & VIFF_TUNNEL) {
1279                 ip_encap(skb, vif->local, vif->remote);
1280                 /* FIXME: extra output firewall step used to be here. --RR */
1281                 vif->dev->stats.tx_packets++;
1282                 vif->dev->stats.tx_bytes += skb->len;
1283         }
1284
1285         IPCB(skb)->flags |= IPSKB_FORWARDED;
1286
1287         /*
1288          * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
1289          * not only before forwarding, but after forwarding on all output
1290          * interfaces. It is clear, if mrouter runs a multicasting
1291          * program, it should receive packets not depending to what interface
1292          * program is joined.
1293          * If we will not make it, the program will have to join on all
1294          * interfaces. On the other hand, multihoming host (or router, but
1295          * not mrouter) cannot join to more than one interface - it will
1296          * result in receiving multiple packets.
1297          */
1298         NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev,
1299                 ipmr_forward_finish);
1300         return;
1301
1302 out_free:
1303         kfree_skb(skb);
1304         return;
1305 }
1306
1307 static int ipmr_find_vif(struct net_device *dev)
1308 {
1309         int ct;
1310         for (ct=maxvif-1; ct>=0; ct--) {
1311                 if (vif_table[ct].dev == dev)
1312                         break;
1313         }
1314         return ct;
1315 }
1316
1317 /* "local" means that we should preserve one skb (for local delivery) */
1318
1319 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1320 {
1321         int psend = -1;
1322         int vif, ct;
1323
1324         vif = cache->mfc_parent;
1325         cache->mfc_un.res.pkt++;
1326         cache->mfc_un.res.bytes += skb->len;
1327
1328         /*
1329          * Wrong interface: drop packet and (maybe) send PIM assert.
1330          */
1331         if (vif_table[vif].dev != skb->dev) {
1332                 int true_vifi;
1333
1334                 if (skb->rtable->fl.iif == 0) {
1335                         /* It is our own packet, looped back.
1336                            Very complicated situation...
1337
1338                            The best workaround until routing daemons will be
1339                            fixed is not to redistribute packet, if it was
1340                            send through wrong interface. It means, that
1341                            multicast applications WILL NOT work for
1342                            (S,G), which have default multicast route pointing
1343                            to wrong oif. In any case, it is not a good
1344                            idea to use multicasting applications on router.
1345                          */
1346                         goto dont_forward;
1347                 }
1348
1349                 cache->mfc_un.res.wrong_if++;
1350                 true_vifi = ipmr_find_vif(skb->dev);
1351
1352                 if (true_vifi >= 0 && mroute_do_assert &&
1353                     /* pimsm uses asserts, when switching from RPT to SPT,
1354                        so that we cannot check that packet arrived on an oif.
1355                        It is bad, but otherwise we would need to move pretty
1356                        large chunk of pimd to kernel. Ough... --ANK
1357                      */
1358                     (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1359                     time_after(jiffies,
1360                                cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1361                         cache->mfc_un.res.last_assert = jiffies;
1362                         ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1363                 }
1364                 goto dont_forward;
1365         }
1366
1367         vif_table[vif].pkt_in++;
1368         vif_table[vif].bytes_in += skb->len;
1369
1370         /*
1371          *      Forward the frame
1372          */
1373         for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1374                 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1375                         if (psend != -1) {
1376                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1377                                 if (skb2)
1378                                         ipmr_queue_xmit(skb2, cache, psend);
1379                         }
1380                         psend = ct;
1381                 }
1382         }
1383         if (psend != -1) {
1384                 if (local) {
1385                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1386                         if (skb2)
1387                                 ipmr_queue_xmit(skb2, cache, psend);
1388                 } else {
1389                         ipmr_queue_xmit(skb, cache, psend);
1390                         return 0;
1391                 }
1392         }
1393
1394 dont_forward:
1395         if (!local)
1396                 kfree_skb(skb);
1397         return 0;
1398 }
1399
1400
1401 /*
1402  *      Multicast packets for forwarding arrive here
1403  */
1404
1405 int ip_mr_input(struct sk_buff *skb)
1406 {
1407         struct mfc_cache *cache;
1408         int local = skb->rtable->rt_flags&RTCF_LOCAL;
1409
1410         /* Packet is looped back after forward, it should not be
1411            forwarded second time, but still can be delivered locally.
1412          */
1413         if (IPCB(skb)->flags&IPSKB_FORWARDED)
1414                 goto dont_forward;
1415
1416         if (!local) {
1417                     if (IPCB(skb)->opt.router_alert) {
1418                             if (ip_call_ra_chain(skb))
1419                                     return 0;
1420                     } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
1421                             /* IGMPv1 (and broken IGMPv2 implementations sort of
1422                                Cisco IOS <= 11.2(8)) do not put router alert
1423                                option to IGMP packets destined to routable
1424                                groups. It is very bad, because it means
1425                                that we can forward NO IGMP messages.
1426                              */
1427                             read_lock(&mrt_lock);
1428                             if (mroute_socket) {
1429                                     nf_reset(skb);
1430                                     raw_rcv(mroute_socket, skb);
1431                                     read_unlock(&mrt_lock);
1432                                     return 0;
1433                             }
1434                             read_unlock(&mrt_lock);
1435                     }
1436         }
1437
1438         read_lock(&mrt_lock);
1439         cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1440
1441         /*
1442          *      No usable cache entry
1443          */
1444         if (cache == NULL) {
1445                 int vif;
1446
1447                 if (local) {
1448                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1449                         ip_local_deliver(skb);
1450                         if (skb2 == NULL) {
1451                                 read_unlock(&mrt_lock);
1452                                 return -ENOBUFS;
1453                         }
1454                         skb = skb2;
1455                 }
1456
1457                 vif = ipmr_find_vif(skb->dev);
1458                 if (vif >= 0) {
1459                         int err = ipmr_cache_unresolved(vif, skb);
1460                         read_unlock(&mrt_lock);
1461
1462                         return err;
1463                 }
1464                 read_unlock(&mrt_lock);
1465                 kfree_skb(skb);
1466                 return -ENODEV;
1467         }
1468
1469         ip_mr_forward(skb, cache, local);
1470
1471         read_unlock(&mrt_lock);
1472
1473         if (local)
1474                 return ip_local_deliver(skb);
1475
1476         return 0;
1477
1478 dont_forward:
1479         if (local)
1480                 return ip_local_deliver(skb);
1481         kfree_skb(skb);
1482         return 0;
1483 }
1484
1485 #ifdef CONFIG_IP_PIMSM_V1
1486 /*
1487  * Handle IGMP messages of PIMv1
1488  */
1489
1490 int pim_rcv_v1(struct sk_buff * skb)
1491 {
1492         struct igmphdr *pim;
1493         struct iphdr   *encap;
1494         struct net_device  *reg_dev = NULL;
1495
1496         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1497                 goto drop;
1498
1499         pim = igmp_hdr(skb);
1500
1501         if (!mroute_do_pim ||
1502             skb->len < sizeof(*pim) + sizeof(*encap) ||
1503             pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1504                 goto drop;
1505
1506         encap = (struct iphdr *)(skb_transport_header(skb) +
1507                                  sizeof(struct igmphdr));
1508         /*
1509            Check that:
1510            a. packet is really destinted to a multicast group
1511            b. packet is not a NULL-REGISTER
1512            c. packet is not truncated
1513          */
1514         if (!ipv4_is_multicast(encap->daddr) ||
1515             encap->tot_len == 0 ||
1516             ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1517                 goto drop;
1518
1519         read_lock(&mrt_lock);
1520         if (reg_vif_num >= 0)
1521                 reg_dev = vif_table[reg_vif_num].dev;
1522         if (reg_dev)
1523                 dev_hold(reg_dev);
1524         read_unlock(&mrt_lock);
1525
1526         if (reg_dev == NULL)
1527                 goto drop;
1528
1529         skb->mac_header = skb->network_header;
1530         skb_pull(skb, (u8*)encap - skb->data);
1531         skb_reset_network_header(skb);
1532         skb->dev = reg_dev;
1533         skb->protocol = htons(ETH_P_IP);
1534         skb->ip_summed = 0;
1535         skb->pkt_type = PACKET_HOST;
1536         dst_release(skb->dst);
1537         skb->dst = NULL;
1538         reg_dev->stats.rx_bytes += skb->len;
1539         reg_dev->stats.rx_packets++;
1540         nf_reset(skb);
1541         netif_rx(skb);
1542         dev_put(reg_dev);
1543         return 0;
1544  drop:
1545         kfree_skb(skb);
1546         return 0;
1547 }
1548 #endif
1549
1550 #ifdef CONFIG_IP_PIMSM_V2
1551 static int pim_rcv(struct sk_buff * skb)
1552 {
1553         struct pimreghdr *pim;
1554         struct iphdr   *encap;
1555         struct net_device  *reg_dev = NULL;
1556
1557         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1558                 goto drop;
1559
1560         pim = (struct pimreghdr *)skb_transport_header(skb);
1561         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1562             (pim->flags&PIM_NULL_REGISTER) ||
1563             (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1564              csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1565                 goto drop;
1566
1567         /* check if the inner packet is destined to mcast group */
1568         encap = (struct iphdr *)(skb_transport_header(skb) +
1569                                  sizeof(struct pimreghdr));
1570         if (!ipv4_is_multicast(encap->daddr) ||
1571             encap->tot_len == 0 ||
1572             ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1573                 goto drop;
1574
1575         read_lock(&mrt_lock);
1576         if (reg_vif_num >= 0)
1577                 reg_dev = vif_table[reg_vif_num].dev;
1578         if (reg_dev)
1579                 dev_hold(reg_dev);
1580         read_unlock(&mrt_lock);
1581
1582         if (reg_dev == NULL)
1583                 goto drop;
1584
1585         skb->mac_header = skb->network_header;
1586         skb_pull(skb, (u8*)encap - skb->data);
1587         skb_reset_network_header(skb);
1588         skb->dev = reg_dev;
1589         skb->protocol = htons(ETH_P_IP);
1590         skb->ip_summed = 0;
1591         skb->pkt_type = PACKET_HOST;
1592         dst_release(skb->dst);
1593         reg_dev->stats.rx_bytes += skb->len;
1594         reg_dev->stats.rx_packets++;
1595         skb->dst = NULL;
1596         nf_reset(skb);
1597         netif_rx(skb);
1598         dev_put(reg_dev);
1599         return 0;
1600  drop:
1601         kfree_skb(skb);
1602         return 0;
1603 }
1604 #endif
1605
1606 static int
1607 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1608 {
1609         int ct;
1610         struct rtnexthop *nhp;
1611         struct net_device *dev = vif_table[c->mfc_parent].dev;
1612         u8 *b = skb_tail_pointer(skb);
1613         struct rtattr *mp_head;
1614
1615         if (dev)
1616                 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1617
1618         mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
1619
1620         for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1621                 if (c->mfc_un.res.ttls[ct] < 255) {
1622                         if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1623                                 goto rtattr_failure;
1624                         nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1625                         nhp->rtnh_flags = 0;
1626                         nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1627                         nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
1628                         nhp->rtnh_len = sizeof(*nhp);
1629                 }
1630         }
1631         mp_head->rta_type = RTA_MULTIPATH;
1632         mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1633         rtm->rtm_type = RTN_MULTICAST;
1634         return 1;
1635
1636 rtattr_failure:
1637         nlmsg_trim(skb, b);
1638         return -EMSGSIZE;
1639 }
1640
1641 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1642 {
1643         int err;
1644         struct mfc_cache *cache;
1645         struct rtable *rt = skb->rtable;
1646
1647         read_lock(&mrt_lock);
1648         cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1649
1650         if (cache == NULL) {
1651                 struct sk_buff *skb2;
1652                 struct iphdr *iph;
1653                 struct net_device *dev;
1654                 int vif;
1655
1656                 if (nowait) {
1657                         read_unlock(&mrt_lock);
1658                         return -EAGAIN;
1659                 }
1660
1661                 dev = skb->dev;
1662                 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1663                         read_unlock(&mrt_lock);
1664                         return -ENODEV;
1665                 }
1666                 skb2 = skb_clone(skb, GFP_ATOMIC);
1667                 if (!skb2) {
1668                         read_unlock(&mrt_lock);
1669                         return -ENOMEM;
1670                 }
1671
1672                 skb_push(skb2, sizeof(struct iphdr));
1673                 skb_reset_network_header(skb2);
1674                 iph = ip_hdr(skb2);
1675                 iph->ihl = sizeof(struct iphdr) >> 2;
1676                 iph->saddr = rt->rt_src;
1677                 iph->daddr = rt->rt_dst;
1678                 iph->version = 0;
1679                 err = ipmr_cache_unresolved(vif, skb2);
1680                 read_unlock(&mrt_lock);
1681                 return err;
1682         }
1683
1684         if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1685                 cache->mfc_flags |= MFC_NOTIFY;
1686         err = ipmr_fill_mroute(skb, cache, rtm);
1687         read_unlock(&mrt_lock);
1688         return err;
1689 }
1690
1691 #ifdef CONFIG_PROC_FS
1692 /*
1693  *      The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif
1694  */
/* Private seq_file state for the vif listing. */
struct ipmr_vif_iter {
	int ct;		/* vif_table index the walk is currently at */
};
1698
1699 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1700                                            loff_t pos)
1701 {
1702         for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1703                 if (!VIF_EXISTS(iter->ct))
1704                         continue;
1705                 if (pos-- == 0)
1706                         return &vif_table[iter->ct];
1707         }
1708         return NULL;
1709 }
1710
1711 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1712         __acquires(mrt_lock)
1713 {
1714         read_lock(&mrt_lock);
1715         return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1716                 : SEQ_START_TOKEN;
1717 }
1718
1719 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1720 {
1721         struct ipmr_vif_iter *iter = seq->private;
1722
1723         ++*pos;
1724         if (v == SEQ_START_TOKEN)
1725                 return ipmr_vif_seq_idx(iter, 0);
1726
1727         while (++iter->ct < maxvif) {
1728                 if (!VIF_EXISTS(iter->ct))
1729                         continue;
1730                 return &vif_table[iter->ct];
1731         }
1732         return NULL;
1733 }
1734
1735 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1736         __releases(mrt_lock)
1737 {
1738         read_unlock(&mrt_lock);
1739 }
1740
1741 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1742 {
1743         if (v == SEQ_START_TOKEN) {
1744                 seq_puts(seq,
1745                          "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1746         } else {
1747                 const struct vif_device *vif = v;
1748                 const char *name =  vif->dev ? vif->dev->name : "none";
1749
1750                 seq_printf(seq,
1751                            "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1752                            vif - vif_table,
1753                            name, vif->bytes_in, vif->pkt_in,
1754                            vif->bytes_out, vif->pkt_out,
1755                            vif->flags, vif->local, vif->remote);
1756         }
1757         return 0;
1758 }
1759
1760 static const struct seq_operations ipmr_vif_seq_ops = {
1761         .start = ipmr_vif_seq_start,
1762         .next  = ipmr_vif_seq_next,
1763         .stop  = ipmr_vif_seq_stop,
1764         .show  = ipmr_vif_seq_show,
1765 };
1766
1767 static int ipmr_vif_open(struct inode *inode, struct file *file)
1768 {
1769         return seq_open_private(file, &ipmr_vif_seq_ops,
1770                         sizeof(struct ipmr_vif_iter));
1771 }
1772
1773 static const struct file_operations ipmr_vif_fops = {
1774         .owner   = THIS_MODULE,
1775         .open    = ipmr_vif_open,
1776         .read    = seq_read,
1777         .llseek  = seq_lseek,
1778         .release = seq_release_private,
1779 };
1780
/* Private seq_file state for the forwarding cache listing. */
struct ipmr_mfc_iter {
	/*
	 * List currently being walked: mfc_cache_array, &mfc_unres_queue,
	 * or NULL when the walk is not inside either list.
	 */
	struct mfc_cache **cache;
	int ct;		/* current bucket index within mfc_cache_array */
};
1785
1786
1787 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1788 {
1789         struct mfc_cache *mfc;
1790
1791         it->cache = mfc_cache_array;
1792         read_lock(&mrt_lock);
1793         for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1794                 for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1795                         if (pos-- == 0)
1796                                 return mfc;
1797         read_unlock(&mrt_lock);
1798
1799         it->cache = &mfc_unres_queue;
1800         spin_lock_bh(&mfc_unres_lock);
1801         for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1802                 if (pos-- == 0)
1803                         return mfc;
1804         spin_unlock_bh(&mfc_unres_lock);
1805
1806         it->cache = NULL;
1807         return NULL;
1808 }
1809
1810
1811 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1812 {
1813         struct ipmr_mfc_iter *it = seq->private;
1814         it->cache = NULL;
1815         it->ct = 0;
1816         return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1817                 : SEQ_START_TOKEN;
1818 }
1819
1820 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1821 {
1822         struct mfc_cache *mfc = v;
1823         struct ipmr_mfc_iter *it = seq->private;
1824
1825         ++*pos;
1826
1827         if (v == SEQ_START_TOKEN)
1828                 return ipmr_mfc_seq_idx(seq->private, 0);
1829
1830         if (mfc->next)
1831                 return mfc->next;
1832
1833         if (it->cache == &mfc_unres_queue)
1834                 goto end_of_list;
1835
1836         BUG_ON(it->cache != mfc_cache_array);
1837
1838         while (++it->ct < MFC_LINES) {
1839                 mfc = mfc_cache_array[it->ct];
1840                 if (mfc)
1841                         return mfc;
1842         }
1843
1844         /* exhausted cache_array, show unresolved */
1845         read_unlock(&mrt_lock);
1846         it->cache = &mfc_unres_queue;
1847         it->ct = 0;
1848
1849         spin_lock_bh(&mfc_unres_lock);
1850         mfc = mfc_unres_queue;
1851         if (mfc)
1852                 return mfc;
1853
1854  end_of_list:
1855         spin_unlock_bh(&mfc_unres_lock);
1856         it->cache = NULL;
1857
1858         return NULL;
1859 }
1860
1861 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1862 {
1863         struct ipmr_mfc_iter *it = seq->private;
1864
1865         if (it->cache == &mfc_unres_queue)
1866                 spin_unlock_bh(&mfc_unres_lock);
1867         else if (it->cache == mfc_cache_array)
1868                 read_unlock(&mrt_lock);
1869 }
1870
1871 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1872 {
1873         int n;
1874
1875         if (v == SEQ_START_TOKEN) {
1876                 seq_puts(seq,
1877                  "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1878         } else {
1879                 const struct mfc_cache *mfc = v;
1880                 const struct ipmr_mfc_iter *it = seq->private;
1881
1882                 seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1883                            (unsigned long) mfc->mfc_mcastgrp,
1884                            (unsigned long) mfc->mfc_origin,
1885                            mfc->mfc_parent,
1886                            mfc->mfc_un.res.pkt,
1887                            mfc->mfc_un.res.bytes,
1888                            mfc->mfc_un.res.wrong_if);
1889
1890                 if (it->cache != &mfc_unres_queue) {
1891                         for (n = mfc->mfc_un.res.minvif;
1892                              n < mfc->mfc_un.res.maxvif; n++ ) {
1893                                 if (VIF_EXISTS(n)
1894                                    && mfc->mfc_un.res.ttls[n] < 255)
1895                                 seq_printf(seq,
1896                                            " %2d:%-3d",
1897                                            n, mfc->mfc_un.res.ttls[n]);
1898                         }
1899                 }
1900                 seq_putc(seq, '\n');
1901         }
1902         return 0;
1903 }
1904
1905 static const struct seq_operations ipmr_mfc_seq_ops = {
1906         .start = ipmr_mfc_seq_start,
1907         .next  = ipmr_mfc_seq_next,
1908         .stop  = ipmr_mfc_seq_stop,
1909         .show  = ipmr_mfc_seq_show,
1910 };
1911
1912 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1913 {
1914         return seq_open_private(file, &ipmr_mfc_seq_ops,
1915                         sizeof(struct ipmr_mfc_iter));
1916 }
1917
1918 static const struct file_operations ipmr_mfc_fops = {
1919         .owner   = THIS_MODULE,
1920         .open    = ipmr_mfc_open,
1921         .read    = seq_read,
1922         .llseek  = seq_lseek,
1923         .release = seq_release_private,
1924 };
1925 #endif
1926
1927 #ifdef CONFIG_IP_PIMSM_V2
1928 static struct net_protocol pim_protocol = {  /* protocol descriptor for PIM; only built when CONFIG_IP_PIMSM_V2 is set */
1929         .handler        =       pim_rcv,  /* receive callback; every other field is left zero-initialized */
1930 };
1931 #endif
1932
1933
1934 /*
1935  *      Setup for IP multicast routing
1936  */
1937
1938 int __init ip_mr_init(void)
1939 {
1940         int err;
1941
1942         mrt_cachep = kmem_cache_create("ip_mrt_cache",
1943                                        sizeof(struct mfc_cache),
1944                                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1945                                        NULL);
1946         if (!mrt_cachep)
1947                 return -ENOMEM;
1948
1949         setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
1950         err = register_netdevice_notifier(&ip_mr_notifier);
1951         if (err)
1952                 goto reg_notif_fail;
1953 #ifdef CONFIG_PROC_FS
1954         err = -ENOMEM;
1955         if (!proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops))
1956                 goto proc_vif_fail;
1957         if (!proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops))
1958                 goto proc_cache_fail;
1959 #endif
1960         return 0;
1961 #ifdef CONFIG_PROC_FS
1962 proc_cache_fail:
1963         proc_net_remove(&init_net, "ip_mr_vif");
1964 proc_vif_fail:
1965         unregister_netdevice_notifier(&ip_mr_notifier);
1966 #endif
1967 reg_notif_fail:
1968         del_timer(&ipmr_expire_timer);
1969         kmem_cache_destroy(mrt_cachep);
1970         return err;
1971 }