]> bbs.cooldavid.org Git - net-next-2.6.git/blob - net/packet/af_packet.c
8298e676f5a015f58d1b6005cf85938f8c8e142a
[net-next-2.6.git] / net / packet / af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *
12  * Fixes:
13  *              Alan Cox        :       verify_area() now used correctly
14  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
15  *              Alan Cox        :       tidied skbuff lists.
16  *              Alan Cox        :       Now uses generic datagram routines I
17  *                                      added. Also fixed the peek/read crash
18  *                                      from all old Linux datagram code.
19  *              Alan Cox        :       Uses the improved datagram code.
20  *              Alan Cox        :       Added NULL's for socket options.
21  *              Alan Cox        :       Re-commented the code.
22  *              Alan Cox        :       Use new kernel side addressing
23  *              Rob Janssen     :       Correct MTU usage.
24  *              Dave Platt      :       Counter leaks caused by incorrect
25  *                                      interrupt locking and some slightly
26  *                                      dubious gcc output. Can you read
27  *                                      compiler: it said _VOLATILE_
28  *      Richard Kooijman        :       Timestamp fixes.
29  *              Alan Cox        :       New buffers. Use sk->mac.raw.
30  *              Alan Cox        :       sendmsg/recvmsg support.
31  *              Alan Cox        :       Protocol setting support
32  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
33  *      Cyrus Durgin            :       Fixed kerneld for kmod.
34  *      Michal Ostrowski        :       Module initialization cleanup.
35  *         Ulises Alonso        :       Frame number limit removal and
36  *                                      packet_set_ring memory leak.
37  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
38  *                                      The convention is that longer addresses
39  *                                      will simply extend the hardware address
40  *                                      byte arrays at the end of sockaddr_ll
41  *                                      and packet_mreq.
42  *              Johann Baudy    :       Added TX RING.
43  *
44  *              This program is free software; you can redistribute it and/or
45  *              modify it under the terms of the GNU General Public License
46  *              as published by the Free Software Foundation; either version
47  *              2 of the License, or (at your option) any later version.
48  *
49  */
50
51 #include <linux/types.h>
52 #include <linux/mm.h>
53 #include <linux/capability.h>
54 #include <linux/fcntl.h>
55 #include <linux/socket.h>
56 #include <linux/in.h>
57 #include <linux/inet.h>
58 #include <linux/netdevice.h>
59 #include <linux/if_packet.h>
60 #include <linux/wireless.h>
61 #include <linux/kernel.h>
62 #include <linux/kmod.h>
63 #include <linux/slab.h>
64 #include <net/net_namespace.h>
65 #include <net/ip.h>
66 #include <net/protocol.h>
67 #include <linux/skbuff.h>
68 #include <net/sock.h>
69 #include <linux/errno.h>
70 #include <linux/timer.h>
71 #include <asm/system.h>
72 #include <asm/uaccess.h>
73 #include <asm/ioctls.h>
74 #include <asm/page.h>
75 #include <asm/cacheflush.h>
76 #include <asm/io.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/poll.h>
80 #include <linux/module.h>
81 #include <linux/init.h>
82 #include <linux/mutex.h>
83 #include <linux/if_vlan.h>
84 #include <linux/virtio_net.h>
85 #include <linux/errqueue.h>
86 #include <linux/net_tstamp.h>
87
88 #ifdef CONFIG_INET
89 #include <net/inet_common.h>
90 #endif
91
92 /*
93    Assumptions:
94    - if device has no dev->hard_header routine, it adds and removes ll header
95      inside itself. In this case ll header is invisible outside of device,
96      but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit into the reserved space (tunnel devices); others are not
     (PPP).
100    - packet socket receives packets with pulled ll header,
101      so that SOCK_RAW should push it back.
102
103 On receive:
104 -----------
105
106 Incoming, dev->hard_header!=NULL
107    mac_header -> ll header
108    data       -> data
109
110 Outgoing, dev->hard_header!=NULL
111    mac_header -> ll header
112    data       -> ll header
113
114 Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position.  It is very likely that it points to the
                 ll header.  PPP does this, which is wrong, because it
                 introduces asymmetry between the rx and tx paths.
118    data       -> data
119
120 Outgoing, dev->hard_header==NULL
121    mac_header -> data. ll header is still not built!
122    data       -> data
123
Summary
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
126
127
128 On transmit:
129 ------------
130
131 dev->hard_header != NULL
132    mac_header -> ll header
133    data       -> ll header
134
135 dev->hard_header == NULL (ll header is added by device, we cannot control it)
136    mac_header -> data
137    data       -> data
138
   We should set nh.raw on output to the correct position;
   the packet classifier depends on it.
141  */
142
143 /* Private packet socket structures. */
144
/*
 * One multicast/promiscuous membership held by a packet socket.
 * Entries are kept on a singly linked list hanging off
 * packet_sock->mclist (see packet_flush_mclist()).
 */
struct packet_mclist {
	struct packet_mclist	*next;		/* next membership on this socket */
	int			ifindex;	/* device this membership applies to */
	int			count;		/* presumably a join refcount for duplicate requests — confirm against mc_add/mc_drop */
	unsigned short		type;		/* PACKET_MR_* membership type */
	unsigned short		alen;		/* number of valid bytes in addr[] */
	unsigned char		addr[MAX_ADDR_LEN];	/* hardware address */
};
/* identical to struct packet_mreq except it has
 * a longer address field (MAX_ADDR_LEN instead of 8 bytes), per the
 * convention that longer hardware addresses simply extend the array
 * at the end of the structure.
 */
struct packet_mreq_max {
	int		mr_ifindex;	/* interface index */
	unsigned short	mr_type;	/* PACKET_MR_* request type */
	unsigned short	mr_alen;	/* address length */
	unsigned char	mr_address[MAX_ADDR_LEN];	/* physical-layer address */
};
162
163 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
164                 int closing, int tx_ring);
165
/*
 * One RX or TX frame ring.  Frames live in pg_vec: an array of
 * pg_vec_len contiguous blocks, each block holding frames_per_block
 * frames of frame_size bytes (see packet_lookup_frame() for the
 * index -> block/offset mapping).
 */
struct packet_ring_buffer {
	char			**pg_vec;	/* array of block base pointers */
	unsigned int		head;		/* next frame index to use; wraps at frame_max */
	unsigned int		frames_per_block;
	unsigned int		frame_size;	/* bytes per frame slot */
	unsigned int		frame_max;	/* highest valid frame index (count - 1) */

	unsigned int		pg_vec_order;	/* page allocation order of each block */
	unsigned int		pg_vec_pages;	/* pages per block */
	unsigned int		pg_vec_len;	/* number of blocks in pg_vec */

	atomic_t		pending;	/* TX: frames handed to the stack, not yet freed */
};
179
180 struct packet_sock;
181 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
182
183 static void packet_flush_mclist(struct sock *sk);
184
/* Per-socket state of an AF_PACKET socket. */
struct packet_sock {
	/* struct sock has to be the first member of packet_sock
	 * so that pkt_sk() can simply cast. */
	struct sock		sk;
	struct tpacket_stats	stats;		/* tp_packets / tp_drops counters */
	struct packet_ring_buffer	rx_ring;	/* mmap RX ring (tpacket_rcv) */
	struct packet_ring_buffer	tx_ring;	/* mmap TX ring (tpacket_snd) */
	int			copy_thresh;	/* queue a full copy if ring frame too small */
	spinlock_t		bind_lock;
	struct mutex		pg_vec_lock;	/* serializes ring (re)configuration */
	unsigned int		running:1,	/* prot_hook is attached*/
				auxdata:1,	/* PACKET_AUXDATA enabled */
				origdev:1,	/* report orig_dev->ifindex, not dev's */
				has_vnet_hdr:1;	/* PACKET_VNET_HDR enabled */
	int			ifindex;	/* bound device         */
	__be16			num;		/* bound protocol number */
	struct packet_mclist	*mclist;	/* multicast memberships */
	atomic_t		mapped;		/* ring currently mmap'ed this many times */
	enum tpacket_versions	tp_version;	/* TPACKET_V1 or TPACKET_V2 frame layout */
	unsigned int		tp_hdrlen;	/* header length for tp_version */
	unsigned int		tp_reserve;	/* extra headroom in each ring frame */
	unsigned int		tp_loss:1;	/* PACKET_LOSS behaviour on TX */
	unsigned int		tp_tstamp;	/* SOF_TIMESTAMPING_* selection for the ring */
	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
};
209
/*
 * Layout of the skb control block (skb->cb) while a packet sits on an
 * AF_PACKET receive queue: the pre-trim length plus the address that
 * recvmsg() will copy to the caller.  packet_rcv() contains a
 * BUILD_BUG_ON that this (plus an extended hardware address) fits.
 */
struct packet_skb_cb {
	unsigned int origlen;	/* packet length before pskb_trim() */
	union {
		struct sockaddr_pkt pkt;	/* SOCK_PACKET sockets */
		struct sockaddr_ll ll;		/* AF_PACKET sockets */
	} sa;
};

/* Accessor for the packet-socket view of skb->cb. */
#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
219
/*
 * Publish a new status word into a ring frame header, dispatching on
 * the frame layout (TPACKET_V1 vs V2).  The status word is the
 * kernel/user hand-off flag for the mmap'ed frame, so the dcache is
 * flushed to make the store visible through the user mapping on
 * non-coherent architectures, followed by a write barrier.
 * NOTE(review): the smp_wmb() is placed after the status store —
 * callers appear to rely on it ordering these stores against their
 * subsequent accesses; confirm against the tpacket_rcv() caller.
 */
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(virt_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(virt_to_page(&h.h2->tp_status));
		break;
	default:
		/* tp_version is validated at setsockopt time; anything else is a bug */
		pr_err("TPACKET version not supported\n");
		BUG();
	}

	smp_wmb();
}
245
/*
 * Read the current status word of a ring frame header (layout chosen
 * by po->tp_version).  The read barrier pairs with the writer side in
 * __packet_set_status(); the dcache flush makes a user-space store to
 * the mmap'ed status visible to the kernel on non-coherent caches.
 */
static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(virt_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(virt_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	default:
		/* tp_version is validated at setsockopt time; anything else is a bug */
		pr_err("TPACKET version not supported\n");
		BUG();
		return 0;
	}
}
270
271 static void *packet_lookup_frame(struct packet_sock *po,
272                 struct packet_ring_buffer *rb,
273                 unsigned int position,
274                 int status)
275 {
276         unsigned int pg_vec_pos, frame_offset;
277         union {
278                 struct tpacket_hdr *h1;
279                 struct tpacket2_hdr *h2;
280                 void *raw;
281         } h;
282
283         pg_vec_pos = position / rb->frames_per_block;
284         frame_offset = position % rb->frames_per_block;
285
286         h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
287
288         if (status != __packet_get_status(po, h.raw))
289                 return NULL;
290
291         return h.raw;
292 }
293
294 static inline void *packet_current_frame(struct packet_sock *po,
295                 struct packet_ring_buffer *rb,
296                 int status)
297 {
298         return packet_lookup_frame(po, rb, rb->head, status);
299 }
300
301 static inline void *packet_previous_frame(struct packet_sock *po,
302                 struct packet_ring_buffer *rb,
303                 int status)
304 {
305         unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
306         return packet_lookup_frame(po, rb, previous, status);
307 }
308
309 static inline void packet_increment_head(struct packet_ring_buffer *buff)
310 {
311         buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
312 }
313
/*
 * Convert a struct sock to its containing packet_sock.  Valid because
 * struct sock is required to be the first member of packet_sock.
 */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	return (struct packet_sock *)sk;
}
318
/*
 * sk->sk_destruct callback: runs when a packet socket is finally
 * freed.  Purges any leftover error-queue skbs and sanity-checks that
 * all receive/transmit memory accounting has returned to zero.
 */
static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	/* By now nothing may be charged to this socket anymore. */
	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		/* Destructing a socket that is not marked dead is a bug;
		 * bail out rather than corrupt the refcount debugging. */
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}
333
334
335 static const struct proto_ops packet_ops;
336
337 static const struct proto_ops packet_ops_spkt;
338
/*
 * Receive hook for legacy SOCK_PACKET sockets.  Called from the
 * protocol demux for every frame matching the socket's registered
 * packet_type; fills in a sockaddr_pkt in skb->cb and queues the skb
 * on the socket's receive queue.  Always returns 0 (the skb is either
 * queued or freed here).
 */
static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so that this procedure is noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	/* Only deliver within the socket's network namespace. */
	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	/* We are about to mangle the skb (push the ll header back);
	 * get a private copy if it is shared. */
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;	/* queued; ownership passed to the socket */

out:
	kfree_skb(skb);
oom:
	return 0;
}
404
405
406 /*
407  *      Output a raw packet to a device layer. This bypasses all the other
408  *      protocol layers and you must therefore supply it with a complete frame
409  */
410
/*
 * sendmsg() for SOCK_PACKET sockets: transmit one complete raw frame
 * on the device named in the sockaddr_pkt address.  The caller must
 * supply the full frame (no fragmentation is done here).  Returns the
 * number of bytes sent or a negative errno.
 *
 * Allocation trick: the device lookup happens under rcu_read_lock(),
 * where we must not sleep.  So on the first pass (skb == NULL) the
 * lock is dropped, the skb is allocated with GFP_KERNEL, and we jump
 * back to "retry" to look the device up again.
 */
static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	__be16 proto = 0;
	int err;

	/*
	 *	Get and verify the address.
	 */

	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		/* Only trust spkt_protocol if the full sockaddr_pkt was given. */
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	/* Force NUL termination of the user-supplied device name. */
	saddr->spkt_device[13] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len)
		goto out_unlock;

	if (!skb) {
		/* First pass: no buffer yet.  Drop the RCU lock so we may
		 * sleep in the allocator, then retry the device lookup. */
		size_t reserved = LL_RESERVED_SPACE(dev);
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();
		skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
		/* FIXME: Save some space for broken drivers that write a hard
		 * header at transmission time by themselves. PPP is the notable
		 * one here. This should really be fixed at the driver level.
		 */
		skb_reserve(skb, reserved);
		skb_reset_network_header(skb);

		/* Try to align data part correctly */
		if (hhlen) {
			skb->data -= hhlen;
			skb->tail -= hhlen;
			if (len < hhlen)
				skb_reset_network_header(skb);
		}
		err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
		if (err)
			goto out_free;
		goto retry;
	}


	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
	if (err < 0)
		goto out_unlock;	/* falls through to out_free: skb is freed */

	dev_queue_xmit(skb);
	rcu_read_unlock();
	return len;

out_unlock:
	rcu_read_unlock();
out_free:
	kfree_skb(skb);	/* kfree_skb(NULL) is a no-op on the first pass */
	return err;
}
505
/*
 * Run the socket's attached BPF filter, if any, against @skb.
 * @res is the default snap length when no filter is attached.
 * Returns 0 to drop the packet, otherwise the number of bytes of it
 * to keep.  The filter pointer is RCU-protected (BH variant).
 */
static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
				      unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock_bh();
	filter = rcu_dereference_bh(sk->sk_filter);
	if (filter != NULL)
		res = sk_run_filter(skb, filter->insns, filter->len);
	rcu_read_unlock_bh();

	return res;
}
519
520 /*
521    This function makes lazy skb cloning in hope that most of packets
522    are discarded by BPF.
523
524    Note tricky part: we DO mangle shared skb! skb->data, skb->len
525    and skb->cb are mangled. It works because (and until) packets
526    falling here are owned by current CPU. Output packets are cloned
527    by dev_queue_xmit_nit(), input packets are processed by net_bh
   sequentially, so that if we return the skb to its original state on exit,
529    we will not harm anyone.
530  */
531
/*
 * Receive hook for AF_PACKET sockets on the non-mmap path.  Runs the
 * socket's BPF filter, fills a sockaddr_ll into skb->cb, trims the
 * skb to the snap length and queues it on sk_receive_queue.  Always
 * returns 0.
 *
 * May temporarily mangle a *shared* skb (skb->data/len/cb); the saved
 * skb_head/skb_len are used to restore it before returning on the
 * drop paths, per the lazy-cloning scheme described above.
 */
static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 *skb_head = skb->data;	/* saved so a shared skb can be restored */
	int skb_len = skb->len;
	unsigned int snaplen, res;

	/* Loopback copies were already seen on the original transmit. */
	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	/* Only deliver within the socket's network namespace. */
	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	skb->dev = dev;

	if (dev->header_ops) {
		/* The device has an explicit notion of ll header,
		   exported to higher levels.

		   Otherwise, the device hides the details of its frame
		   structure, so that the corresponding packet head is
		   never delivered to the user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	snaplen = skb->len;

	/* BPF filter: 0 drops the packet, otherwise it caps the length. */
	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	/* Receive-buffer accounting: count a drop when over budget. */
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf)
		goto drop_n_acct;

	if (skb_shared(skb)) {
		/* We may not keep a shared skb mangled: clone it, restore
		 * the original, and continue with the private clone. */
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		kfree_skb(skb);
		skb = nskb;
	}

	/* sockaddr_ll with an extended hardware address must fit in cb[]. */
	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
		     sizeof(skb->cb));

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

	/* Remember the untrimmed length for recvmsg/auxdata reporting. */
	PACKET_SKB_CB(skb)->origlen = skb->len;

	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_packets++;
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk, skb->len);
	return 0;

drop_n_acct:
	po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);

drop_n_restore:
	/* Undo our header push/pull on a still-shared skb. */
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	consume_skb(skb);
	return 0;
}
641
/*
 * Receive hook for AF_PACKET sockets with an mmap'ed RX ring.  Copies
 * the (filtered, possibly truncated) packet into the next free
 * TP_STATUS_KERNEL frame, fills in the tpacket header and sockaddr_ll
 * and hands the frame to user space by setting its status word.
 * Always returns 0.
 *
 * If the frame slot is too small and copy_thresh allows it, a full
 * copy of the skb is additionally queued on sk_receive_queue and
 * TP_STATUS_COPY is set so user space knows to recvmsg() the rest.
 */
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct packet_sock *po;
	struct sockaddr_ll *sll;
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;
	u8 *skb_head = skb->data;	/* saved so a shared skb can be restored */
	int skb_len = skb->len;
	unsigned int snaplen, res;
	/* LOSING is cleared below if no drops have been counted. */
	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
	unsigned short macoff, netoff, hdrlen;
	struct sk_buff *copy_skb = NULL;
	struct timeval tv;
	struct timespec ts;
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	/* Only deliver within the socket's network namespace. */
	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	if (dev->header_ops) {
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	/* Tell user space the checksum is not yet computed over the payload. */
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		status |= TP_STATUS_CSUMNOTREADY;

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	/* Compute where the mac and net headers land inside the frame.
	 * SOCK_DGRAM delivers no link-level header, so both coincide. */
	if (sk->sk_type == SOCK_DGRAM) {
		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
				  po->tp_reserve;
	} else {
		unsigned maclen = skb_network_offset(skb);
		netoff = TPACKET_ALIGN(po->tp_hdrlen +
				       (maclen < 16 ? 16 : maclen)) +
			po->tp_reserve;
		macoff = netoff - maclen;
	}

	if (macoff + snaplen > po->rx_ring.frame_size) {
		/* Packet does not fit in the frame: optionally queue a
		 * full copy on the receive queue, then truncate. */
		if (po->copy_thresh &&
		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
		    (unsigned)sk->sk_rcvbuf) {
			if (skb_shared(skb)) {
				copy_skb = skb_clone(skb, GFP_ATOMIC);
			} else {
				copy_skb = skb_get(skb);
				skb_head = skb->data;
			}
			if (copy_skb)
				skb_set_owner_r(copy_skb, sk);
		}
		snaplen = po->rx_ring.frame_size - macoff;
		if ((int)snaplen < 0)
			snaplen = 0;
	}

	spin_lock(&sk->sk_receive_queue.lock);
	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
	if (!h.raw)
		goto ring_is_full;
	packet_increment_head(&po->rx_ring);
	po->stats.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	if (!po->stats.tp_drops)
		status &= ~TP_STATUS_LOSING;
	spin_unlock(&sk->sk_receive_queue.lock);

	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

	/* Fill the version-specific tpacket header.  Timestamp priority:
	 * requested sys-hardware, then raw-hardware, then the skb's
	 * software stamp, then the current time. */
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_len = skb->len;
		h.h1->tp_snaplen = snaplen;
		h.h1->tp_mac = macoff;
		h.h1->tp_net = netoff;
		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
				&& shhwtstamps->syststamp.tv64)
			tv = ktime_to_timeval(shhwtstamps->syststamp);
		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
				&& shhwtstamps->hwtstamp.tv64)
			tv = ktime_to_timeval(shhwtstamps->hwtstamp);
		else if (skb->tstamp.tv64)
			tv = ktime_to_timeval(skb->tstamp);
		else
			do_gettimeofday(&tv);
		h.h1->tp_sec = tv.tv_sec;
		h.h1->tp_usec = tv.tv_usec;
		hdrlen = sizeof(*h.h1);
		break;
	case TPACKET_V2:
		h.h2->tp_len = skb->len;
		h.h2->tp_snaplen = snaplen;
		h.h2->tp_mac = macoff;
		h.h2->tp_net = netoff;
		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
				&& shhwtstamps->syststamp.tv64)
			ts = ktime_to_timespec(shhwtstamps->syststamp);
		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
				&& shhwtstamps->hwtstamp.tv64)
			ts = ktime_to_timespec(shhwtstamps->hwtstamp);
		else if (skb->tstamp.tv64)
			ts = ktime_to_timespec(skb->tstamp);
		else
			getnstimeofday(&ts);
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
		hdrlen = sizeof(*h.h2);
		break;
	default:
		BUG();
	}

	/* sockaddr_ll follows the tpacket header inside the frame. */
	sll = h.raw + TPACKET_ALIGN(hdrlen);
	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	/* Hand the frame to user space; barrier + dcache flush make the
	 * whole written range visible through the user mapping. */
	__packet_set_status(po, h.raw, status);
	smp_mb();
	{
		struct page *p_start, *p_end;
		u8 *h_end = h.raw + macoff + snaplen - 1;

		p_start = virt_to_page(h.raw);
		p_end = virt_to_page(h_end);
		while (p_start <= p_end) {
			flush_dcache_page(p_start);
			p_start++;
		}
	}

	sk->sk_data_ready(sk, 0);

drop_n_restore:
	/* Undo our header push/pull on a still-shared skb. */
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;

ring_is_full:
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

	/* Wake the reader anyway so it can drain the ring. */
	sk->sk_data_ready(sk, 0);
	kfree_skb(copy_skb);
	goto drop_n_restore;
}
825
/*
 * skb destructor for TX-ring packets: when the stack releases the
 * skb, mark the originating ring frame TP_STATUS_AVAILABLE so user
 * space may reuse it, and drop the ring's pending count.
 */
static void tpacket_destruct_skb(struct sk_buff *skb)
{
	struct packet_sock *po = pkt_sk(skb->sk);
	void *ph;

	BUG_ON(skb == NULL);

	if (likely(po->tx_ring.pg_vec)) {
		/* destructor_arg points back at the ring frame this skb
		 * was built from (set in tpacket_fill_skb()). */
		ph = skb_shinfo(skb)->destructor_arg;
		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
		atomic_dec(&po->tx_ring.pending);
		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
	}

	/* Release the wmem charged to the socket for this skb. */
	sock_wfree(skb);
}
843
844 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
845                 void *frame, struct net_device *dev, int size_max,
846                 __be16 proto, unsigned char *addr)
847 {
848         union {
849                 struct tpacket_hdr *h1;
850                 struct tpacket2_hdr *h2;
851                 void *raw;
852         } ph;
853         int to_write, offset, len, tp_len, nr_frags, len_max;
854         struct socket *sock = po->sk.sk_socket;
855         struct page *page;
856         void *data;
857         int err;
858
859         ph.raw = frame;
860
861         skb->protocol = proto;
862         skb->dev = dev;
863         skb->priority = po->sk.sk_priority;
864         skb->mark = po->sk.sk_mark;
865         skb_shinfo(skb)->destructor_arg = ph.raw;
866
867         switch (po->tp_version) {
868         case TPACKET_V2:
869                 tp_len = ph.h2->tp_len;
870                 break;
871         default:
872                 tp_len = ph.h1->tp_len;
873                 break;
874         }
875         if (unlikely(tp_len > size_max)) {
876                 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
877                 return -EMSGSIZE;
878         }
879
880         skb_reserve(skb, LL_RESERVED_SPACE(dev));
881         skb_reset_network_header(skb);
882
883         data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
884         to_write = tp_len;
885
886         if (sock->type == SOCK_DGRAM) {
887                 err = dev_hard_header(skb, dev, ntohs(proto), addr,
888                                 NULL, tp_len);
889                 if (unlikely(err < 0))
890                         return -EINVAL;
891         } else if (dev->hard_header_len) {
892                 /* net device doesn't like empty head */
893                 if (unlikely(tp_len <= dev->hard_header_len)) {
894                         pr_err("packet size is too short (%d < %d)\n",
895                                tp_len, dev->hard_header_len);
896                         return -EINVAL;
897                 }
898
899                 skb_push(skb, dev->hard_header_len);
900                 err = skb_store_bits(skb, 0, data,
901                                 dev->hard_header_len);
902                 if (unlikely(err))
903                         return err;
904
905                 data += dev->hard_header_len;
906                 to_write -= dev->hard_header_len;
907         }
908
909         err = -EFAULT;
910         page = virt_to_page(data);
911         offset = offset_in_page(data);
912         len_max = PAGE_SIZE - offset;
913         len = ((to_write > len_max) ? len_max : to_write);
914
915         skb->data_len = to_write;
916         skb->len += to_write;
917         skb->truesize += to_write;
918         atomic_add(to_write, &po->sk.sk_wmem_alloc);
919
920         while (likely(to_write)) {
921                 nr_frags = skb_shinfo(skb)->nr_frags;
922
923                 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
924                         pr_err("Packet exceed the number of skb frags(%lu)\n",
925                                MAX_SKB_FRAGS);
926                         return -EFAULT;
927                 }
928
929                 flush_dcache_page(page);
930                 get_page(page);
931                 skb_fill_page_desc(skb,
932                                 nr_frags,
933                                 page++, offset, len);
934                 to_write -= len;
935                 offset = 0;
936                 len_max = PAGE_SIZE;
937                 len = ((to_write > len_max) ? len_max : to_write);
938         }
939
940         return tp_len;
941 }
942
/*
 * Transmit all user-ready frames from the mmap()ed TX ring.
 *
 * Frames in state TP_STATUS_SEND_REQUEST are turned into skbs by
 * tpacket_fill_skb() and queued with dev_queue_xmit(); each frame is
 * handed back to user space (TP_STATUS_AVAILABLE) from the skb
 * destructor once the packet has been consumed.  Returns the total
 * number of payload bytes queued, or a negative errno.
 */
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
        struct socket *sock;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto;
        int ifindex, err, reserve = 0;
        void *ph;
        struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
        int tp_len, size_max;
        unsigned char *addr;
        int len_sum = 0;
        int status = 0;

        sock = po->sk.sk_socket;

        /* Serializes against ring reconfiguration. */
        mutex_lock(&po->pg_vec_lock);

        err = -EBUSY;
        if (saddr == NULL) {
                /* No address supplied: use the bound device/protocol. */
                ifindex = po->ifindex;
                proto   = po->num;
                addr    = NULL;
        } else {
                /* Validate the supplied sockaddr_ll before trusting it. */
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                if (msg->msg_namelen < (saddr->sll_halen
                                        + offsetof(struct sockaddr_ll,
                                                sll_addr)))
                        goto out;
                ifindex = saddr->sll_ifindex;
                proto   = saddr->sll_protocol;
                addr    = saddr->sll_addr;
        }

        dev = dev_get_by_index(sock_net(&po->sk), ifindex);
        err = -ENXIO;
        if (unlikely(dev == NULL))
                goto out;

        reserve = dev->hard_header_len;

        err = -ENETDOWN;
        if (unlikely(!(dev->flags & IFF_UP)))
                goto out_put;

        /* Largest payload a ring frame can carry, capped by the MTU
         * plus link-layer header room. */
        size_max = po->tx_ring.frame_size
                - (po->tp_hdrlen - sizeof(struct sockaddr_ll));

        if (size_max > dev->mtu + reserve)
                size_max = dev->mtu + reserve;

        do {
                ph = packet_current_frame(po, &po->tx_ring,
                                TP_STATUS_SEND_REQUEST);

                if (unlikely(ph == NULL)) {
                        /* No frame ready: yield and re-check; the loop
                         * condition below decides whether to keep
                         * waiting for pending completions. */
                        schedule();
                        continue;
                }

                status = TP_STATUS_SEND_REQUEST;
                skb = sock_alloc_send_skb(&po->sk,
                                LL_ALLOCATED_SPACE(dev)
                                + sizeof(struct sockaddr_ll),
                                0, &err);

                if (unlikely(skb == NULL))
                        goto out_status;

                tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
                                addr);

                if (unlikely(tp_len < 0)) {
                        if (po->tp_loss) {
                                /* PACKET_LOSS set: silently discard the
                                 * malformed frame and continue. */
                                __packet_set_status(po, ph,
                                                TP_STATUS_AVAILABLE);
                                packet_increment_head(&po->tx_ring);
                                kfree_skb(skb);
                                continue;
                        } else {
                                /* Otherwise report the bad frame to user
                                 * space and abort the send. */
                                status = TP_STATUS_WRONG_FORMAT;
                                err = tp_len;
                                goto out_status;
                        }
                }

                skb->destructor = tpacket_destruct_skb;
                __packet_set_status(po, ph, TP_STATUS_SENDING);
                atomic_inc(&po->tx_ring.pending);

                status = TP_STATUS_SEND_REQUEST;
                err = dev_queue_xmit(skb);
                if (unlikely(err > 0)) {
                        err = net_xmit_errno(err);
                        if (err && __packet_get_status(po, ph) ==
                                   TP_STATUS_AVAILABLE) {
                                /* skb was destructed already */
                                skb = NULL;
                                goto out_status;
                        }
                        /*
                         * skb was dropped but not destructed yet;
                         * let's treat it like congestion or err < 0
                         */
                        err = 0;
                }
                packet_increment_head(&po->tx_ring);
                len_sum += tp_len;
        } while (likely((ph != NULL) ||
                        ((!(msg->msg_flags & MSG_DONTWAIT)) &&
                         (atomic_read(&po->tx_ring.pending))))
                );

        err = len_sum;
        goto out_put;

out_status:
        __packet_set_status(po, ph, status);
        kfree_skb(skb);
out_put:
        dev_put(dev);
out:
        mutex_unlock(&po->pg_vec_lock);
        return err;
}
1070
1071 static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
1072                                                size_t reserve, size_t len,
1073                                                size_t linear, int noblock,
1074                                                int *err)
1075 {
1076         struct sk_buff *skb;
1077
1078         /* Under a page?  Don't bother with paged skb. */
1079         if (prepad + len < PAGE_SIZE || !linear)
1080                 linear = len;
1081
1082         skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
1083                                    err);
1084         if (!skb)
1085                 return NULL;
1086
1087         skb_reserve(skb, reserve);
1088         skb_put(skb, linear);
1089         skb->data_len = len - linear;
1090         skb->len += len - linear;
1091
1092         return skb;
1093 }
1094
/*
 * Transmit path for sockets without a mapped TX ring: build one skb
 * from the user's iovec and queue it on the device.  With
 * PACKET_VNET_HDR enabled, a struct virtio_net_hdr prefixes the user
 * data and configures checksum offload / GSO on the skb.
 */
static int packet_snd(struct socket *sock,
                          struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto;
        unsigned char *addr;
        int ifindex, err, reserve = 0;
        struct virtio_net_hdr vnet_hdr = { 0 };
        int offset = 0;
        int vnet_hdr_len;
        struct packet_sock *po = pkt_sk(sk);
        unsigned short gso_type = 0;

        /*
         *      Get and verify the address.
         */

        if (saddr == NULL) {
                /* Unaddressed send: fall back to the socket's binding. */
                ifindex = po->ifindex;
                proto   = po->num;
                addr    = NULL;
        } else {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
                        goto out;
                ifindex = saddr->sll_ifindex;
                proto   = saddr->sll_protocol;
                addr    = saddr->sll_addr;
        }


        dev = dev_get_by_index(sock_net(sk), ifindex);
        err = -ENXIO;
        if (dev == NULL)
                goto out_unlock;
        /* SOCK_RAW supplies its own link-layer header; leave room. */
        if (sock->type == SOCK_RAW)
                reserve = dev->hard_header_len;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_unlock;

        if (po->has_vnet_hdr) {
                /* Pull the virtio_net_hdr off the front of the iovec
                 * and validate it before touching the payload. */
                vnet_hdr_len = sizeof(vnet_hdr);

                err = -EINVAL;
                if (len < vnet_hdr_len)
                        goto out_unlock;

                len -= vnet_hdr_len;

                err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
                                       vnet_hdr_len);
                if (err < 0)
                        goto out_unlock;

                /* hdr_len must at least cover the checksum field. */
                if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
                    (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
                      vnet_hdr.hdr_len))
                        vnet_hdr.hdr_len = vnet_hdr.csum_start +
                                                 vnet_hdr.csum_offset + 2;

                err = -EINVAL;
                if (vnet_hdr.hdr_len > len)
                        goto out_unlock;

                if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
                        /* Map the virtio GSO type onto skb GSO flags;
                         * unknown types are rejected. */
                        switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
                        case VIRTIO_NET_HDR_GSO_TCPV4:
                                gso_type = SKB_GSO_TCPV4;
                                break;
                        case VIRTIO_NET_HDR_GSO_TCPV6:
                                gso_type = SKB_GSO_TCPV6;
                                break;
                        case VIRTIO_NET_HDR_GSO_UDP:
                                gso_type = SKB_GSO_UDP;
                                break;
                        default:
                                goto out_unlock;
                        }

                        if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
                                gso_type |= SKB_GSO_TCP_ECN;

                        if (vnet_hdr.gso_size == 0)
                                goto out_unlock;

                }
        }

        /* Non-GSO packets must fit within MTU + header reserve. */
        err = -EMSGSIZE;
        if (!gso_type && (len > dev->mtu+reserve))
                goto out_unlock;

        err = -ENOBUFS;
        skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
                               LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
                               msg->msg_flags & MSG_DONTWAIT, &err);
        if (skb == NULL)
                goto out_unlock;

        skb_set_network_header(skb, reserve);

        err = -EINVAL;
        if (sock->type == SOCK_DGRAM &&
            (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
                goto out_free;

        /* Returns -EFAULT on error */
        err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
        if (err)
                goto out_free;
        err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
        if (err < 0)
                goto out_free;

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        if (po->has_vnet_hdr) {
                if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
                        if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
                                                  vnet_hdr.csum_offset)) {
                                err = -EINVAL;
                                goto out_free;
                        }
                }

                skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
                skb_shinfo(skb)->gso_type = gso_type;

                /* Header must be checked, and gso_segs computed. */
                skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
                skb_shinfo(skb)->gso_segs = 0;

                /* Count the vnet header in the bytes-sent return value. */
                len += vnet_hdr_len;
        }

        /*
         *      Now send it
         */

        err = dev_queue_xmit(skb);
        if (err > 0 && (err = net_xmit_errno(err)) != 0)
                goto out_unlock;

        dev_put(dev);

        return len;

out_free:
        kfree_skb(skb);
out_unlock:
        if (dev)
                dev_put(dev);
out:
        return err;
}
1260
1261 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1262                 struct msghdr *msg, size_t len)
1263 {
1264         struct sock *sk = sock->sk;
1265         struct packet_sock *po = pkt_sk(sk);
1266         if (po->tx_ring.pg_vec)
1267                 return tpacket_snd(po, msg);
1268         else
1269                 return packet_snd(sock, msg, len);
1270 }
1271
1272 /*
1273  *      Close a PACKET socket. This is fairly simple. We immediately go
1274  *      to 'closed' state and remove our protocol entry in the device list.
1275  */
1276
static int packet_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po;
        struct net *net;
        struct tpacket_req req;

        if (!sk)
                return 0;

        net = sock_net(sk);
        po = pkt_sk(sk);

        /* Unhash from the per-netns packet socket list. */
        spin_lock_bh(&net->packet.sklist_lock);
        sk_del_node_init_rcu(sk);
        sock_prot_inuse_add(net, sk->sk_prot, -1);
        spin_unlock_bh(&net->packet.sklist_lock);

        spin_lock(&po->bind_lock);
        if (po->running) {
                /*
                 * Remove from protocol table
                 */
                po->running = 0;
                po->num = 0;
                __dev_remove_pack(&po->prot_hook);
                __sock_put(sk);
        }
        spin_unlock(&po->bind_lock);

        packet_flush_mclist(sk);

        /* A zeroed tpacket_req tears down a mapped ring. */
        memset(&req, 0, sizeof(req));

        if (po->rx_ring.pg_vec)
                packet_set_ring(sk, &req, 1, 0);

        if (po->tx_ring.pg_vec)
                packet_set_ring(sk, &req, 1, 1);

        /* Wait out concurrent readers before declaring the socket dead. */
        synchronize_net();
        /*
         *      Now the socket is dead. No more input will appear.
         */
        sock_orphan(sk);
        sock->sk = NULL;

        /* Purge queues */

        skb_queue_purge(&sk->sk_receive_queue);
        sk_refcnt_debug_release(sk);

        sock_put(sk);
        return 0;
}
1332
1333 /*
1334  *      Attach a packet hook.
1335  */
1336
static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
        struct packet_sock *po = pkt_sk(sk);
        /*
         *      Detach an existing hook if present.
         */

        lock_sock(sk);

        spin_lock(&po->bind_lock);
        if (po->running) {
                __sock_put(sk);
                po->running = 0;
                po->num = 0;
                /* NOTE(review): bind_lock is dropped around
                 * dev_remove_pack() and retaken afterwards — presumably
                 * because dev_remove_pack() can block; confirm against
                 * its definition. */
                spin_unlock(&po->bind_lock);
                dev_remove_pack(&po->prot_hook);
                spin_lock(&po->bind_lock);
        }

        po->num = protocol;
        po->prot_hook.type = protocol;
        po->prot_hook.dev = dev;

        po->ifindex = dev ? dev->ifindex : 0;

        /* Protocol 0: record the device but do not install the hook. */
        if (protocol == 0)
                goto out_unlock;

        if (!dev || (dev->flags & IFF_UP)) {
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        } else {
                /* Device is down: surface ENETDOWN to the owner. */
                sk->sk_err = ENETDOWN;
                if (!sock_flag(sk, SOCK_DEAD))
                        sk->sk_error_report(sk);
        }

out_unlock:
        spin_unlock(&po->bind_lock);
        release_sock(sk);
        return 0;
}
1380
1381 /*
1382  *      Bind a packet socket to a device
1383  */
1384
1385 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1386                             int addr_len)
1387 {
1388         struct sock *sk = sock->sk;
1389         char name[15];
1390         struct net_device *dev;
1391         int err = -ENODEV;
1392
1393         /*
1394          *      Check legality
1395          */
1396
1397         if (addr_len != sizeof(struct sockaddr))
1398                 return -EINVAL;
1399         strlcpy(name, uaddr->sa_data, sizeof(name));
1400
1401         dev = dev_get_by_name(sock_net(sk), name);
1402         if (dev) {
1403                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1404                 dev_put(dev);
1405         }
1406         return err;
1407 }
1408
1409 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1410 {
1411         struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1412         struct sock *sk = sock->sk;
1413         struct net_device *dev = NULL;
1414         int err;
1415
1416
1417         /*
1418          *      Check legality
1419          */
1420
1421         if (addr_len < sizeof(struct sockaddr_ll))
1422                 return -EINVAL;
1423         if (sll->sll_family != AF_PACKET)
1424                 return -EINVAL;
1425
1426         if (sll->sll_ifindex) {
1427                 err = -ENODEV;
1428                 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1429                 if (dev == NULL)
1430                         goto out;
1431         }
1432         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1433         if (dev)
1434                 dev_put(dev);
1435
1436 out:
1437         return err;
1438 }
1439
/* Protocol descriptor for PF_PACKET sockets; obj_size lets sk_alloc()
 * reserve room for the embedding struct packet_sock. */
static struct proto packet_proto = {
        .name     = "PACKET",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct packet_sock),
};
1445
1446 /*
1447  *      Create a packet of type SOCK_PACKET.
1448  */
1449
static int packet_create(struct net *net, struct socket *sock, int protocol,
                         int kern)
{
        struct sock *sk;
        struct packet_sock *po;
        __be16 proto = (__force __be16)protocol; /* weird, but documented */
        int err;

        /* Raw packet access is privileged. */
        if (!capable(CAP_NET_RAW))
                return -EPERM;
        if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
            sock->type != SOCK_PACKET)
                return -ESOCKTNOSUPPORT;

        sock->state = SS_UNCONNECTED;

        err = -ENOBUFS;
        sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
        if (sk == NULL)
                goto out;

        /* Legacy SOCK_PACKET sockets use their own ops table. */
        sock->ops = &packet_ops;
        if (sock->type == SOCK_PACKET)
                sock->ops = &packet_ops_spkt;

        sock_init_data(sock, sk);

        po = pkt_sk(sk);
        sk->sk_family = PF_PACKET;
        po->num = proto;

        sk->sk_destruct = packet_sock_destruct;
        sk_refcnt_debug_inc(sk);

        /*
         *      Attach a protocol block
         */

        spin_lock_init(&po->bind_lock);
        mutex_init(&po->pg_vec_lock);
        po->prot_hook.func = packet_rcv;

        /* ...and its own receive handler for SOCK_PACKET. */
        if (sock->type == SOCK_PACKET)
                po->prot_hook.func = packet_rcv_spkt;

        po->prot_hook.af_packet_priv = sk;

        /* A non-zero protocol starts capture immediately. */
        if (proto) {
                po->prot_hook.type = proto;
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        }

        /* Hash into the per-netns packet socket list. */
        spin_lock_bh(&net->packet.sklist_lock);
        sk_add_node_rcu(sk, &net->packet.sklist);
        sock_prot_inuse_add(net, &packet_proto, 1);
        spin_unlock_bh(&net->packet.sklist_lock);

        return 0;
out:
        return err;
}
1513
/*
 * Dequeue one skb from the socket error queue and deliver its payload
 * plus a PACKET_TX_TIMESTAMP control message (MSG_ERRQUEUE path of
 * recvmsg).  Returns the number of payload bytes copied, or -EAGAIN
 * when the queue is empty.
 */
static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
{
        struct sock_exterr_skb *serr;
        struct sk_buff *skb, *skb2;
        int copied, err;

        err = -EAGAIN;
        skb = skb_dequeue(&sk->sk_error_queue);
        if (skb == NULL)
                goto out;

        copied = skb->len;
        if (copied > len) {
                /* User buffer too small: deliver a truncated copy. */
                msg->msg_flags |= MSG_TRUNC;
                copied = len;
        }
        err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
        if (err)
                goto out_free_skb;

        sock_recv_timestamp(msg, sk, skb);

        serr = SKB_EXT_ERR(skb);
        put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
                 sizeof(serr->ee), &serr->ee);

        msg->msg_flags |= MSG_ERRQUEUE;
        err = copied;

        /* Reset and regenerate socket error */
        spin_lock_bh(&sk->sk_error_queue.lock);
        sk->sk_err = 0;
        if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
                /* More errors pending: re-raise sk_err and notify
                 * (unlock before calling out of the lock section). */
                sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
                spin_unlock_bh(&sk->sk_error_queue.lock);
                sk->sk_error_report(sk);
        } else
                spin_unlock_bh(&sk->sk_error_queue.lock);

out_free_skb:
        kfree_skb(skb);
out:
        return err;
}
1558
1559 /*
1560  *      Pull a packet from our receive queue and hand it to the user.
1561  *      If necessary we block.
1562  */
1563
static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
                          struct msghdr *msg, size_t len, int flags)
{
        struct sock *sk = sock->sk;
        struct sk_buff *skb;
        int copied, err;
        struct sockaddr_ll *sll;
        int vnet_hdr_len = 0;

        err = -EINVAL;
        if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
                goto out;

#if 0
        /* What error should we return now? EUNATTACH? */
        if (pkt_sk(sk)->ifindex < 0)
                return -ENODEV;
#endif

        if (flags & MSG_ERRQUEUE) {
                /* TX timestamp / error-queue reads take their own path. */
                err = packet_recv_error(sk, msg, len);
                goto out;
        }

        /*
         *      Call the generic datagram receiver. This handles all sorts
         *      of horrible races and re-entrancy so we can forget about it
         *      in the protocol layers.
         *
         *      Now it will return ENETDOWN, if device have just gone down,
         *      but then it will block.
         */

        skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

        /*
         *      An error occurred so return it. Because skb_recv_datagram()
         *      handles the blocking we don't see and worry about blocking
         *      retries.
         */

        if (skb == NULL)
                goto out;

        if (pkt_sk(sk)->has_vnet_hdr) {
                /* PACKET_VNET_HDR: prepend a virtio_net_hdr describing
                 * the skb's GSO/checksum state to the user's buffer. */
                struct virtio_net_hdr vnet_hdr = { 0 };

                err = -EINVAL;
                vnet_hdr_len = sizeof(vnet_hdr);
                if (len < vnet_hdr_len)
                        goto out_free;

                len -= vnet_hdr_len;

                if (skb_is_gso(skb)) {
                        struct skb_shared_info *sinfo = skb_shinfo(skb);

                        /* This is a hint as to how much should be linear. */
                        vnet_hdr.hdr_len = skb_headlen(skb);
                        vnet_hdr.gso_size = sinfo->gso_size;
                        if (sinfo->gso_type & SKB_GSO_TCPV4)
                                vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
                        else if (sinfo->gso_type & SKB_GSO_TCPV6)
                                vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
                        else if (sinfo->gso_type & SKB_GSO_UDP)
                                vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
                        else if (sinfo->gso_type & SKB_GSO_FCOE)
                                goto out_free;
                        else
                                BUG();
                        if (sinfo->gso_type & SKB_GSO_TCP_ECN)
                                vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
                } else
                        vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;

                if (skb->ip_summed == CHECKSUM_PARTIAL) {
                        vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
                        vnet_hdr.csum_start = skb->csum_start -
                                                        skb_headroom(skb);
                        vnet_hdr.csum_offset = skb->csum_offset;
                } /* else everything is zero */

                err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
                                     vnet_hdr_len);
                if (err < 0)
                        goto out_free;
        }

        /*
         *      If the address length field is there to be filled in, we fill
         *      it in now.
         */

        sll = &PACKET_SKB_CB(skb)->sa.ll;
        if (sock->type == SOCK_PACKET)
                msg->msg_namelen = sizeof(struct sockaddr_pkt);
        else
                msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);

        /*
         *      You lose any data beyond the buffer you gave. If it worries a
         *      user program they can ask the device for its MTU anyway.
         */

        copied = skb->len;
        if (copied > len) {
                copied = len;
                msg->msg_flags |= MSG_TRUNC;
        }

        err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
        if (err)
                goto out_free;

        sock_recv_ts_and_drops(msg, sk, skb);

        if (msg->msg_name)
                memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
                       msg->msg_namelen);

        if (pkt_sk(sk)->auxdata) {
                /* PACKET_AUXDATA: deliver per-packet metadata as a
                 * control message. */
                struct tpacket_auxdata aux;

                aux.tp_status = TP_STATUS_USER;
                if (skb->ip_summed == CHECKSUM_PARTIAL)
                        aux.tp_status |= TP_STATUS_CSUMNOTREADY;
                aux.tp_len = PACKET_SKB_CB(skb)->origlen;
                aux.tp_snaplen = skb->len;
                aux.tp_mac = 0;
                aux.tp_net = skb_network_offset(skb);
                aux.tp_vlan_tci = vlan_tx_tag_get(skb);

                put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
        }

        /*
         *      Free or return the buffer as appropriate. Again this
         *      hides all the races and re-entrancy issues from us.
         */
        err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);

out_free:
        skb_free_datagram(sk, skb);
out:
        return err;
}
1710
1711 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1712                                int *uaddr_len, int peer)
1713 {
1714         struct net_device *dev;
1715         struct sock *sk = sock->sk;
1716
1717         if (peer)
1718                 return -EOPNOTSUPP;
1719
1720         uaddr->sa_family = AF_PACKET;
1721         rcu_read_lock();
1722         dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
1723         if (dev)
1724                 strncpy(uaddr->sa_data, dev->name, 14);
1725         else
1726                 memset(uaddr->sa_data, 0, 14);
1727         rcu_read_unlock();
1728         *uaddr_len = sizeof(*uaddr);
1729
1730         return 0;
1731 }
1732
1733 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1734                           int *uaddr_len, int peer)
1735 {
1736         struct net_device *dev;
1737         struct sock *sk = sock->sk;
1738         struct packet_sock *po = pkt_sk(sk);
1739         DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1740
1741         if (peer)
1742                 return -EOPNOTSUPP;
1743
1744         sll->sll_family = AF_PACKET;
1745         sll->sll_ifindex = po->ifindex;
1746         sll->sll_protocol = po->num;
1747         sll->sll_pkttype = 0;
1748         rcu_read_lock();
1749         dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1750         if (dev) {
1751                 sll->sll_hatype = dev->type;
1752                 sll->sll_halen = dev->addr_len;
1753                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1754         } else {
1755                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1756                 sll->sll_halen = 0;
1757         }
1758         rcu_read_unlock();
1759         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1760
1761         return 0;
1762 }
1763
1764 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1765                          int what)
1766 {
1767         switch (i->type) {
1768         case PACKET_MR_MULTICAST:
1769                 if (i->alen != dev->addr_len)
1770                         return -EINVAL;
1771                 if (what > 0)
1772                         return dev_mc_add(dev, i->addr);
1773                 else
1774                         return dev_mc_del(dev, i->addr);
1775                 break;
1776         case PACKET_MR_PROMISC:
1777                 return dev_set_promiscuity(dev, what);
1778                 break;
1779         case PACKET_MR_ALLMULTI:
1780                 return dev_set_allmulti(dev, what);
1781                 break;
1782         case PACKET_MR_UNICAST:
1783                 if (i->alen != dev->addr_len)
1784                         return -EINVAL;
1785                 if (what > 0)
1786                         return dev_uc_add(dev, i->addr);
1787                 else
1788                         return dev_uc_del(dev, i->addr);
1789                 break;
1790         default:
1791                 break;
1792         }
1793         return 0;
1794 }
1795
1796 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1797 {
1798         for ( ; i; i = i->next) {
1799                 if (i->ifindex == dev->ifindex)
1800                         packet_dev_mc(dev, i, what);
1801         }
1802 }
1803
/* PACKET_ADD_MEMBERSHIP: add (or refcount) a multicast/promisc/allmulti
 * membership for this socket.  The entry is speculatively allocated
 * before the duplicate scan so the scan itself cannot fail; a matching
 * existing entry just gets its count bumped.  All list manipulation is
 * done under RTNL.
 */
static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
        struct packet_sock *po = pkt_sk(sk);
        struct packet_mclist *ml, *i;
        struct net_device *dev;
        int err;

        rtnl_lock();

        err = -ENODEV;
        dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
        if (!dev)
                goto done;

        err = -EINVAL;
        if (mreq->mr_alen > dev->addr_len)
                goto done;

        err = -ENOBUFS;
        i = kmalloc(sizeof(*i), GFP_KERNEL);
        if (i == NULL)
                goto done;

        err = 0;
        /* Duplicate entry? Then only bump the refcount. */
        for (ml = po->mclist; ml; ml = ml->next) {
                if (ml->ifindex == mreq->mr_ifindex &&
                    ml->type == mreq->mr_type &&
                    ml->alen == mreq->mr_alen &&
                    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
                        ml->count++;
                        /* Free the new element ... */
                        kfree(i);
                        goto done;
                }
        }

        /* New entry: link it in first, then apply to the device;
         * unlink and free again if the device rejects it. */
        i->type = mreq->mr_type;
        i->ifindex = mreq->mr_ifindex;
        i->alen = mreq->mr_alen;
        memcpy(i->addr, mreq->mr_address, i->alen);
        i->count = 1;
        i->next = po->mclist;
        po->mclist = i;
        err = packet_dev_mc(dev, i, 1);
        if (err) {
                po->mclist = i->next;
                kfree(i);
        }

done:
        rtnl_unlock();
        return err;
}
1857
/* PACKET_DROP_MEMBERSHIP: drop one reference on a matching membership
 * entry; when the count hits zero the entry is unlinked, the device
 * reference (if the device still exists) is released, and the entry is
 * freed.  Runs under RTNL; returns -EADDRNOTAVAIL if no entry matches.
 */
static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
        struct packet_mclist *ml, **mlp;

        rtnl_lock();

        /* Walk with a pointer-to-link so unlinking needs no prev pointer. */
        for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
                if (ml->ifindex == mreq->mr_ifindex &&
                    ml->type == mreq->mr_type &&
                    ml->alen == mreq->mr_alen &&
                    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
                        if (--ml->count == 0) {
                                struct net_device *dev;
                                *mlp = ml->next;
                                dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
                                if (dev)
                                        packet_dev_mc(dev, ml, -1);
                                kfree(ml);
                        }
                        rtnl_unlock();
                        return 0;
                }
        }
        rtnl_unlock();
        return -EADDRNOTAVAIL;
}
1884
1885 static void packet_flush_mclist(struct sock *sk)
1886 {
1887         struct packet_sock *po = pkt_sk(sk);
1888         struct packet_mclist *ml;
1889
1890         if (!po->mclist)
1891                 return;
1892
1893         rtnl_lock();
1894         while ((ml = po->mclist) != NULL) {
1895                 struct net_device *dev;
1896
1897                 po->mclist = ml->next;
1898                 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1899                 if (dev != NULL)
1900                         packet_dev_mc(dev, ml, -1);
1901                 kfree(ml);
1902         }
1903         rtnl_unlock();
1904 }
1905
/* setsockopt() handler for SOL_PACKET.  Most options copy a plain int
 * from userspace; ring-related options (VERSION, RESERVE, LOSS,
 * VNET_HDR) are rejected with -EBUSY once a ring is mapped, because
 * they change the ring frame layout.
 */
static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        int ret;

        if (level != SOL_PACKET)
                return -ENOPROTOOPT;

        switch (optname) {
        case PACKET_ADD_MEMBERSHIP:
        case PACKET_DROP_MEMBERSHIP:
        {
                struct packet_mreq_max mreq;
                int len = optlen;
                /* Accept the short legacy packet_mreq as well as the
                 * larger packet_mreq_max; unread tail stays zeroed. */
                memset(&mreq, 0, sizeof(mreq));
                if (len < sizeof(struct packet_mreq))
                        return -EINVAL;
                if (len > sizeof(mreq))
                        len = sizeof(mreq);
                if (copy_from_user(&mreq, optval, len))
                        return -EFAULT;
                /* The claimed address length must fit in what was copied. */
                if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
                        return -EINVAL;
                if (optname == PACKET_ADD_MEMBERSHIP)
                        ret = packet_mc_add(sk, &mreq);
                else
                        ret = packet_mc_drop(sk, &mreq);
                return ret;
        }

        case PACKET_RX_RING:
        case PACKET_TX_RING:
        {
                struct tpacket_req req;

                if (optlen < sizeof(req))
                        return -EINVAL;
                /* Rings and the virtio-net header are mutually exclusive. */
                if (pkt_sk(sk)->has_vnet_hdr)
                        return -EINVAL;
                if (copy_from_user(&req, optval, sizeof(req)))
                        return -EFAULT;
                return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
        }
        case PACKET_COPY_THRESH:
        {
                int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;

                pkt_sk(sk)->copy_thresh = val;
                return 0;
        }
        case PACKET_VERSION:
        {
                int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                /* Frame header layout can't change under a live ring. */
                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
                        return -EBUSY;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;
                switch (val) {
                case TPACKET_V1:
                case TPACKET_V2:
                        po->tp_version = val;
                        return 0;
                default:
                        return -EINVAL;
                }
        }
        case PACKET_RESERVE:
        {
                unsigned int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
                        return -EBUSY;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;
                po->tp_reserve = val;
                return 0;
        }
        case PACKET_LOSS:
        {
                unsigned int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
                        return -EBUSY;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;
                po->tp_loss = !!val;
                return 0;
        }
        case PACKET_AUXDATA:
        {
                int val;

                if (optlen < sizeof(val))
                        return -EINVAL;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;

                po->auxdata = !!val;
                return 0;
        }
        case PACKET_ORIGDEV:
        {
                int val;

                if (optlen < sizeof(val))
                        return -EINVAL;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;

                po->origdev = !!val;
                return 0;
        }
        case PACKET_VNET_HDR:
        {
                int val;

                /* Only meaningful for SOCK_RAW, and incompatible with rings. */
                if (sock->type != SOCK_RAW)
                        return -EINVAL;
                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
                        return -EBUSY;
                if (optlen < sizeof(val))
                        return -EINVAL;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;

                po->has_vnet_hdr = !!val;
                return 0;
        }
        case PACKET_TIMESTAMP:
        {
                int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;

                po->tp_tstamp = val;
                return 0;
        }
        default:
                return -ENOPROTOOPT;
        }
}
2064
2065 static int packet_getsockopt(struct socket *sock, int level, int optname,
2066                              char __user *optval, int __user *optlen)
2067 {
2068         int len;
2069         int val;
2070         struct sock *sk = sock->sk;
2071         struct packet_sock *po = pkt_sk(sk);
2072         void *data;
2073         struct tpacket_stats st;
2074
2075         if (level != SOL_PACKET)
2076                 return -ENOPROTOOPT;
2077
2078         if (get_user(len, optlen))
2079                 return -EFAULT;
2080
2081         if (len < 0)
2082                 return -EINVAL;
2083
2084         switch (optname) {
2085         case PACKET_STATISTICS:
2086                 if (len > sizeof(struct tpacket_stats))
2087                         len = sizeof(struct tpacket_stats);
2088                 spin_lock_bh(&sk->sk_receive_queue.lock);
2089                 st = po->stats;
2090                 memset(&po->stats, 0, sizeof(st));
2091                 spin_unlock_bh(&sk->sk_receive_queue.lock);
2092                 st.tp_packets += st.tp_drops;
2093
2094                 data = &st;
2095                 break;
2096         case PACKET_AUXDATA:
2097                 if (len > sizeof(int))
2098                         len = sizeof(int);
2099                 val = po->auxdata;
2100
2101                 data = &val;
2102                 break;
2103         case PACKET_ORIGDEV:
2104                 if (len > sizeof(int))
2105                         len = sizeof(int);
2106                 val = po->origdev;
2107
2108                 data = &val;
2109                 break;
2110         case PACKET_VNET_HDR:
2111                 if (len > sizeof(int))
2112                         len = sizeof(int);
2113                 val = po->has_vnet_hdr;
2114
2115                 data = &val;
2116                 break;
2117         case PACKET_VERSION:
2118                 if (len > sizeof(int))
2119                         len = sizeof(int);
2120                 val = po->tp_version;
2121                 data = &val;
2122                 break;
2123         case PACKET_HDRLEN:
2124                 if (len > sizeof(int))
2125                         len = sizeof(int);
2126                 if (copy_from_user(&val, optval, len))
2127                         return -EFAULT;
2128                 switch (val) {
2129                 case TPACKET_V1:
2130                         val = sizeof(struct tpacket_hdr);
2131                         break;
2132                 case TPACKET_V2:
2133                         val = sizeof(struct tpacket2_hdr);
2134                         break;
2135                 default:
2136                         return -EINVAL;
2137                 }
2138                 data = &val;
2139                 break;
2140         case PACKET_RESERVE:
2141                 if (len > sizeof(unsigned int))
2142                         len = sizeof(unsigned int);
2143                 val = po->tp_reserve;
2144                 data = &val;
2145                 break;
2146         case PACKET_LOSS:
2147                 if (len > sizeof(unsigned int))
2148                         len = sizeof(unsigned int);
2149                 val = po->tp_loss;
2150                 data = &val;
2151                 break;
2152         case PACKET_TIMESTAMP:
2153                 if (len > sizeof(int))
2154                         len = sizeof(int);
2155                 val = po->tp_tstamp;
2156                 data = &val;
2157                 break;
2158         default:
2159                 return -ENOPROTOOPT;
2160         }
2161
2162         if (put_user(len, optlen))
2163                 return -EFAULT;
2164         if (copy_to_user(optval, data, len))
2165                 return -EFAULT;
2166         return 0;
2167 }
2168
2169
/* Netdevice notifier: keeps packet sockets consistent when devices go
 * up, down, or unregister.  Walks all packet sockets in the device's
 * netns under RCU; per-socket state transitions happen under bind_lock.
 */
static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
{
        struct sock *sk;
        struct hlist_node *node;
        struct net_device *dev = data;
        struct net *net = dev_net(dev);

        rcu_read_lock();
        sk_for_each_rcu(sk, node, &net->packet.sklist) {
                struct packet_sock *po = pkt_sk(sk);

                switch (msg) {
                case NETDEV_UNREGISTER:
                        /* Drop all memberships tied to the dying device. */
                        if (po->mclist)
                                packet_dev_mclist(dev, po->mclist, -1);
                        /* fallthrough */

                case NETDEV_DOWN:
                        if (dev->ifindex == po->ifindex) {
                                spin_lock(&po->bind_lock);
                                if (po->running) {
                                        /* Detach the protocol hook and drop
                                         * the ref it held on the socket. */
                                        __dev_remove_pack(&po->prot_hook);
                                        __sock_put(sk);
                                        po->running = 0;
                                        sk->sk_err = ENETDOWN;
                                        if (!sock_flag(sk, SOCK_DEAD))
                                                sk->sk_error_report(sk);
                                }
                                if (msg == NETDEV_UNREGISTER) {
                                        /* Device is gone for good: unbind. */
                                        po->ifindex = -1;
                                        po->prot_hook.dev = NULL;
                                }
                                spin_unlock(&po->bind_lock);
                        }
                        break;
                case NETDEV_UP:
                        /* Re-attach sockets that were bound to this ifindex. */
                        if (dev->ifindex == po->ifindex) {
                                spin_lock(&po->bind_lock);
                                if (po->num && !po->running) {
                                        dev_add_pack(&po->prot_hook);
                                        sock_hold(sk);
                                        po->running = 1;
                                }
                                spin_unlock(&po->bind_lock);
                        }
                        break;
                }
        }
        rcu_read_unlock();
        return NOTIFY_DONE;
}
2221
2222
/* ioctl() handler: queue-size queries, timestamp retrieval, and (with
 * CONFIG_INET) pass-through of common interface/routing ioctls to the
 * inet_dgram_ops implementation.
 */
static int packet_ioctl(struct socket *sock, unsigned int cmd,
                        unsigned long arg)
{
        struct sock *sk = sock->sk;

        switch (cmd) {
        case SIOCOUTQ:
        {
                /* Bytes queued for transmit. */
                int amount = sk_wmem_alloc_get(sk);

                return put_user(amount, (int __user *)arg);
        }
        case SIOCINQ:
        {
                struct sk_buff *skb;
                int amount = 0;

                /* Size of the next pending packet, not total queue bytes. */
                spin_lock_bh(&sk->sk_receive_queue.lock);
                skb = skb_peek(&sk->sk_receive_queue);
                if (skb)
                        amount = skb->len;
                spin_unlock_bh(&sk->sk_receive_queue.lock);
                return put_user(amount, (int __user *)arg);
        }
        case SIOCGSTAMP:
                return sock_get_timestamp(sk, (struct timeval __user *)arg);
        case SIOCGSTAMPNS:
                return sock_get_timestampns(sk, (struct timespec __user *)arg);

#ifdef CONFIG_INET
        case SIOCADDRT:
        case SIOCDELRT:
        case SIOCDARP:
        case SIOCGARP:
        case SIOCSARP:
        case SIOCGIFADDR:
        case SIOCSIFADDR:
        case SIOCGIFBRDADDR:
        case SIOCSIFBRDADDR:
        case SIOCGIFNETMASK:
        case SIOCSIFNETMASK:
        case SIOCGIFDSTADDR:
        case SIOCSIFDSTADDR:
        case SIOCSIFFLAGS:
                /* Delegate to the inet datagram implementation. */
                return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

        default:
                return -ENOIOCTLCMD;
        }
        return 0;
}
2275
/* poll() handler: starts from the generic datagram mask, then adds
 * readability when the RX ring has a frame for userspace and
 * writability when the TX ring has a free slot.  Each ring is checked
 * under its queue's lock.
 */
static unsigned int packet_poll(struct file *file, struct socket *sock,
                                poll_table *wait)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        unsigned int mask = datagram_poll(file, sock, wait);

        spin_lock_bh(&sk->sk_receive_queue.lock);
        if (po->rx_ring.pg_vec) {
                /* Previous frame no longer kernel-owned => data pending. */
                if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
                        mask |= POLLIN | POLLRDNORM;
        }
        spin_unlock_bh(&sk->sk_receive_queue.lock);
        spin_lock_bh(&sk->sk_write_queue.lock);
        if (po->tx_ring.pg_vec) {
                if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
                        mask |= POLLOUT | POLLWRNORM;
        }
        spin_unlock_bh(&sk->sk_write_queue.lock);
        return mask;
}
2297
2298
2299 /* Dirty? Well, I still did not learn better way to account
2300  * for user mmaps.
2301  */
2302
2303 static void packet_mm_open(struct vm_area_struct *vma)
2304 {
2305         struct file *file = vma->vm_file;
2306         struct socket *sock = file->private_data;
2307         struct sock *sk = sock->sk;
2308
2309         if (sk)
2310                 atomic_inc(&pkt_sk(sk)->mapped);
2311 }
2312
2313 static void packet_mm_close(struct vm_area_struct *vma)
2314 {
2315         struct file *file = vma->vm_file;
2316         struct socket *sock = file->private_data;
2317         struct sock *sk = sock->sk;
2318
2319         if (sk)
2320                 atomic_dec(&pkt_sk(sk)->mapped);
2321 }
2322
/* vm_operations for ring mappings: only open/close accounting is needed;
 * pages are inserted eagerly in packet_mmap(), so no fault handler. */
static const struct vm_operations_struct packet_mmap_ops = {
        .open   =       packet_mm_open,
        .close  =       packet_mm_close,
};
2327
/* Free a page vector: release each non-NULL block (allocated at @order)
 * and then the vector itself.  NULL slots are possible when a partially
 * built vector is torn down on allocation failure.
 *
 * Fix vs the original: the loop counter was a signed int compared
 * against the unsigned @len; use unsigned to avoid the mixed-sign
 * comparison.
 */
static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
        unsigned int i;

        for (i = 0; i < len; i++) {
                if (likely(pg_vec[i]))
                        free_pages((unsigned long) pg_vec[i], order);
        }
        kfree(pg_vec);
}
2338
2339 static inline char *alloc_one_pg_vec_page(unsigned long order)
2340 {
2341         gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2342
2343         return (char *) __get_free_pages(gfp_flags, order);
2344 }
2345
2346 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2347 {
2348         unsigned int block_nr = req->tp_block_nr;
2349         char **pg_vec;
2350         int i;
2351
2352         pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
2353         if (unlikely(!pg_vec))
2354                 goto out;
2355
2356         for (i = 0; i < block_nr; i++) {
2357                 pg_vec[i] = alloc_one_pg_vec_page(order);
2358                 if (unlikely(!pg_vec[i]))
2359                         goto out_free_pgvec;
2360         }
2361
2362 out:
2363         return pg_vec;
2364
2365 out_free_pgvec:
2366         free_pg_vec(pg_vec, order, block_nr);
2367         pg_vec = NULL;
2368         goto out;
2369 }
2370
/* Install or tear down an RX/TX ring.  @closing means the socket is
 * being released; req->tp_block_nr == 0 requests teardown.  The new
 * page vector is built before the socket is quiesced; then, with the
 * protocol hook detached and under pg_vec_lock, the old and new vectors
 * are swapped and the old one freed at the end.
 */
static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
                int closing, int tx_ring)
{
        char **pg_vec = NULL;
        struct packet_sock *po = pkt_sk(sk);
        int was_running, order = 0;
        struct packet_ring_buffer *rb;
        struct sk_buff_head *rb_queue;
        __be16 num;
        int err;

        rb = tx_ring ? &po->tx_ring : &po->rx_ring;
        rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

        err = -EBUSY;
        if (!closing) {
                /* Can't resize while mmapped or with TX frames in flight. */
                if (atomic_read(&po->mapped))
                        goto out;
                if (atomic_read(&rb->pending))
                        goto out;
        }

        if (req->tp_block_nr) {
                /* Sanity tests and some calculations */
                err = -EBUSY;
                if (unlikely(rb->pg_vec))
                        goto out;

                switch (po->tp_version) {
                case TPACKET_V1:
                        po->tp_hdrlen = TPACKET_HDRLEN;
                        break;
                case TPACKET_V2:
                        po->tp_hdrlen = TPACKET2_HDRLEN;
                        break;
                }

                /* Block size must be a positive multiple of PAGE_SIZE;
                 * frames must hold at least the header + reserve and be
                 * TPACKET_ALIGNMENT-aligned; frame/block counts must agree. */
                err = -EINVAL;
                if (unlikely((int)req->tp_block_size <= 0))
                        goto out;
                if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
                        goto out;
                if (unlikely(req->tp_frame_size < po->tp_hdrlen +
                                        po->tp_reserve))
                        goto out;
                if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
                        goto out;

                rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
                if (unlikely(rb->frames_per_block <= 0))
                        goto out;
                if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
                                        req->tp_frame_nr))
                        goto out;

                err = -ENOMEM;
                order = get_order(req->tp_block_size);
                pg_vec = alloc_pg_vec(req, order);
                if (unlikely(!pg_vec))
                        goto out;
        }
        /* Done */
        else {
                /* Teardown request must not claim any frames. */
                err = -EINVAL;
                if (unlikely(req->tp_frame_nr))
                        goto out;
        }

        lock_sock(sk);

        /* Detach socket from network */
        spin_lock(&po->bind_lock);
        was_running = po->running;
        num = po->num;
        if (was_running) {
                __dev_remove_pack(&po->prot_hook);
                po->num = 0;
                po->running = 0;
                __sock_put(sk);
        }
        spin_unlock(&po->bind_lock);

        /* Wait out any receive path still using the old ring. */
        synchronize_net();

        err = -EBUSY;
        mutex_lock(&po->pg_vec_lock);
        if (closing || atomic_read(&po->mapped) == 0) {
                err = 0;
/* XC: exchange - returns the old value of (a) while storing (b). */
#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
                spin_lock_bh(&rb_queue->lock);
                pg_vec = XC(rb->pg_vec, pg_vec);
                rb->frame_max = (req->tp_frame_nr - 1);
                rb->head = 0;
                rb->frame_size = req->tp_frame_size;
                spin_unlock_bh(&rb_queue->lock);

                /* Old order/block_nr are needed below to free the old vector. */
                order = XC(rb->pg_vec_order, order);
                req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);

                rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
                /* Ring present => use the mmap-aware receive function. */
                po->prot_hook.func = (po->rx_ring.pg_vec) ?
                                                tpacket_rcv : packet_rcv;
                skb_queue_purge(rb_queue);
#undef XC
                if (atomic_read(&po->mapped))
                        pr_err("packet_mmap: vma is busy: %d\n",
                               atomic_read(&po->mapped));
        }
        mutex_unlock(&po->pg_vec_lock);

        /* Re-attach the protocol hook if we detached it above. */
        spin_lock(&po->bind_lock);
        if (was_running && !po->running) {
                sock_hold(sk);
                po->running = 1;
                po->num = num;
                dev_add_pack(&po->prot_hook);
        }
        spin_unlock(&po->bind_lock);

        release_sock(sk);

        /* Free whichever vector ended up unused (old one after a swap,
         * or the new one if the swap was refused). */
        if (pg_vec)
                free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
        return err;
}
2497
/* mmap() handler: maps the RX ring followed by the TX ring into one
 * contiguous VMA.  The VMA size must exactly match the combined ring
 * size and the offset must be zero; pages are inserted eagerly with
 * vm_insert_page(), so no fault handler is needed.
 */
static int packet_mmap(struct file *file, struct socket *sock,
                struct vm_area_struct *vma)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        unsigned long size, expected_size;
        struct packet_ring_buffer *rb;
        unsigned long start;
        int err = -EINVAL;
        int i;

        if (vma->vm_pgoff)
                return -EINVAL;

        mutex_lock(&po->pg_vec_lock);

        /* Total size of all configured rings (rx_ring then tx_ring;
         * relies on them being adjacent members of packet_sock). */
        expected_size = 0;
        for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
                if (rb->pg_vec) {
                        expected_size += rb->pg_vec_len
                                                * rb->pg_vec_pages
                                                * PAGE_SIZE;
                }
        }

        if (expected_size == 0)
                goto out;

        size = vma->vm_end - vma->vm_start;
        if (size != expected_size)
                goto out;

        start = vma->vm_start;
        for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
                if (rb->pg_vec == NULL)
                        continue;

                for (i = 0; i < rb->pg_vec_len; i++) {
                        struct page *page = virt_to_page(rb->pg_vec[i]);
                        int pg_num;

                        /* Each block is pg_vec_pages contiguous pages. */
                        for (pg_num = 0; pg_num < rb->pg_vec_pages;
                                        pg_num++, page++) {
                                err = vm_insert_page(vma, start, page);
                                if (unlikely(err))
                                        goto out;
                                start += PAGE_SIZE;
                        }
                }
        }

        /* Pin the ring against resize while mapped. */
        atomic_inc(&po->mapped);
        vma->vm_ops = &packet_mmap_ops;
        err = 0;

out:
        mutex_unlock(&po->pg_vec_lock);
        return err;
}
2557
/* proto_ops for legacy SOCK_PACKET sockets: no mmap rings, no
 * socket options, generic datagram poll. */
static const struct proto_ops packet_ops_spkt = {
        .family =       PF_PACKET,
        .owner =        THIS_MODULE,
        .release =      packet_release,
        .bind =         packet_bind_spkt,
        .connect =      sock_no_connect,
        .socketpair =   sock_no_socketpair,
        .accept =       sock_no_accept,
        .getname =      packet_getname_spkt,
        .poll =         datagram_poll,
        .ioctl =        packet_ioctl,
        .listen =       sock_no_listen,
        .shutdown =     sock_no_shutdown,
        .setsockopt =   sock_no_setsockopt,
        .getsockopt =   sock_no_getsockopt,
        .sendmsg =      packet_sendmsg_spkt,
        .recvmsg =      packet_recvmsg,
        .mmap =         sock_no_mmap,
        .sendpage =     sock_no_sendpage,
};
2578
/* proto_ops for SOCK_RAW/SOCK_DGRAM packet sockets: full feature set
 * including socket options, ring-aware poll, and mmap'ed rings. */
static const struct proto_ops packet_ops = {
        .family =       PF_PACKET,
        .owner =        THIS_MODULE,
        .release =      packet_release,
        .bind =         packet_bind,
        .connect =      sock_no_connect,
        .socketpair =   sock_no_socketpair,
        .accept =       sock_no_accept,
        .getname =      packet_getname,
        .poll =         packet_poll,
        .ioctl =        packet_ioctl,
        .listen =       sock_no_listen,
        .shutdown =     sock_no_shutdown,
        .setsockopt =   packet_setsockopt,
        .getsockopt =   packet_getsockopt,
        .sendmsg =      packet_sendmsg,
        .recvmsg =      packet_recvmsg,
        .mmap =         packet_mmap,
        .sendpage =     sock_no_sendpage,
};
2599
/* Registration entry for the PF_PACKET address family. */
static const struct net_proto_family packet_family_ops = {
        .family =       PF_PACKET,
        .create =       packet_create,
        .owner  =       THIS_MODULE,
};
2605
/* Netdevice event notifier (see packet_notifier above). */
static struct notifier_block packet_netdev_notifier = {
        .notifier_call =        packet_notifier,
};
2609
2610 #ifdef CONFIG_PROC_FS
2611
/* seq_file start: take the RCU read lock (released in packet_seq_stop)
 * and position at *pos in this netns's packet socket list. */
static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(RCU)
{
        struct net *net = seq_file_net(seq);

        rcu_read_lock();
        return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}
2620
2621 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2622 {
2623         struct net *net = seq_file_net(seq);
2624         return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
2625 }
2626
/* seq_file stop: release the RCU lock taken in packet_seq_start. */
static void packet_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        rcu_read_unlock();
}
2632
/* seq_file show: one header line, then one line per packet socket with
 * its refcount, type, protocol, interface, state and accounting. */
static int packet_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
        else {
                struct sock *s = sk_entry(v);
                const struct packet_sock *po = pkt_sk(s);

                seq_printf(seq,
                           "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
                           s,
                           atomic_read(&s->sk_refcnt),
                           s->sk_type,
                           ntohs(po->num),
                           po->ifindex,
                           po->running,
                           atomic_read(&s->sk_rmem_alloc),
                           sock_i_uid(s),
                           sock_i_ino(s));
        }

        return 0;
}
2656
/* seq_file iterator over this netns's PF_PACKET socket list (RCU). */
static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};
2663
/* open() for /proc/net/packet: per-netns seq_file whose private data
 * (struct seq_net_private) lets seq_file_net() recover the netns.
 */
static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));
}
2669
/* File operations backing /proc/net/packet; reads go through seq_file. */
static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
2677
2678 #endif
2679
2680 static int __net_init packet_net_init(struct net *net)
2681 {
2682         spin_lock_init(&net->packet.sklist_lock);
2683         INIT_HLIST_HEAD(&net->packet.sklist);
2684
2685         if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2686                 return -ENOMEM;
2687
2688         return 0;
2689 }
2690
/* Per-netns teardown: remove the /proc/net/packet entry. */
static void __net_exit packet_net_exit(struct net *net)
{
	proc_net_remove(net, "packet");
}
2695
/* Hooks run for every network namespace created/destroyed. */
static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};
2700
2701
/* Module unload: unwind packet_init() in reverse registration order. */
static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}
2709
2710 static int __init packet_init(void)
2711 {
2712         int rc = proto_register(&packet_proto, 0);
2713
2714         if (rc != 0)
2715                 goto out;
2716
2717         sock_register(&packet_family_ops);
2718         register_pernet_subsys(&packet_net_ops);
2719         register_netdevice_notifier(&packet_netdev_notifier);
2720 out:
2721         return rc;
2722 }
2723
/* Module glue: entry/exit points, license, and the PF_PACKET alias so
 * the module is auto-loaded on socket(PF_PACKET, ...).
 */
module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);