net/packet/af_packet.c
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PACKET - implements raw packet sockets.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
12 * Fixes:
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
35 * Ulises Alonso : Frame number limit removal and
36 * packet_set_ring memory leak.
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
40 * byte arrays at the end of sockaddr_ll
41 * and packet_mreq.
42 * Johann Baudy : Added TX RING.
43 *
44 * This program is free software; you can redistribute it and/or
45 * modify it under the terms of the GNU General Public License
46 * as published by the Free Software Foundation; either version
47 * 2 of the License, or (at your option) any later version.
48 *
49 */
50
51#include <linux/types.h>
52#include <linux/mm.h>
53#include <linux/capability.h>
54#include <linux/fcntl.h>
55#include <linux/socket.h>
56#include <linux/in.h>
57#include <linux/inet.h>
58#include <linux/netdevice.h>
59#include <linux/if_packet.h>
60#include <linux/wireless.h>
61#include <linux/kernel.h>
62#include <linux/kmod.h>
63#include <linux/slab.h>
64#include <linux/vmalloc.h>
65#include <net/net_namespace.h>
66#include <net/ip.h>
67#include <net/protocol.h>
68#include <linux/skbuff.h>
69#include <net/sock.h>
70#include <linux/errno.h>
71#include <linux/timer.h>
72#include <asm/system.h>
73#include <asm/uaccess.h>
74#include <asm/ioctls.h>
75#include <asm/page.h>
76#include <asm/cacheflush.h>
77#include <asm/io.h>
78#include <linux/proc_fs.h>
79#include <linux/seq_file.h>
80#include <linux/poll.h>
81#include <linux/module.h>
82#include <linux/init.h>
83#include <linux/mutex.h>
84#include <linux/if_vlan.h>
85#include <linux/virtio_net.h>
86#include <linux/errqueue.h>
87#include <linux/net_tstamp.h>
88
89#ifdef CONFIG_INET
90#include <net/inet_common.h>
91#endif
92
93/*
94 Assumptions:
 95 - if the device has no dev->hard_header routine, it adds and removes the ll header
 96 itself. In this case the ll header is invisible outside the device,
 97 but higher levels should still reserve dev->hard_header_len.
 98 Some devices are clever enough to reallocate the skb when the header
 99 will not fit into the reserved space (tunnels); others are not
 100 (PPP).
 101 - a packet socket receives packets with the ll header pulled off,
 102 so SOCK_RAW must push it back.
103
104On receive:
105-----------
106
107Incoming, dev->hard_header!=NULL
108 mac_header -> ll header
109 data -> data
110
111Outgoing, dev->hard_header!=NULL
112 mac_header -> ll header
113 data -> ll header
114
115Incoming, dev->hard_header==NULL
 116 mac_header -> UNKNOWN position. It very likely points to the ll
 117 header. PPP does this, which is wrong because it introduces
 118 asymmetry between the rx and tx paths.
119 data -> data
120
121Outgoing, dev->hard_header==NULL
122 mac_header -> data. ll header is still not built!
123 data -> data
124
 125 Summary
 126 If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
127
128
129On transmit:
130------------
131
132dev->hard_header != NULL
133 mac_header -> ll header
134 data -> ll header
135
136dev->hard_header == NULL (ll header is added by device, we cannot control it)
137 mac_header -> data
138 data -> data
139
 140 We should set nh.raw on output to the correct position;
 141 the packet classifier depends on it.
142 */
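/*
 * For illustration, a minimal userspace sketch of the two flavours
 * described above. With SOCK_RAW the caller sees (and on transmit must
 * build) the link-level header; with SOCK_DGRAM the kernel strips it on
 * receive and builds it on transmit from the sockaddr_ll address.
 * Constants come from <linux/if_packet.h> and <linux/if_ether.h>;
 * both calls require CAP_NET_RAW.
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *	#include <arpa/inet.h>
 *
 *	// Raw socket: frames are delivered with the ll header in place.
 *	int raw_fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *
 *	// Datagram socket: the kernel removes/adds the ll header for us.
 *	int dgram_fd = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 */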
143
144/* Private packet socket structures. */
145
146struct packet_mclist {
147 struct packet_mclist *next;
148 int ifindex;
149 int count;
150 unsigned short type;
151 unsigned short alen;
152 unsigned char addr[MAX_ADDR_LEN];
153};
154/* identical to struct packet_mreq except it has
155 * a longer address field.
156 */
157struct packet_mreq_max {
158 int mr_ifindex;
159 unsigned short mr_type;
160 unsigned short mr_alen;
161 unsigned char mr_address[MAX_ADDR_LEN];
162};
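/*
 * For illustration, a minimal sketch of how userspace fills the
 * corresponding packet_mreq for PACKET_ADD_MEMBERSHIP (the longer
 * packet_mreq_max form above is only needed for hardware addresses
 * larger than 8 bytes). Needs <net/if.h> and <linux/if_packet.h>;
 * "eth0" is just an example interface name.
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = if_nametoindex("eth0"),
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		   &mreq, sizeof(mreq));
 */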
163
164static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
165 int closing, int tx_ring);
166
167#define PGV_FROM_VMALLOC 1
168struct pgv {
169 char *buffer;
170 unsigned char flags;
171};
172
173struct packet_ring_buffer {
174 struct pgv *pg_vec;
175 unsigned int head;
176 unsigned int frames_per_block;
177 unsigned int frame_size;
178 unsigned int frame_max;
179
180 unsigned int pg_vec_order;
181 unsigned int pg_vec_pages;
182 unsigned int pg_vec_len;
183
184 atomic_t pending;
185};
186
187struct packet_sock;
188static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
189
190static void packet_flush_mclist(struct sock *sk);
191
192struct packet_sock {
193 /* struct sock has to be the first member of packet_sock */
194 struct sock sk;
195 struct tpacket_stats stats;
196 struct packet_ring_buffer rx_ring;
197 struct packet_ring_buffer tx_ring;
198 int copy_thresh;
199 spinlock_t bind_lock;
200 struct mutex pg_vec_lock;
201 unsigned int running:1, /* prot_hook is attached*/
202 auxdata:1,
203 origdev:1,
204 has_vnet_hdr:1;
205 int ifindex; /* bound device */
206 __be16 num;
207 struct packet_mclist *mclist;
208 atomic_t mapped;
209 enum tpacket_versions tp_version;
210 unsigned int tp_hdrlen;
211 unsigned int tp_reserve;
212 unsigned int tp_loss:1;
213 unsigned int tp_tstamp;
214 struct packet_type prot_hook ____cacheline_aligned_in_smp;
215};
216
217struct packet_skb_cb {
218 unsigned int origlen;
219 union {
220 struct sockaddr_pkt pkt;
221 struct sockaddr_ll ll;
222 } sa;
223};
224
225#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
226
227static void __packet_set_status(struct packet_sock *po, void *frame, int status)
228{
229 union {
230 struct tpacket_hdr *h1;
231 struct tpacket2_hdr *h2;
232 void *raw;
233 } h;
234
235 h.raw = frame;
236 switch (po->tp_version) {
237 case TPACKET_V1:
238 h.h1->tp_status = status;
239 flush_dcache_page(virt_to_page(&h.h1->tp_status));
240 break;
241 case TPACKET_V2:
242 h.h2->tp_status = status;
243 flush_dcache_page(virt_to_page(&h.h2->tp_status));
244 break;
245 default:
246 pr_err("TPACKET version not supported\n");
247 BUG();
248 }
249
250 smp_wmb();
251}
252
253static int __packet_get_status(struct packet_sock *po, void *frame)
254{
255 union {
256 struct tpacket_hdr *h1;
257 struct tpacket2_hdr *h2;
258 void *raw;
259 } h;
260
261 smp_rmb();
262
263 h.raw = frame;
264 switch (po->tp_version) {
265 case TPACKET_V1:
266 flush_dcache_page(virt_to_page(&h.h1->tp_status));
267 return h.h1->tp_status;
268 case TPACKET_V2:
269 flush_dcache_page(virt_to_page(&h.h2->tp_status));
270 return h.h2->tp_status;
271 default:
272 pr_err("TPACKET version not supported\n");
273 BUG();
274 return 0;
275 }
276}
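/*
 * For reference: these two helpers implement the ownership handshake on
 * every ring frame. A frame belongs to the kernel while tp_status is
 * TP_STATUS_KERNEL (rx ring) or TP_STATUS_SENDING (tx ring), and is
 * handed to userspace by setting TP_STATUS_USER (rx) or
 * TP_STATUS_AVAILABLE (tx); the barriers keep the status update ordered
 * with respect to the frame contents seen through the shared mmap.
 */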
277
278static void *packet_lookup_frame(struct packet_sock *po,
279 struct packet_ring_buffer *rb,
280 unsigned int position,
281 int status)
282{
283 unsigned int pg_vec_pos, frame_offset;
284 union {
285 struct tpacket_hdr *h1;
286 struct tpacket2_hdr *h2;
287 void *raw;
288 } h;
289
290 pg_vec_pos = position / rb->frames_per_block;
291 frame_offset = position % rb->frames_per_block;
292
293 h.raw = rb->pg_vec[pg_vec_pos].buffer +
294 (frame_offset * rb->frame_size);
295
296 if (status != __packet_get_status(po, h.raw))
297 return NULL;
298
299 return h.raw;
300}
301
302static inline void *packet_current_frame(struct packet_sock *po,
303 struct packet_ring_buffer *rb,
304 int status)
305{
306 return packet_lookup_frame(po, rb, rb->head, status);
307}
308
309static inline void *packet_previous_frame(struct packet_sock *po,
310 struct packet_ring_buffer *rb,
311 int status)
312{
313 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
314 return packet_lookup_frame(po, rb, previous, status);
315}
316
317static inline void packet_increment_head(struct packet_ring_buffer *buff)
318{
319 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
320}
321
322static inline struct packet_sock *pkt_sk(struct sock *sk)
323{
324 return (struct packet_sock *)sk;
325}
326
327static void packet_sock_destruct(struct sock *sk)
328{
329 skb_queue_purge(&sk->sk_error_queue);
330
331 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
332 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
333
334 if (!sock_flag(sk, SOCK_DEAD)) {
335 pr_err("Attempt to release alive packet socket: %p\n", sk);
336 return;
337 }
338
339 sk_refcnt_debug_dec(sk);
340}
341
342
343static const struct proto_ops packet_ops;
344
345static const struct proto_ops packet_ops_spkt;
346
347static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
348 struct packet_type *pt, struct net_device *orig_dev)
349{
350 struct sock *sk;
351 struct sockaddr_pkt *spkt;
352
353 /*
354 * When we registered the protocol we saved the socket in the data
355 * field for just this event.
356 */
357
358 sk = pt->af_packet_priv;
359
360 /*
361 * Yank back the headers [hope the device set this
362 * right or kaboom...]
363 *
364 * Incoming packets have ll header pulled,
365 * push it back.
366 *
367 * For outgoing ones skb->data == skb_mac_header(skb),
368 * so this procedure is a no-op.
369 */
370
371 if (skb->pkt_type == PACKET_LOOPBACK)
372 goto out;
373
374 if (!net_eq(dev_net(dev), sock_net(sk)))
375 goto out;
376
377 skb = skb_share_check(skb, GFP_ATOMIC);
378 if (skb == NULL)
379 goto oom;
380
381 /* drop any routing info */
382 skb_dst_drop(skb);
383
384 /* drop conntrack reference */
385 nf_reset(skb);
386
387 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
388
389 skb_push(skb, skb->data - skb_mac_header(skb));
390
391 /*
392 * The SOCK_PACKET socket receives _all_ frames.
393 */
394
395 spkt->spkt_family = dev->type;
396 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
397 spkt->spkt_protocol = skb->protocol;
398
399 /*
400 * Charge the memory to the socket. This is done specifically
401 * to prevent sockets from using up all the memory.
402 */
403
404 if (sock_queue_rcv_skb(sk, skb) == 0)
405 return 0;
406
407out:
408 kfree_skb(skb);
409oom:
410 return 0;
411}
412
413
414/*
415 * Output a raw packet to the device layer. This bypasses all the other
416 * protocol layers, so you must supply it with a complete frame.
417 */
418
419static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
420 struct msghdr *msg, size_t len)
421{
422 struct sock *sk = sock->sk;
423 struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
424 struct sk_buff *skb = NULL;
425 struct net_device *dev;
426 __be16 proto = 0;
427 int err;
428
429 /*
430 * Get and verify the address.
431 */
432
433 if (saddr) {
434 if (msg->msg_namelen < sizeof(struct sockaddr))
435 return -EINVAL;
436 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
437 proto = saddr->spkt_protocol;
438 } else
439 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
440
441 /*
442 * Find the device first to size check it
443 */
444
445 saddr->spkt_device[13] = 0;
446retry:
447 rcu_read_lock();
448 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
449 err = -ENODEV;
450 if (dev == NULL)
451 goto out_unlock;
452
453 err = -ENETDOWN;
454 if (!(dev->flags & IFF_UP))
455 goto out_unlock;
456
457 /*
458 * You may not queue a frame bigger than the mtu. This is the lowest level
459 * raw protocol and you must do your own fragmentation at this level.
460 */
461
462 err = -EMSGSIZE;
463 if (len > dev->mtu + dev->hard_header_len)
464 goto out_unlock;
465
466 if (!skb) {
467 size_t reserved = LL_RESERVED_SPACE(dev);
468 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
469
470 rcu_read_unlock();
471 skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
472 if (skb == NULL)
473 return -ENOBUFS;
474 /* FIXME: Save some space for broken drivers that write a hard
475 * header at transmission time by themselves. PPP is the notable
476 * one here. This should really be fixed at the driver level.
477 */
478 skb_reserve(skb, reserved);
479 skb_reset_network_header(skb);
480
481 /* Try to align data part correctly */
482 if (hhlen) {
483 skb->data -= hhlen;
484 skb->tail -= hhlen;
485 if (len < hhlen)
486 skb_reset_network_header(skb);
487 }
488 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
489 if (err)
490 goto out_free;
491 goto retry;
492 }
493
494
495 skb->protocol = proto;
496 skb->dev = dev;
497 skb->priority = sk->sk_priority;
498 skb->mark = sk->sk_mark;
499 err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
500 if (err < 0)
501 goto out_unlock;
502
503 dev_queue_xmit(skb);
504 rcu_read_unlock();
505 return len;
506
507out_unlock:
508 rcu_read_unlock();
509out_free:
510 kfree_skb(skb);
511 return err;
512}
513
514static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
515 unsigned int res)
516{
517 struct sk_filter *filter;
518
519 rcu_read_lock_bh();
520 filter = rcu_dereference_bh(sk->sk_filter);
521 if (filter != NULL)
522 res = sk_run_filter(skb, filter->insns);
523 rcu_read_unlock_bh();
524
525 return res;
526}
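/*
 * For illustration, the filter consulted above is the classic BPF
 * program a userspace caller attaches with SO_ATTACH_FILTER. A minimal
 * sketch that accepts every packet (a single BPF_RET instruction),
 * using <linux/filter.h>:
 *
 *	struct sock_filter code[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 0x0000ffff },	// accept up to 64k bytes
 *	};
 *	struct sock_fprog prog = { .len = 1, .filter = code };
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */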
527
528/*
 529 This function does lazy skb cloning in the hope that most packets
 530 are discarded by BPF.
 531
 532 Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
 533 and skb->cb are mangled. It works because (and until) packets
 534 falling here are owned by the current CPU. Output packets are cloned
 535 by dev_queue_xmit_nit(), input packets are processed by net_bh
 536 sequentially, so if we return the skb to its original state on exit,
 537 we will not harm anyone.
538 */
539
540static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
541 struct packet_type *pt, struct net_device *orig_dev)
542{
543 struct sock *sk;
544 struct sockaddr_ll *sll;
545 struct packet_sock *po;
546 u8 *skb_head = skb->data;
547 int skb_len = skb->len;
548 unsigned int snaplen, res;
549
550 if (skb->pkt_type == PACKET_LOOPBACK)
551 goto drop;
552
553 sk = pt->af_packet_priv;
554 po = pkt_sk(sk);
555
556 if (!net_eq(dev_net(dev), sock_net(sk)))
557 goto drop;
558
559 skb->dev = dev;
560
561 if (dev->header_ops) {
562 /* The device has an explicit notion of ll header,
563 exported to higher levels.
564
 565 Otherwise, the device hides the details of its frame
 566 structure, so the corresponding packet header is
 567 never delivered to the user.
568 */
569 if (sk->sk_type != SOCK_DGRAM)
570 skb_push(skb, skb->data - skb_mac_header(skb));
571 else if (skb->pkt_type == PACKET_OUTGOING) {
572 /* Special case: outgoing packets have ll header at head */
573 skb_pull(skb, skb_network_offset(skb));
574 }
575 }
576
577 snaplen = skb->len;
578
579 res = run_filter(skb, sk, snaplen);
580 if (!res)
581 goto drop_n_restore;
582 if (snaplen > res)
583 snaplen = res;
584
585 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
586 (unsigned)sk->sk_rcvbuf)
587 goto drop_n_acct;
588
589 if (skb_shared(skb)) {
590 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
591 if (nskb == NULL)
592 goto drop_n_acct;
593
594 if (skb_head != skb->data) {
595 skb->data = skb_head;
596 skb->len = skb_len;
597 }
598 kfree_skb(skb);
599 skb = nskb;
600 }
601
602 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
603 sizeof(skb->cb));
604
605 sll = &PACKET_SKB_CB(skb)->sa.ll;
606 sll->sll_family = AF_PACKET;
607 sll->sll_hatype = dev->type;
608 sll->sll_protocol = skb->protocol;
609 sll->sll_pkttype = skb->pkt_type;
610 if (unlikely(po->origdev))
611 sll->sll_ifindex = orig_dev->ifindex;
612 else
613 sll->sll_ifindex = dev->ifindex;
614
615 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
616
617 PACKET_SKB_CB(skb)->origlen = skb->len;
618
619 if (pskb_trim(skb, snaplen))
620 goto drop_n_acct;
621
622 skb_set_owner_r(skb, sk);
623 skb->dev = NULL;
624 skb_dst_drop(skb);
625
626 /* drop conntrack reference */
627 nf_reset(skb);
628
629 spin_lock(&sk->sk_receive_queue.lock);
630 po->stats.tp_packets++;
631 skb->dropcount = atomic_read(&sk->sk_drops);
632 __skb_queue_tail(&sk->sk_receive_queue, skb);
633 spin_unlock(&sk->sk_receive_queue.lock);
634 sk->sk_data_ready(sk, skb->len);
635 return 0;
636
637drop_n_acct:
638 po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
639
640drop_n_restore:
641 if (skb_head != skb->data && skb_shared(skb)) {
642 skb->data = skb_head;
643 skb->len = skb_len;
644 }
645drop:
646 consume_skb(skb);
647 return 0;
648}
649
650static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
651 struct packet_type *pt, struct net_device *orig_dev)
652{
653 struct sock *sk;
654 struct packet_sock *po;
655 struct sockaddr_ll *sll;
656 union {
657 struct tpacket_hdr *h1;
658 struct tpacket2_hdr *h2;
659 void *raw;
660 } h;
661 u8 *skb_head = skb->data;
662 int skb_len = skb->len;
663 unsigned int snaplen, res;
664 unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
665 unsigned short macoff, netoff, hdrlen;
666 struct sk_buff *copy_skb = NULL;
667 struct timeval tv;
668 struct timespec ts;
669 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
670
671 if (skb->pkt_type == PACKET_LOOPBACK)
672 goto drop;
673
674 sk = pt->af_packet_priv;
675 po = pkt_sk(sk);
676
677 if (!net_eq(dev_net(dev), sock_net(sk)))
678 goto drop;
679
680 if (dev->header_ops) {
681 if (sk->sk_type != SOCK_DGRAM)
682 skb_push(skb, skb->data - skb_mac_header(skb));
683 else if (skb->pkt_type == PACKET_OUTGOING) {
684 /* Special case: outgoing packets have ll header at head */
685 skb_pull(skb, skb_network_offset(skb));
686 }
687 }
688
689 if (skb->ip_summed == CHECKSUM_PARTIAL)
690 status |= TP_STATUS_CSUMNOTREADY;
691
692 snaplen = skb->len;
693
694 res = run_filter(skb, sk, snaplen);
695 if (!res)
696 goto drop_n_restore;
697 if (snaplen > res)
698 snaplen = res;
699
700 if (sk->sk_type == SOCK_DGRAM) {
701 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
702 po->tp_reserve;
703 } else {
704 unsigned maclen = skb_network_offset(skb);
705 netoff = TPACKET_ALIGN(po->tp_hdrlen +
706 (maclen < 16 ? 16 : maclen)) +
707 po->tp_reserve;
708 macoff = netoff - maclen;
709 }
710
711 if (macoff + snaplen > po->rx_ring.frame_size) {
712 if (po->copy_thresh &&
713 atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
714 (unsigned)sk->sk_rcvbuf) {
715 if (skb_shared(skb)) {
716 copy_skb = skb_clone(skb, GFP_ATOMIC);
717 } else {
718 copy_skb = skb_get(skb);
719 skb_head = skb->data;
720 }
721 if (copy_skb)
722 skb_set_owner_r(copy_skb, sk);
723 }
724 snaplen = po->rx_ring.frame_size - macoff;
725 if ((int)snaplen < 0)
726 snaplen = 0;
727 }
728
729 spin_lock(&sk->sk_receive_queue.lock);
730 h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
731 if (!h.raw)
732 goto ring_is_full;
733 packet_increment_head(&po->rx_ring);
734 po->stats.tp_packets++;
735 if (copy_skb) {
736 status |= TP_STATUS_COPY;
737 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
738 }
739 if (!po->stats.tp_drops)
740 status &= ~TP_STATUS_LOSING;
741 spin_unlock(&sk->sk_receive_queue.lock);
742
743 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
744
745 switch (po->tp_version) {
746 case TPACKET_V1:
747 h.h1->tp_len = skb->len;
748 h.h1->tp_snaplen = snaplen;
749 h.h1->tp_mac = macoff;
750 h.h1->tp_net = netoff;
751 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
752 && shhwtstamps->syststamp.tv64)
753 tv = ktime_to_timeval(shhwtstamps->syststamp);
754 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
755 && shhwtstamps->hwtstamp.tv64)
756 tv = ktime_to_timeval(shhwtstamps->hwtstamp);
757 else if (skb->tstamp.tv64)
758 tv = ktime_to_timeval(skb->tstamp);
759 else
760 do_gettimeofday(&tv);
761 h.h1->tp_sec = tv.tv_sec;
762 h.h1->tp_usec = tv.tv_usec;
763 hdrlen = sizeof(*h.h1);
764 break;
765 case TPACKET_V2:
766 h.h2->tp_len = skb->len;
767 h.h2->tp_snaplen = snaplen;
768 h.h2->tp_mac = macoff;
769 h.h2->tp_net = netoff;
770 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
771 && shhwtstamps->syststamp.tv64)
772 ts = ktime_to_timespec(shhwtstamps->syststamp);
773 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
774 && shhwtstamps->hwtstamp.tv64)
775 ts = ktime_to_timespec(shhwtstamps->hwtstamp);
776 else if (skb->tstamp.tv64)
777 ts = ktime_to_timespec(skb->tstamp);
778 else
779 getnstimeofday(&ts);
780 h.h2->tp_sec = ts.tv_sec;
781 h.h2->tp_nsec = ts.tv_nsec;
782 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
783 hdrlen = sizeof(*h.h2);
784 break;
785 default:
786 BUG();
787 }
788
789 sll = h.raw + TPACKET_ALIGN(hdrlen);
790 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
791 sll->sll_family = AF_PACKET;
792 sll->sll_hatype = dev->type;
793 sll->sll_protocol = skb->protocol;
794 sll->sll_pkttype = skb->pkt_type;
795 if (unlikely(po->origdev))
796 sll->sll_ifindex = orig_dev->ifindex;
797 else
798 sll->sll_ifindex = dev->ifindex;
799
800 __packet_set_status(po, h.raw, status);
801 smp_mb();
802 {
803 struct page *p_start, *p_end;
804 u8 *h_end = h.raw + macoff + snaplen - 1;
805
806 p_start = virt_to_page(h.raw);
807 p_end = virt_to_page(h_end);
808 while (p_start <= p_end) {
809 flush_dcache_page(p_start);
810 p_start++;
811 }
812 }
813
814 sk->sk_data_ready(sk, 0);
815
816drop_n_restore:
817 if (skb_head != skb->data && skb_shared(skb)) {
818 skb->data = skb_head;
819 skb->len = skb_len;
820 }
821drop:
822 kfree_skb(skb);
823 return 0;
824
825ring_is_full:
826 po->stats.tp_drops++;
827 spin_unlock(&sk->sk_receive_queue.lock);
828
829 sk->sk_data_ready(sk, 0);
830 kfree_skb(copy_skb);
831 goto drop_n_restore;
832}
833
834static void tpacket_destruct_skb(struct sk_buff *skb)
835{
836 struct packet_sock *po = pkt_sk(skb->sk);
837 void *ph;
838
839 BUG_ON(skb == NULL);
840
841 if (likely(po->tx_ring.pg_vec)) {
842 ph = skb_shinfo(skb)->destructor_arg;
843 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
844 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
845 atomic_dec(&po->tx_ring.pending);
846 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
847 }
848
849 sock_wfree(skb);
850}
851
852static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
853 void *frame, struct net_device *dev, int size_max,
854 __be16 proto, unsigned char *addr)
855{
856 union {
857 struct tpacket_hdr *h1;
858 struct tpacket2_hdr *h2;
859 void *raw;
860 } ph;
861 int to_write, offset, len, tp_len, nr_frags, len_max;
862 struct socket *sock = po->sk.sk_socket;
863 struct page *page;
864 void *data;
865 int err;
866
867 ph.raw = frame;
868
869 skb->protocol = proto;
870 skb->dev = dev;
871 skb->priority = po->sk.sk_priority;
872 skb->mark = po->sk.sk_mark;
873 skb_shinfo(skb)->destructor_arg = ph.raw;
874
875 switch (po->tp_version) {
876 case TPACKET_V2:
877 tp_len = ph.h2->tp_len;
878 break;
879 default:
880 tp_len = ph.h1->tp_len;
881 break;
882 }
883 if (unlikely(tp_len > size_max)) {
884 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
885 return -EMSGSIZE;
886 }
887
888 skb_reserve(skb, LL_RESERVED_SPACE(dev));
889 skb_reset_network_header(skb);
890
891 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
892 to_write = tp_len;
893
894 if (sock->type == SOCK_DGRAM) {
895 err = dev_hard_header(skb, dev, ntohs(proto), addr,
896 NULL, tp_len);
897 if (unlikely(err < 0))
898 return -EINVAL;
899 } else if (dev->hard_header_len) {
900 /* net device doesn't like empty head */
901 if (unlikely(tp_len <= dev->hard_header_len)) {
902 pr_err("packet size is too short (%d < %d)\n",
903 tp_len, dev->hard_header_len);
904 return -EINVAL;
905 }
906
907 skb_push(skb, dev->hard_header_len);
908 err = skb_store_bits(skb, 0, data,
909 dev->hard_header_len);
910 if (unlikely(err))
911 return err;
912
913 data += dev->hard_header_len;
914 to_write -= dev->hard_header_len;
915 }
916
917 err = -EFAULT;
918 page = virt_to_page(data);
919 offset = offset_in_page(data);
920 len_max = PAGE_SIZE - offset;
921 len = ((to_write > len_max) ? len_max : to_write);
922
923 skb->data_len = to_write;
924 skb->len += to_write;
925 skb->truesize += to_write;
926 atomic_add(to_write, &po->sk.sk_wmem_alloc);
927
928 while (likely(to_write)) {
929 nr_frags = skb_shinfo(skb)->nr_frags;
930
931 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
932 pr_err("Packet exceeds the number of skb frags (%lu)\n",
933 MAX_SKB_FRAGS);
934 return -EFAULT;
935 }
936
937 flush_dcache_page(page);
938 get_page(page);
939 skb_fill_page_desc(skb,
940 nr_frags,
941 page++, offset, len);
942 to_write -= len;
943 offset = 0;
944 len_max = PAGE_SIZE;
945 len = ((to_write > len_max) ? len_max : to_write);
946 }
947
948 return tp_len;
949}
950
951static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
952{
953 struct socket *sock;
954 struct sk_buff *skb;
955 struct net_device *dev;
956 __be16 proto;
957 int ifindex, err, reserve = 0;
958 void *ph;
959 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
960 int tp_len, size_max;
961 unsigned char *addr;
962 int len_sum = 0;
963 int status = 0;
964
965 sock = po->sk.sk_socket;
966
967 mutex_lock(&po->pg_vec_lock);
968
969 err = -EBUSY;
970 if (saddr == NULL) {
971 ifindex = po->ifindex;
972 proto = po->num;
973 addr = NULL;
974 } else {
975 err = -EINVAL;
976 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
977 goto out;
978 if (msg->msg_namelen < (saddr->sll_halen
979 + offsetof(struct sockaddr_ll,
980 sll_addr)))
981 goto out;
982 ifindex = saddr->sll_ifindex;
983 proto = saddr->sll_protocol;
984 addr = saddr->sll_addr;
985 }
986
987 dev = dev_get_by_index(sock_net(&po->sk), ifindex);
988 err = -ENXIO;
989 if (unlikely(dev == NULL))
990 goto out;
991
992 reserve = dev->hard_header_len;
993
994 err = -ENETDOWN;
995 if (unlikely(!(dev->flags & IFF_UP)))
996 goto out_put;
997
998 size_max = po->tx_ring.frame_size
999 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
1000
1001 if (size_max > dev->mtu + reserve)
1002 size_max = dev->mtu + reserve;
1003
1004 do {
1005 ph = packet_current_frame(po, &po->tx_ring,
1006 TP_STATUS_SEND_REQUEST);
1007
1008 if (unlikely(ph == NULL)) {
1009 schedule();
1010 continue;
1011 }
1012
1013 status = TP_STATUS_SEND_REQUEST;
1014 skb = sock_alloc_send_skb(&po->sk,
1015 LL_ALLOCATED_SPACE(dev)
1016 + sizeof(struct sockaddr_ll),
1017 0, &err);
1018
1019 if (unlikely(skb == NULL))
1020 goto out_status;
1021
1022 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1023 addr);
1024
1025 if (unlikely(tp_len < 0)) {
1026 if (po->tp_loss) {
1027 __packet_set_status(po, ph,
1028 TP_STATUS_AVAILABLE);
1029 packet_increment_head(&po->tx_ring);
1030 kfree_skb(skb);
1031 continue;
1032 } else {
1033 status = TP_STATUS_WRONG_FORMAT;
1034 err = tp_len;
1035 goto out_status;
1036 }
1037 }
1038
1039 skb->destructor = tpacket_destruct_skb;
1040 __packet_set_status(po, ph, TP_STATUS_SENDING);
1041 atomic_inc(&po->tx_ring.pending);
1042
1043 status = TP_STATUS_SEND_REQUEST;
1044 err = dev_queue_xmit(skb);
1045 if (unlikely(err > 0)) {
1046 err = net_xmit_errno(err);
1047 if (err && __packet_get_status(po, ph) ==
1048 TP_STATUS_AVAILABLE) {
1049 /* skb was destructed already */
1050 skb = NULL;
1051 goto out_status;
1052 }
1053 /*
1054 * skb was dropped but not destructed yet;
1055 * let's treat it like congestion or err < 0
1056 */
1057 err = 0;
1058 }
1059 packet_increment_head(&po->tx_ring);
1060 len_sum += tp_len;
1061 } while (likely((ph != NULL) ||
1062 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
1063 (atomic_read(&po->tx_ring.pending))))
1064 );
1065
1066 err = len_sum;
1067 goto out_put;
1068
1069out_status:
1070 __packet_set_status(po, ph, status);
1071 kfree_skb(skb);
1072out_put:
1073 dev_put(dev);
1074out:
1075 mutex_unlock(&po->pg_vec_lock);
1076 return err;
1077}
1078
1079static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
1080 size_t reserve, size_t len,
1081 size_t linear, int noblock,
1082 int *err)
1083{
1084 struct sk_buff *skb;
1085
1086 /* Under a page? Don't bother with paged skb. */
1087 if (prepad + len < PAGE_SIZE || !linear)
1088 linear = len;
1089
1090 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
1091 err);
1092 if (!skb)
1093 return NULL;
1094
1095 skb_reserve(skb, reserve);
1096 skb_put(skb, linear);
1097 skb->data_len = len - linear;
1098 skb->len += len - linear;
1099
1100 return skb;
1101}
1102
1103static int packet_snd(struct socket *sock,
1104 struct msghdr *msg, size_t len)
1105{
1106 struct sock *sk = sock->sk;
1107 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1108 struct sk_buff *skb;
1109 struct net_device *dev;
1110 __be16 proto;
1111 unsigned char *addr;
1112 int ifindex, err, reserve = 0;
1113 struct virtio_net_hdr vnet_hdr = { 0 };
1114 int offset = 0;
1115 int vnet_hdr_len;
1116 struct packet_sock *po = pkt_sk(sk);
1117 unsigned short gso_type = 0;
1118
1119 /*
1120 * Get and verify the address.
1121 */
1122
1123 if (saddr == NULL) {
1124 ifindex = po->ifindex;
1125 proto = po->num;
1126 addr = NULL;
1127 } else {
1128 err = -EINVAL;
1129 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1130 goto out;
1131 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1132 goto out;
1133 ifindex = saddr->sll_ifindex;
1134 proto = saddr->sll_protocol;
1135 addr = saddr->sll_addr;
1136 }
1137
1138
1139 dev = dev_get_by_index(sock_net(sk), ifindex);
1140 err = -ENXIO;
1141 if (dev == NULL)
1142 goto out_unlock;
1143 if (sock->type == SOCK_RAW)
1144 reserve = dev->hard_header_len;
1145
1146 err = -ENETDOWN;
1147 if (!(dev->flags & IFF_UP))
1148 goto out_unlock;
1149
1150 if (po->has_vnet_hdr) {
1151 vnet_hdr_len = sizeof(vnet_hdr);
1152
1153 err = -EINVAL;
1154 if (len < vnet_hdr_len)
1155 goto out_unlock;
1156
1157 len -= vnet_hdr_len;
1158
1159 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
1160 vnet_hdr_len);
1161 if (err < 0)
1162 goto out_unlock;
1163
1164 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
1165 (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
1166 vnet_hdr.hdr_len))
1167 vnet_hdr.hdr_len = vnet_hdr.csum_start +
1168 vnet_hdr.csum_offset + 2;
1169
1170 err = -EINVAL;
1171 if (vnet_hdr.hdr_len > len)
1172 goto out_unlock;
1173
1174 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1175 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1176 case VIRTIO_NET_HDR_GSO_TCPV4:
1177 gso_type = SKB_GSO_TCPV4;
1178 break;
1179 case VIRTIO_NET_HDR_GSO_TCPV6:
1180 gso_type = SKB_GSO_TCPV6;
1181 break;
1182 case VIRTIO_NET_HDR_GSO_UDP:
1183 gso_type = SKB_GSO_UDP;
1184 break;
1185 default:
1186 goto out_unlock;
1187 }
1188
1189 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
1190 gso_type |= SKB_GSO_TCP_ECN;
1191
1192 if (vnet_hdr.gso_size == 0)
1193 goto out_unlock;
1194
1195 }
1196 }
1197
1198 err = -EMSGSIZE;
1199 if (!gso_type && (len > dev->mtu+reserve))
1200 goto out_unlock;
1201
1202 err = -ENOBUFS;
1203 skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
1204 LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
1205 msg->msg_flags & MSG_DONTWAIT, &err);
1206 if (skb == NULL)
1207 goto out_unlock;
1208
1209 skb_set_network_header(skb, reserve);
1210
1211 err = -EINVAL;
1212 if (sock->type == SOCK_DGRAM &&
1213 (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
1214 goto out_free;
1215
1216 /* Returns -EFAULT on error */
1217 err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1218 if (err)
1219 goto out_free;
1220 err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1221 if (err < 0)
1222 goto out_free;
1223
1224 skb->protocol = proto;
1225 skb->dev = dev;
1226 skb->priority = sk->sk_priority;
1227 skb->mark = sk->sk_mark;
1228
1229 if (po->has_vnet_hdr) {
1230 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1231 if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
1232 vnet_hdr.csum_offset)) {
1233 err = -EINVAL;
1234 goto out_free;
1235 }
1236 }
1237
1238 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
1239 skb_shinfo(skb)->gso_type = gso_type;
1240
1241 /* Header must be checked, and gso_segs computed. */
1242 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1243 skb_shinfo(skb)->gso_segs = 0;
1244
1245 len += vnet_hdr_len;
1246 }
1247
1248 /*
1249 * Now send it
1250 */
1251
1252 err = dev_queue_xmit(skb);
1253 if (err > 0 && (err = net_xmit_errno(err)) != 0)
1254 goto out_unlock;
1255
1256 dev_put(dev);
1257
1258 return len;
1259
1260out_free:
1261 kfree_skb(skb);
1262out_unlock:
1263 if (dev)
1264 dev_put(dev);
1265out:
1266 return err;
1267}
1268
1269static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1270 struct msghdr *msg, size_t len)
1271{
1272 struct sock *sk = sock->sk;
1273 struct packet_sock *po = pkt_sk(sk);
1274 if (po->tx_ring.pg_vec)
1275 return tpacket_snd(po, msg);
1276 else
1277 return packet_snd(sock, msg, len);
1278}
1279
1280/*
1281 * Close a PACKET socket. This is fairly simple. We immediately go
1282 * to 'closed' state and remove our protocol entry in the device list.
1283 */
1284
1285static int packet_release(struct socket *sock)
1286{
1287 struct sock *sk = sock->sk;
1288 struct packet_sock *po;
1289 struct net *net;
1290 struct tpacket_req req;
1291
1292 if (!sk)
1293 return 0;
1294
1295 net = sock_net(sk);
1296 po = pkt_sk(sk);
1297
1298 spin_lock_bh(&net->packet.sklist_lock);
1299 sk_del_node_init_rcu(sk);
1300 sock_prot_inuse_add(net, sk->sk_prot, -1);
1301 spin_unlock_bh(&net->packet.sklist_lock);
1302
1303 spin_lock(&po->bind_lock);
1304 if (po->running) {
1305 /*
1306 * Remove from protocol table
1307 */
1308 po->running = 0;
1309 po->num = 0;
1310 __dev_remove_pack(&po->prot_hook);
1311 __sock_put(sk);
1312 }
1313 spin_unlock(&po->bind_lock);
1314
1315 packet_flush_mclist(sk);
1316
1317 memset(&req, 0, sizeof(req));
1318
1319 if (po->rx_ring.pg_vec)
1320 packet_set_ring(sk, &req, 1, 0);
1321
1322 if (po->tx_ring.pg_vec)
1323 packet_set_ring(sk, &req, 1, 1);
1324
1325 synchronize_net();
1326 /*
1327 * Now the socket is dead. No more input will appear.
1328 */
1329 sock_orphan(sk);
1330 sock->sk = NULL;
1331
1332 /* Purge queues */
1333
1334 skb_queue_purge(&sk->sk_receive_queue);
1335 sk_refcnt_debug_release(sk);
1336
1337 sock_put(sk);
1338 return 0;
1339}
1340
1341/*
1342 * Attach a packet hook.
1343 */
1344
1345static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1346{
1347 struct packet_sock *po = pkt_sk(sk);
1348 /*
1349 * Detach an existing hook if present.
1350 */
1351
1352 lock_sock(sk);
1353
1354 spin_lock(&po->bind_lock);
1355 if (po->running) {
1356 __sock_put(sk);
1357 po->running = 0;
1358 po->num = 0;
1359 spin_unlock(&po->bind_lock);
1360 dev_remove_pack(&po->prot_hook);
1361 spin_lock(&po->bind_lock);
1362 }
1363
1364 po->num = protocol;
1365 po->prot_hook.type = protocol;
1366 po->prot_hook.dev = dev;
1367
1368 po->ifindex = dev ? dev->ifindex : 0;
1369
1370 if (protocol == 0)
1371 goto out_unlock;
1372
1373 if (!dev || (dev->flags & IFF_UP)) {
1374 dev_add_pack(&po->prot_hook);
1375 sock_hold(sk);
1376 po->running = 1;
1377 } else {
1378 sk->sk_err = ENETDOWN;
1379 if (!sock_flag(sk, SOCK_DEAD))
1380 sk->sk_error_report(sk);
1381 }
1382
1383out_unlock:
1384 spin_unlock(&po->bind_lock);
1385 release_sock(sk);
1386 return 0;
1387}
1388
1389/*
1390 * Bind a packet socket to a device
1391 */
1392
1393static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1394 int addr_len)
1395{
1396 struct sock *sk = sock->sk;
1397 char name[15];
1398 struct net_device *dev;
1399 int err = -ENODEV;
1400
1401 /*
1402 * Check legality
1403 */
1404
1405 if (addr_len != sizeof(struct sockaddr))
1406 return -EINVAL;
1407 strlcpy(name, uaddr->sa_data, sizeof(name));
1408
1409 dev = dev_get_by_name(sock_net(sk), name);
1410 if (dev) {
1411 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1412 dev_put(dev);
1413 }
1414 return err;
1415}
1416
1417static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1418{
1419 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1420 struct sock *sk = sock->sk;
1421 struct net_device *dev = NULL;
1422 int err;
1423
1424
1425 /*
1426 * Check legality
1427 */
1428
1429 if (addr_len < sizeof(struct sockaddr_ll))
1430 return -EINVAL;
1431 if (sll->sll_family != AF_PACKET)
1432 return -EINVAL;
1433
1434 if (sll->sll_ifindex) {
1435 err = -ENODEV;
1436 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1437 if (dev == NULL)
1438 goto out;
1439 }
1440 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1441 if (dev)
1442 dev_put(dev);
1443
1444out:
1445 return err;
1446}
1447
1448static struct proto packet_proto = {
1449 .name = "PACKET",
1450 .owner = THIS_MODULE,
1451 .obj_size = sizeof(struct packet_sock),
1452};
1453
1454/*
1455 * Create a packet socket of type SOCK_DGRAM, SOCK_RAW or SOCK_PACKET.
1456 */
1457
1458static int packet_create(struct net *net, struct socket *sock, int protocol,
1459 int kern)
1460{
1461 struct sock *sk;
1462 struct packet_sock *po;
1463 __be16 proto = (__force __be16)protocol; /* weird, but documented */
1464 int err;
1465
1466 if (!capable(CAP_NET_RAW))
1467 return -EPERM;
1468 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1469 sock->type != SOCK_PACKET)
1470 return -ESOCKTNOSUPPORT;
1471
1472 sock->state = SS_UNCONNECTED;
1473
1474 err = -ENOBUFS;
1475 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1476 if (sk == NULL)
1477 goto out;
1478
1479 sock->ops = &packet_ops;
1480 if (sock->type == SOCK_PACKET)
1481 sock->ops = &packet_ops_spkt;
1482
1483 sock_init_data(sock, sk);
1484
1485 po = pkt_sk(sk);
1486 sk->sk_family = PF_PACKET;
1487 po->num = proto;
1488
1489 sk->sk_destruct = packet_sock_destruct;
1490 sk_refcnt_debug_inc(sk);
1491
1492 /*
1493 * Attach a protocol block
1494 */
1495
1496 spin_lock_init(&po->bind_lock);
1497 mutex_init(&po->pg_vec_lock);
1498 po->prot_hook.func = packet_rcv;
1499
1500 if (sock->type == SOCK_PACKET)
1501 po->prot_hook.func = packet_rcv_spkt;
1502
1503 po->prot_hook.af_packet_priv = sk;
1504
1505 if (proto) {
1506 po->prot_hook.type = proto;
1507 dev_add_pack(&po->prot_hook);
1508 sock_hold(sk);
1509 po->running = 1;
1510 }
1511
1512 spin_lock_bh(&net->packet.sklist_lock);
1513 sk_add_node_rcu(sk, &net->packet.sklist);
1514 sock_prot_inuse_add(net, &packet_proto, 1);
1515 spin_unlock_bh(&net->packet.sklist_lock);
1516
1517 return 0;
1518out:
1519 return err;
1520}
1521
1522static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
1523{
1524 struct sock_exterr_skb *serr;
1525 struct sk_buff *skb, *skb2;
1526 int copied, err;
1527
1528 err = -EAGAIN;
1529 skb = skb_dequeue(&sk->sk_error_queue);
1530 if (skb == NULL)
1531 goto out;
1532
1533 copied = skb->len;
1534 if (copied > len) {
1535 msg->msg_flags |= MSG_TRUNC;
1536 copied = len;
1537 }
1538 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1539 if (err)
1540 goto out_free_skb;
1541
1542 sock_recv_timestamp(msg, sk, skb);
1543
1544 serr = SKB_EXT_ERR(skb);
1545 put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
1546 sizeof(serr->ee), &serr->ee);
1547
1548 msg->msg_flags |= MSG_ERRQUEUE;
1549 err = copied;
1550
1551 /* Reset and regenerate socket error */
1552 spin_lock_bh(&sk->sk_error_queue.lock);
1553 sk->sk_err = 0;
1554 if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
1555 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
1556 spin_unlock_bh(&sk->sk_error_queue.lock);
1557 sk->sk_error_report(sk);
1558 } else
1559 spin_unlock_bh(&sk->sk_error_queue.lock);
1560
1561out_free_skb:
1562 kfree_skb(skb);
1563out:
1564 return err;
1565}
1566
1567/*
1568 * Pull a packet from our receive queue and hand it to the user.
1569 * If necessary we block.
1570 */
1571
1572static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1573 struct msghdr *msg, size_t len, int flags)
1574{
1575 struct sock *sk = sock->sk;
1576 struct sk_buff *skb;
1577 int copied, err;
1578 struct sockaddr_ll *sll;
1579 int vnet_hdr_len = 0;
1580
1581 err = -EINVAL;
1582 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1583 goto out;
1584
1585#if 0
1586 /* What error should we return now? EUNATTACH? */
1587 if (pkt_sk(sk)->ifindex < 0)
1588 return -ENODEV;
1589#endif
1590
1591 if (flags & MSG_ERRQUEUE) {
1592 err = packet_recv_error(sk, msg, len);
1593 goto out;
1594 }
1595
1596 /*
1597 * Call the generic datagram receiver. This handles all sorts
1598 * of horrible races and re-entrancy so we can forget about it
1599 * in the protocol layers.
1600 *
1601 * Now it will return ENETDOWN if the device has just gone down,
1602 * but then it will block.
1603 */
1604
1605 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1606
1607 /*
1608 * An error occurred, so return it. Because skb_recv_datagram()
1609 * handles the blocking, we don't have to see or worry about
1610 * blocking retries.
1611 */
1612
1613 if (skb == NULL)
1614 goto out;
1615
1616 if (pkt_sk(sk)->has_vnet_hdr) {
1617 struct virtio_net_hdr vnet_hdr = { 0 };
1618
1619 err = -EINVAL;
1620 vnet_hdr_len = sizeof(vnet_hdr);
1621 if (len < vnet_hdr_len)
1622 goto out_free;
1623
1624 len -= vnet_hdr_len;
1625
1626 if (skb_is_gso(skb)) {
1627 struct skb_shared_info *sinfo = skb_shinfo(skb);
1628
1629 /* This is a hint as to how much should be linear. */
1630 vnet_hdr.hdr_len = skb_headlen(skb);
1631 vnet_hdr.gso_size = sinfo->gso_size;
1632 if (sinfo->gso_type & SKB_GSO_TCPV4)
1633 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1634 else if (sinfo->gso_type & SKB_GSO_TCPV6)
1635 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1636 else if (sinfo->gso_type & SKB_GSO_UDP)
1637 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
1638 else if (sinfo->gso_type & SKB_GSO_FCOE)
1639 goto out_free;
1640 else
1641 BUG();
1642 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
1643 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1644 } else
1645 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
1646
1647 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1648 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1649 vnet_hdr.csum_start = skb->csum_start -
1650 skb_headroom(skb);
1651 vnet_hdr.csum_offset = skb->csum_offset;
1652 } /* else everything is zero */
1653
1654 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
1655 vnet_hdr_len);
1656 if (err < 0)
1657 goto out_free;
1658 }
1659
1660 /*
1661 * If the address length field is there to be filled in, we fill
1662 * it in now.
1663 */
1664
1665 sll = &PACKET_SKB_CB(skb)->sa.ll;
1666 if (sock->type == SOCK_PACKET)
1667 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1668 else
1669 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1670
1671 /*
1672 * You lose any data beyond the buffer you gave. If this worries a
1673 * user program, it can ask the device for its MTU anyway.
1674 */
1675
1676 copied = skb->len;
1677 if (copied > len) {
1678 copied = len;
1679 msg->msg_flags |= MSG_TRUNC;
1680 }
1681
1682 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1683 if (err)
1684 goto out_free;
1685
1686 sock_recv_ts_and_drops(msg, sk, skb);
1687
1688 if (msg->msg_name)
1689 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1690 msg->msg_namelen);
1691
1692 if (pkt_sk(sk)->auxdata) {
1693 struct tpacket_auxdata aux;
1694
1695 aux.tp_status = TP_STATUS_USER;
1696 if (skb->ip_summed == CHECKSUM_PARTIAL)
1697 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1698 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1699 aux.tp_snaplen = skb->len;
1700 aux.tp_mac = 0;
1701 aux.tp_net = skb_network_offset(skb);
1702 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
1703
1704 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1705 }
1706
1707 /*
1708 * Free or return the buffer as appropriate. Again this
1709 * hides all the races and re-entrancy issues from us.
1710 */
1711 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1712
1713out_free:
1714 skb_free_datagram(sk, skb);
1715out:
1716 return err;
1717}
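/*
 * For illustration, a typical userspace receive call against this path;
 * the sockaddr_ll filled in above identifies the originating device and
 * its link-level address (buffer size is an arbitrary example value):
 *
 *	struct sockaddr_ll from;
 *	socklen_t fromlen = sizeof(from);
 *	char buf[2048];
 *	ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
 *			     (struct sockaddr *)&from, &fromlen);
 */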
1718
1719static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1720 int *uaddr_len, int peer)
1721{
1722 struct net_device *dev;
1723 struct sock *sk = sock->sk;
1724
1725 if (peer)
1726 return -EOPNOTSUPP;
1727
1728 uaddr->sa_family = AF_PACKET;
1729 rcu_read_lock();
1730 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
1731 if (dev)
1732 strncpy(uaddr->sa_data, dev->name, 14);
1733 else
1734 memset(uaddr->sa_data, 0, 14);
1735 rcu_read_unlock();
1736 *uaddr_len = sizeof(*uaddr);
1737
1738 return 0;
1739}
1740
1741static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1742 int *uaddr_len, int peer)
1743{
1744 struct net_device *dev;
1745 struct sock *sk = sock->sk;
1746 struct packet_sock *po = pkt_sk(sk);
1747 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1748
1749 if (peer)
1750 return -EOPNOTSUPP;
1751
1752 sll->sll_family = AF_PACKET;
1753 sll->sll_ifindex = po->ifindex;
1754 sll->sll_protocol = po->num;
1755 sll->sll_pkttype = 0;
1756 rcu_read_lock();
1757 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1758 if (dev) {
1759 sll->sll_hatype = dev->type;
1760 sll->sll_halen = dev->addr_len;
1761 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1762 } else {
1763 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
1764 sll->sll_halen = 0;
1765 }
1766 rcu_read_unlock();
1767 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1768
1769 return 0;
1770}
1771
1772static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1773 int what)
1774{
1775 switch (i->type) {
1776 case PACKET_MR_MULTICAST:
1777 if (i->alen != dev->addr_len)
1778 return -EINVAL;
1779 if (what > 0)
1780 return dev_mc_add(dev, i->addr);
1781 else
1782 return dev_mc_del(dev, i->addr);
1783 break;
1784 case PACKET_MR_PROMISC:
1785 return dev_set_promiscuity(dev, what);
1786 break;
1787 case PACKET_MR_ALLMULTI:
1788 return dev_set_allmulti(dev, what);
1789 break;
1790 case PACKET_MR_UNICAST:
1791 if (i->alen != dev->addr_len)
1792 return -EINVAL;
1793 if (what > 0)
1794 return dev_uc_add(dev, i->addr);
1795 else
1796 return dev_uc_del(dev, i->addr);
1797 break;
1798 default:
1799 break;
1800 }
1801 return 0;
1802}
1803
1804static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1805{
1806 for ( ; i; i = i->next) {
1807 if (i->ifindex == dev->ifindex)
1808 packet_dev_mc(dev, i, what);
1809 }
1810}
1811
1812static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1813{
1814 struct packet_sock *po = pkt_sk(sk);
1815 struct packet_mclist *ml, *i;
1816 struct net_device *dev;
1817 int err;
1818
1819 rtnl_lock();
1820
1821 err = -ENODEV;
1822 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1823 if (!dev)
1824 goto done;
1825
1826 err = -EINVAL;
1827 if (mreq->mr_alen > dev->addr_len)
1828 goto done;
1829
1830 err = -ENOBUFS;
1831 i = kmalloc(sizeof(*i), GFP_KERNEL);
1832 if (i == NULL)
1833 goto done;
1834
1835 err = 0;
1836 for (ml = po->mclist; ml; ml = ml->next) {
1837 if (ml->ifindex == mreq->mr_ifindex &&
1838 ml->type == mreq->mr_type &&
1839 ml->alen == mreq->mr_alen &&
1840 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1841 ml->count++;
1842 /* Free the new element ... */
1843 kfree(i);
1844 goto done;
1845 }
1846 }
1847
1848 i->type = mreq->mr_type;
1849 i->ifindex = mreq->mr_ifindex;
1850 i->alen = mreq->mr_alen;
1851 memcpy(i->addr, mreq->mr_address, i->alen);
1852 i->count = 1;
1853 i->next = po->mclist;
1854 po->mclist = i;
1855 err = packet_dev_mc(dev, i, 1);
1856 if (err) {
1857 po->mclist = i->next;
1858 kfree(i);
1859 }
1860
1861done:
1862 rtnl_unlock();
1863 return err;
1864}
1865
1866static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1867{
1868 struct packet_mclist *ml, **mlp;
1869
1870 rtnl_lock();
1871
1872 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1873 if (ml->ifindex == mreq->mr_ifindex &&
1874 ml->type == mreq->mr_type &&
1875 ml->alen == mreq->mr_alen &&
1876 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1877 if (--ml->count == 0) {
1878 struct net_device *dev;
1879 *mlp = ml->next;
1880 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1881 if (dev)
1882 packet_dev_mc(dev, ml, -1);
1883 kfree(ml);
1884 }
1885 rtnl_unlock();
1886 return 0;
1887 }
1888 }
1889 rtnl_unlock();
1890 return -EADDRNOTAVAIL;
1891}
1892
1893static void packet_flush_mclist(struct sock *sk)
1894{
1895 struct packet_sock *po = pkt_sk(sk);
1896 struct packet_mclist *ml;
1897
1898 if (!po->mclist)
1899 return;
1900
1901 rtnl_lock();
1902 while ((ml = po->mclist) != NULL) {
1903 struct net_device *dev;
1904
1905 po->mclist = ml->next;
1906 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1907 if (dev != NULL)
1908 packet_dev_mc(dev, ml, -1);
1909 kfree(ml);
1910 }
1911 rtnl_unlock();
1912}
1913
1914static int
1915packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1916{
1917 struct sock *sk = sock->sk;
1918 struct packet_sock *po = pkt_sk(sk);
1919 int ret;
1920
1921 if (level != SOL_PACKET)
1922 return -ENOPROTOOPT;
1923
1924 switch (optname) {
1925 case PACKET_ADD_MEMBERSHIP:
1926 case PACKET_DROP_MEMBERSHIP:
1927 {
1928 struct packet_mreq_max mreq;
1929 int len = optlen;
1930 memset(&mreq, 0, sizeof(mreq));
1931 if (len < sizeof(struct packet_mreq))
1932 return -EINVAL;
1933 if (len > sizeof(mreq))
1934 len = sizeof(mreq);
1935 if (copy_from_user(&mreq, optval, len))
1936 return -EFAULT;
1937 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1938 return -EINVAL;
1939 if (optname == PACKET_ADD_MEMBERSHIP)
1940 ret = packet_mc_add(sk, &mreq);
1941 else
1942 ret = packet_mc_drop(sk, &mreq);
1943 return ret;
1944 }
1945
1946 case PACKET_RX_RING:
1947 case PACKET_TX_RING:
1948 {
1949 struct tpacket_req req;
1950
1951 if (optlen < sizeof(req))
1952 return -EINVAL;
1953 if (pkt_sk(sk)->has_vnet_hdr)
1954 return -EINVAL;
1955 if (copy_from_user(&req, optval, sizeof(req)))
1956 return -EFAULT;
1957 return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1958 }
1959 case PACKET_COPY_THRESH:
1960 {
1961 int val;
1962
1963 if (optlen != sizeof(val))
1964 return -EINVAL;
1965 if (copy_from_user(&val, optval, sizeof(val)))
1966 return -EFAULT;
1967
1968 pkt_sk(sk)->copy_thresh = val;
1969 return 0;
1970 }
1971 case PACKET_VERSION:
1972 {
1973 int val;
1974
1975 if (optlen != sizeof(val))
1976 return -EINVAL;
1977 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1978 return -EBUSY;
1979 if (copy_from_user(&val, optval, sizeof(val)))
1980 return -EFAULT;
1981 switch (val) {
1982 case TPACKET_V1:
1983 case TPACKET_V2:
1984 po->tp_version = val;
1985 return 0;
1986 default:
1987 return -EINVAL;
1988 }
1989 }
1990 case PACKET_RESERVE:
1991 {
1992 unsigned int val;
1993
1994 if (optlen != sizeof(val))
1995 return -EINVAL;
1996 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1997 return -EBUSY;
1998 if (copy_from_user(&val, optval, sizeof(val)))
1999 return -EFAULT;
2000 po->tp_reserve = val;
2001 return 0;
2002 }
2003 case PACKET_LOSS:
2004 {
2005 unsigned int val;
2006
2007 if (optlen != sizeof(val))
2008 return -EINVAL;
2009 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
2010 return -EBUSY;
2011 if (copy_from_user(&val, optval, sizeof(val)))
2012 return -EFAULT;
2013 po->tp_loss = !!val;
2014 return 0;
2015 }
2016 case PACKET_AUXDATA:
2017 {
2018 int val;
2019
2020 if (optlen < sizeof(val))
2021 return -EINVAL;
2022 if (copy_from_user(&val, optval, sizeof(val)))
2023 return -EFAULT;
2024
2025 po->auxdata = !!val;
2026 return 0;
2027 }
2028 case PACKET_ORIGDEV:
2029 {
2030 int val;
2031
2032 if (optlen < sizeof(val))
2033 return -EINVAL;
2034 if (copy_from_user(&val, optval, sizeof(val)))
2035 return -EFAULT;
2036
2037 po->origdev = !!val;
2038 return 0;
2039 }
2040 case PACKET_VNET_HDR:
2041 {
2042 int val;
2043
2044 if (sock->type != SOCK_RAW)
2045 return -EINVAL;
2046 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
2047 return -EBUSY;
2048 if (optlen < sizeof(val))
2049 return -EINVAL;
2050 if (copy_from_user(&val, optval, sizeof(val)))
2051 return -EFAULT;
2052
2053 po->has_vnet_hdr = !!val;
2054 return 0;
2055 }
2056 case PACKET_TIMESTAMP:
2057 {
2058 int val;
2059
2060 if (optlen != sizeof(val))
2061 return -EINVAL;
2062 if (copy_from_user(&val, optval, sizeof(val)))
2063 return -EFAULT;
2064
2065 po->tp_tstamp = val;
2066 return 0;
2067 }
2068 default:
2069 return -ENOPROTOOPT;
2070 }
2071}
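/*
 * For illustration, a minimal sketch of the PACKET_RX_RING setup path
 * handled above, as a userspace program would drive it (sizes are
 * arbitrary example values; tp_frame_nr must equal
 * tp_block_nr * frames-per-block):
 *
 *	int ver = TPACKET_V2;
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr   = 128,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */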
2072
2073static int packet_getsockopt(struct socket *sock, int level, int optname,
2074 char __user *optval, int __user *optlen)
2075{
2076 int len;
2077 int val;
2078 struct sock *sk = sock->sk;
2079 struct packet_sock *po = pkt_sk(sk);
2080 void *data;
2081 struct tpacket_stats st;
2082
2083 if (level != SOL_PACKET)
2084 return -ENOPROTOOPT;
2085
2086 if (get_user(len, optlen))
2087 return -EFAULT;
2088
2089 if (len < 0)
2090 return -EINVAL;
2091
2092 switch (optname) {
2093 case PACKET_STATISTICS:
2094 if (len > sizeof(struct tpacket_stats))
2095 len = sizeof(struct tpacket_stats);
2096 spin_lock_bh(&sk->sk_receive_queue.lock);
2097 st = po->stats;
2098 memset(&po->stats, 0, sizeof(st));
2099 spin_unlock_bh(&sk->sk_receive_queue.lock);
2100 st.tp_packets += st.tp_drops;
2101
2102 data = &st;
2103 break;
2104 case PACKET_AUXDATA:
2105 if (len > sizeof(int))
2106 len = sizeof(int);
2107 val = po->auxdata;
2108
2109 data = &val;
2110 break;
2111 case PACKET_ORIGDEV:
2112 if (len > sizeof(int))
2113 len = sizeof(int);
2114 val = po->origdev;
2115
2116 data = &val;
2117 break;
2118 case PACKET_VNET_HDR:
2119 if (len > sizeof(int))
2120 len = sizeof(int);
2121 val = po->has_vnet_hdr;
2122
2123 data = &val;
2124 break;
2125 case PACKET_VERSION:
2126 if (len > sizeof(int))
2127 len = sizeof(int);
2128 val = po->tp_version;
2129 data = &val;
2130 break;
2131 case PACKET_HDRLEN:
2132 if (len > sizeof(int))
2133 len = sizeof(int);
2134 if (copy_from_user(&val, optval, len))
2135 return -EFAULT;
2136 switch (val) {
2137 case TPACKET_V1:
2138 val = sizeof(struct tpacket_hdr);
2139 break;
2140 case TPACKET_V2:
2141 val = sizeof(struct tpacket2_hdr);
2142 break;
2143 default:
2144 return -EINVAL;
2145 }
2146 data = &val;
2147 break;
2148 case PACKET_RESERVE:
2149 if (len > sizeof(unsigned int))
2150 len = sizeof(unsigned int);
2151 val = po->tp_reserve;
2152 data = &val;
2153 break;
2154 case PACKET_LOSS:
2155 if (len > sizeof(unsigned int))
2156 len = sizeof(unsigned int);
2157 val = po->tp_loss;
2158 data = &val;
2159 break;
2160 case PACKET_TIMESTAMP:
2161 if (len > sizeof(int))
2162 len = sizeof(int);
2163 val = po->tp_tstamp;
2164 data = &val;
2165 break;
2166 default:
2167 return -ENOPROTOOPT;
2168 }
2169
2170 if (put_user(len, optlen))
2171 return -EFAULT;
2172 if (copy_to_user(optval, data, len))
2173 return -EFAULT;
2174 return 0;
2175}
2176
2177
2178static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
2179{
2180 struct sock *sk;
2181 struct hlist_node *node;
2182 struct net_device *dev = data;
2183 struct net *net = dev_net(dev);
2184
2185 rcu_read_lock();
2186 sk_for_each_rcu(sk, node, &net->packet.sklist) {
2187 struct packet_sock *po = pkt_sk(sk);
2188
2189 switch (msg) {
2190 case NETDEV_UNREGISTER:
2191 if (po->mclist)
2192 packet_dev_mclist(dev, po->mclist, -1);
2193 /* fallthrough */
2194
2195 case NETDEV_DOWN:
2196 if (dev->ifindex == po->ifindex) {
2197 spin_lock(&po->bind_lock);
2198 if (po->running) {
2199 __dev_remove_pack(&po->prot_hook);
2200 __sock_put(sk);
2201 po->running = 0;
2202 sk->sk_err = ENETDOWN;
2203 if (!sock_flag(sk, SOCK_DEAD))
2204 sk->sk_error_report(sk);
2205 }
2206 if (msg == NETDEV_UNREGISTER) {
2207 po->ifindex = -1;
2208 po->prot_hook.dev = NULL;
2209 }
2210 spin_unlock(&po->bind_lock);
2211 }
2212 break;
2213 case NETDEV_UP:
2214 if (dev->ifindex == po->ifindex) {
2215 spin_lock(&po->bind_lock);
2216 if (po->num && !po->running) {
2217 dev_add_pack(&po->prot_hook);
2218 sock_hold(sk);
2219 po->running = 1;
2220 }
2221 spin_unlock(&po->bind_lock);
2222 }
2223 break;
2224 }
2225 }
2226 rcu_read_unlock();
2227 return NOTIFY_DONE;
2228}
2229
2230
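/*
 * Socket ioctls: SIOCOUTQ and SIOCINQ report the bytes pending in the
 * transmit queue and the length of the next queued receive skb,
 * SIOCGSTAMP/SIOCGSTAMPNS return the timestamp of the last received
 * packet, and a set of interface/routing ioctls is forwarded to the
 * inet_dgram_ops handler when CONFIG_INET is enabled.
 */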
2231static int packet_ioctl(struct socket *sock, unsigned int cmd,
2232 unsigned long arg)
2233{
2234 struct sock *sk = sock->sk;
2235
2236 switch (cmd) {
2237 case SIOCOUTQ:
2238 {
2239 int amount = sk_wmem_alloc_get(sk);
2240
2241 return put_user(amount, (int __user *)arg);
2242 }
2243 case SIOCINQ:
2244 {
2245 struct sk_buff *skb;
2246 int amount = 0;
2247
2248 spin_lock_bh(&sk->sk_receive_queue.lock);
2249 skb = skb_peek(&sk->sk_receive_queue);
2250 if (skb)
2251 amount = skb->len;
2252 spin_unlock_bh(&sk->sk_receive_queue.lock);
2253 return put_user(amount, (int __user *)arg);
2254 }
2255 case SIOCGSTAMP:
2256 return sock_get_timestamp(sk, (struct timeval __user *)arg);
2257 case SIOCGSTAMPNS:
2258 return sock_get_timestampns(sk, (struct timespec __user *)arg);
2259
2260#ifdef CONFIG_INET
2261 case SIOCADDRT:
2262 case SIOCDELRT:
2263 case SIOCDARP:
2264 case SIOCGARP:
2265 case SIOCSARP:
2266 case SIOCGIFADDR:
2267 case SIOCSIFADDR:
2268 case SIOCGIFBRDADDR:
2269 case SIOCSIFBRDADDR:
2270 case SIOCGIFNETMASK:
2271 case SIOCSIFNETMASK:
2272 case SIOCGIFDSTADDR:
2273 case SIOCSIFDSTADDR:
2274 case SIOCSIFFLAGS:
2275 return inet_dgram_ops.ioctl(sock, cmd, arg);
2276#endif
2277
2278 default:
2279 return -ENOIOCTLCMD;
2280 }
2281 return 0;
2282}
2283
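/*
 * poll() support: on top of the generic datagram_poll() result, signal
 * POLLIN when the frame just before the rx ring head is no longer
 * kernel-owned (i.e. it holds data for userspace), and POLLOUT when the
 * current tx ring frame is in the TP_STATUS_AVAILABLE state.
 */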
2284static unsigned int packet_poll(struct file *file, struct socket *sock,
2285 poll_table *wait)
2286{
2287 struct sock *sk = sock->sk;
2288 struct packet_sock *po = pkt_sk(sk);
2289 unsigned int mask = datagram_poll(file, sock, wait);
2290
2291 spin_lock_bh(&sk->sk_receive_queue.lock);
2292 if (po->rx_ring.pg_vec) {
2293 if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2294 mask |= POLLIN | POLLRDNORM;
2295 }
2296 spin_unlock_bh(&sk->sk_receive_queue.lock);
2297 spin_lock_bh(&sk->sk_write_queue.lock);
2298 if (po->tx_ring.pg_vec) {
2299 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2300 mask |= POLLOUT | POLLWRNORM;
2301 }
2302 spin_unlock_bh(&sk->sk_write_queue.lock);
2303 return mask;
2304}
2305
2306
2307/* Dirty? Well, I still have not found a better way to account
2308 * for user mmaps.
2309 */
2310
2311static void packet_mm_open(struct vm_area_struct *vma)
2312{
2313 struct file *file = vma->vm_file;
2314 struct socket *sock = file->private_data;
2315 struct sock *sk = sock->sk;
2316
2317 if (sk)
2318 atomic_inc(&pkt_sk(sk)->mapped);
2319}
2320
2321static void packet_mm_close(struct vm_area_struct *vma)
2322{
2323 struct file *file = vma->vm_file;
2324 struct socket *sock = file->private_data;
2325 struct sock *sk = sock->sk;
2326
2327 if (sk)
2328 atomic_dec(&pkt_sk(sk)->mapped);
2329}
2330
2331static const struct vm_operations_struct packet_mmap_ops = {
2332 .open = packet_mm_open,
2333 .close = packet_mm_close,
2334};
2335
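/*
 * Release a ring page vector: each block is returned with vfree() or
 * free_pages() depending on how it was allocated (PGV_FROM_VMALLOC), and
 * the pgv array itself is kfree()d here, so callers must not free it again.
 */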
2336static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
2337 unsigned int len)
2338{
2339 int i;
2340
2341 for (i = 0; i < len; i++) {
2342 if (likely(pg_vec[i].buffer)) {
2343 if (pg_vec[i].flags & PGV_FROM_VMALLOC)
2344 vfree(pg_vec[i].buffer);
2345 else
2346 free_pages((unsigned long)pg_vec[i].buffer,
2347 order);
2348 pg_vec[i].buffer = NULL;
2349 }
2350 }
2351 kfree(pg_vec);
2352}
2353
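/*
 * Allocate one ring block.  Physically contiguous pages are tried first
 * (with __GFP_NORETRY so we fail fast), then vmalloc (marking the block
 * PGV_FROM_VMALLOC so free_pg_vec() releases it correctly), and as a last
 * resort the page allocator is retried without __GFP_NORETRY so that
 * reclaim/swap may be used.
 */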
2354static inline char *alloc_one_pg_vec_page(unsigned long order,
2355 unsigned char *flags)
2356{
2357 char *buffer = NULL;
2358 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
2359 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
2360
2361 buffer = (char *) __get_free_pages(gfp_flags, order);
2362
2363 if (buffer)
2364 return buffer;
2365
2366 /*
2367 * __get_free_pages failed, fall back to vmalloc
2368 */
2369 *flags |= PGV_FROM_VMALLOC;
2370 buffer = vmalloc((1 << order) * PAGE_SIZE);
2371
2372 if (buffer)
2373 return buffer;
2374
2375 /*
2377 * vmalloc failed, let's dig into swap here
2377 */
2378 *flags = 0;
2379 gfp_flags &= ~__GFP_NORETRY;
2380 buffer = (char *)__get_free_pages(gfp_flags, order);
2381 if (buffer)
2382 return buffer;
2383
2384 /*
2385 * complete and utter failure
2386 */
2387 return NULL;
2388}
2389
2390static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
2391{
2392 unsigned int block_nr = req->tp_block_nr;
2393 struct pgv *pg_vec;
2394 int i;
2395
2396 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
2397 if (unlikely(!pg_vec))
2398 goto out;
2399
2400 for (i = 0; i < block_nr; i++) {
2401 pg_vec[i].buffer = alloc_one_pg_vec_page(order,
2402 &pg_vec[i].flags);
2403 if (unlikely(!pg_vec[i].buffer))
2404 goto out_free_pgvec;
2405 }
2406
2407out:
2408 return pg_vec;
2409
2410out_free_pgvec:
2411 free_pg_vec(pg_vec, order, block_nr); /* also kfree()s pg_vec itself */
2413 pg_vec = NULL;
2414 goto out;
2415}
2416
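/*
 * Install or tear down an rx/tx memory-mapped ring.  The request is
 * validated (the block size must be a positive multiple of PAGE_SIZE,
 * frames must be TPACKET_ALIGNMENT-aligned, large enough for the header
 * plus tp_reserve, and must exactly fill the blocks), the page vector is
 * allocated, the protocol hook is temporarily removed so no packets are
 * delivered while the old and new rings are swapped under pg_vec_lock,
 * and finally the hook is re-added and the old ring is freed.
 */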
2417static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2418 int closing, int tx_ring)
2419{
2420 struct pgv *pg_vec = NULL;
2421 struct packet_sock *po = pkt_sk(sk);
2422 int was_running, order = 0;
2423 struct packet_ring_buffer *rb;
2424 struct sk_buff_head *rb_queue;
2425 __be16 num;
2426 int err;
2427
2428 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2429 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2430
2431 err = -EBUSY;
2432 if (!closing) {
2433 if (atomic_read(&po->mapped))
2434 goto out;
2435 if (atomic_read(&rb->pending))
2436 goto out;
2437 }
2438
2439 if (req->tp_block_nr) {
2440 /* Sanity tests and some calculations */
2441 err = -EBUSY;
2442 if (unlikely(rb->pg_vec))
2443 goto out;
2444
2445 switch (po->tp_version) {
2446 case TPACKET_V1:
2447 po->tp_hdrlen = TPACKET_HDRLEN;
2448 break;
2449 case TPACKET_V2:
2450 po->tp_hdrlen = TPACKET2_HDRLEN;
2451 break;
2452 }
2453
2454 err = -EINVAL;
2455 if (unlikely((int)req->tp_block_size <= 0))
2456 goto out;
2457 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2458 goto out;
2459 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2460 po->tp_reserve))
2461 goto out;
2462 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2463 goto out;
2464
2465 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2466 if (unlikely(rb->frames_per_block <= 0))
2467 goto out;
2468 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2469 req->tp_frame_nr))
2470 goto out;
2471
2472 err = -ENOMEM;
2473 order = get_order(req->tp_block_size);
2474 pg_vec = alloc_pg_vec(req, order);
2475 if (unlikely(!pg_vec))
2476 goto out;
2477 }
2478 /* Done */
2479 else {
2480 err = -EINVAL;
2481 if (unlikely(req->tp_frame_nr))
2482 goto out;
2483 }
2484
2485 lock_sock(sk);
2486
2487 /* Detach socket from network */
2488 spin_lock(&po->bind_lock);
2489 was_running = po->running;
2490 num = po->num;
2491 if (was_running) {
2492 __dev_remove_pack(&po->prot_hook);
2493 po->num = 0;
2494 po->running = 0;
2495 __sock_put(sk);
2496 }
2497 spin_unlock(&po->bind_lock);
2498
2499 synchronize_net();
2500
2501 err = -EBUSY;
2502 mutex_lock(&po->pg_vec_lock);
2503 if (closing || atomic_read(&po->mapped) == 0) {
2504 err = 0;
2505#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2506 spin_lock_bh(&rb_queue->lock);
2507 pg_vec = XC(rb->pg_vec, pg_vec);
2508 rb->frame_max = (req->tp_frame_nr - 1);
2509 rb->head = 0;
2510 rb->frame_size = req->tp_frame_size;
2511 spin_unlock_bh(&rb_queue->lock);
2512
2513 order = XC(rb->pg_vec_order, order);
2514 req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2515
2516 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2517 po->prot_hook.func = (po->rx_ring.pg_vec) ?
2518 tpacket_rcv : packet_rcv;
2519 skb_queue_purge(rb_queue);
2520#undef XC
2521 if (atomic_read(&po->mapped))
2522 pr_err("packet_mmap: vma is busy: %d\n",
2523 atomic_read(&po->mapped));
2524 }
2525 mutex_unlock(&po->pg_vec_lock);
2526
2527 spin_lock(&po->bind_lock);
2528 if (was_running && !po->running) {
2529 sock_hold(sk);
2530 po->running = 1;
2531 po->num = num;
2532 dev_add_pack(&po->prot_hook);
2533 }
2534 spin_unlock(&po->bind_lock);
2535
2536 release_sock(sk);
2537
2538 if (pg_vec)
2539 free_pg_vec(pg_vec, order, req->tp_block_nr);
2540out:
2541 return err;
2542}
2543
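/*
 * A minimal userspace sketch (illustrative only, not part of this file) of
 * driving packet_set_ring() and packet_mmap() above.  "fd" is assumed to
 * be an open AF_PACKET socket and the sizes are example values that assume
 * a 4 KiB PAGE_SIZE while satisfying the checks in packet_set_ring():
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr   = 64 * 2,
 *	};
 *	void *ring;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */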
2544static int packet_mmap(struct file *file, struct socket *sock,
2545 struct vm_area_struct *vma)
2546{
2547 struct sock *sk = sock->sk;
2548 struct packet_sock *po = pkt_sk(sk);
2549 unsigned long size, expected_size;
2550 struct packet_ring_buffer *rb;
2551 unsigned long start;
2552 int err = -EINVAL;
2553 int i;
2554
2555 if (vma->vm_pgoff)
2556 return -EINVAL;
2557
2558 mutex_lock(&po->pg_vec_lock);
2559
2560 expected_size = 0;
2561 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2562 if (rb->pg_vec) {
2563 expected_size += rb->pg_vec_len
2564 * rb->pg_vec_pages
2565 * PAGE_SIZE;
2566 }
2567 }
2568
2569 if (expected_size == 0)
2570 goto out;
2571
2572 size = vma->vm_end - vma->vm_start;
2573 if (size != expected_size)
2574 goto out;
2575
2576 start = vma->vm_start;
2577 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2578 if (rb->pg_vec == NULL)
2579 continue;
2580
2581 for (i = 0; i < rb->pg_vec_len; i++) {
2582 struct page *page;
2583 void *kaddr = rb->pg_vec[i].buffer;
2584 int pg_num;
2585
2586 for (pg_num = 0; pg_num < rb->pg_vec_pages;
2587 pg_num++) {
2588 if (rb->pg_vec[i].flags & PGV_FROM_VMALLOC)
2589 page = vmalloc_to_page(kaddr);
2590 else
2591 page = virt_to_page(kaddr);
2592
2593 err = vm_insert_page(vma, start, page);
2594 if (unlikely(err))
2595 goto out;
2596 start += PAGE_SIZE;
2597 kaddr += PAGE_SIZE;
2598 }
2599 }
2600 }
2601
2602 atomic_inc(&po->mapped);
2603 vma->vm_ops = &packet_mmap_ops;
2604 err = 0;
2605
2606out:
2607 mutex_unlock(&po->pg_vec_lock);
2608 return err;
2609}
2610
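/*
 * Two proto_ops tables: packet_ops_spkt serves the legacy SOCK_PACKET
 * sockets (spkt-style bind/sendmsg, no ring or mmap support), while
 * packet_ops serves SOCK_RAW and SOCK_DGRAM sockets with the full
 * setsockopt/getsockopt, poll and mmap ring support implemented above.
 */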
2611static const struct proto_ops packet_ops_spkt = {
2612 .family = PF_PACKET,
2613 .owner = THIS_MODULE,
2614 .release = packet_release,
2615 .bind = packet_bind_spkt,
2616 .connect = sock_no_connect,
2617 .socketpair = sock_no_socketpair,
2618 .accept = sock_no_accept,
2619 .getname = packet_getname_spkt,
2620 .poll = datagram_poll,
2621 .ioctl = packet_ioctl,
2622 .listen = sock_no_listen,
2623 .shutdown = sock_no_shutdown,
2624 .setsockopt = sock_no_setsockopt,
2625 .getsockopt = sock_no_getsockopt,
2626 .sendmsg = packet_sendmsg_spkt,
2627 .recvmsg = packet_recvmsg,
2628 .mmap = sock_no_mmap,
2629 .sendpage = sock_no_sendpage,
2630};
2631
2632static const struct proto_ops packet_ops = {
2633 .family = PF_PACKET,
2634 .owner = THIS_MODULE,
2635 .release = packet_release,
2636 .bind = packet_bind,
2637 .connect = sock_no_connect,
2638 .socketpair = sock_no_socketpair,
2639 .accept = sock_no_accept,
2640 .getname = packet_getname,
2641 .poll = packet_poll,
2642 .ioctl = packet_ioctl,
2643 .listen = sock_no_listen,
2644 .shutdown = sock_no_shutdown,
2645 .setsockopt = packet_setsockopt,
2646 .getsockopt = packet_getsockopt,
2647 .sendmsg = packet_sendmsg,
2648 .recvmsg = packet_recvmsg,
2649 .mmap = packet_mmap,
2650 .sendpage = sock_no_sendpage,
2651};
2652
2653static const struct net_proto_family packet_family_ops = {
2654 .family = PF_PACKET,
2655 .create = packet_create,
2656 .owner = THIS_MODULE,
2657};
2658
2659static struct notifier_block packet_netdev_notifier = {
2660 .notifier_call = packet_notifier,
2661};
2662
2663#ifdef CONFIG_PROC_FS
2664
2665static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2666 __acquires(RCU)
2667{
2668 struct net *net = seq_file_net(seq);
2669
2670 rcu_read_lock();
2671 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
2672}
2673
2674static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2675{
2676 struct net *net = seq_file_net(seq);
2677 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
2678}
2679
2680static void packet_seq_stop(struct seq_file *seq, void *v)
2681 __releases(RCU)
2682{
2683 rcu_read_unlock();
2684}
2685
2686static int packet_seq_show(struct seq_file *seq, void *v)
2687{
2688 if (v == SEQ_START_TOKEN)
2689 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
2690 else {
2691 struct sock *s = sk_entry(v);
2692 const struct packet_sock *po = pkt_sk(s);
2693
2694 seq_printf(seq,
2695 "%p %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
2696 s,
2697 atomic_read(&s->sk_refcnt),
2698 s->sk_type,
2699 ntohs(po->num),
2700 po->ifindex,
2701 po->running,
2702 atomic_read(&s->sk_rmem_alloc),
2703 sock_i_uid(s),
2704 sock_i_ino(s));
2705 }
2706
2707 return 0;
2708}
2709
2710static const struct seq_operations packet_seq_ops = {
2711 .start = packet_seq_start,
2712 .next = packet_seq_next,
2713 .stop = packet_seq_stop,
2714 .show = packet_seq_show,
2715};
2716
2717static int packet_seq_open(struct inode *inode, struct file *file)
2718{
2719 return seq_open_net(inode, file, &packet_seq_ops,
2720 sizeof(struct seq_net_private));
2721}
2722
2723static const struct file_operations packet_seq_fops = {
2724 .owner = THIS_MODULE,
2725 .open = packet_seq_open,
2726 .read = seq_read,
2727 .llseek = seq_lseek,
2728 .release = seq_release_net,
2729};
2730
2731#endif
2732
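/*
 * Per-namespace setup: initialise the per-net socket list and create the
 * /proc/net/packet seq_file; packet_net_exit() removes the proc entry again.
 */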
2733static int __net_init packet_net_init(struct net *net)
2734{
2735 spin_lock_init(&net->packet.sklist_lock);
2736 INIT_HLIST_HEAD(&net->packet.sklist);
2737
2738 if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2739 return -ENOMEM;
2740
2741 return 0;
2742}
2743
2744static void __net_exit packet_net_exit(struct net *net)
2745{
2746 proc_net_remove(net, "packet");
2747}
2748
2749static struct pernet_operations packet_net_ops = {
2750 .init = packet_net_init,
2751 .exit = packet_net_exit,
2752};
2753
2754
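/*
 * Module init/exit: packet_init() registers the protocol, the PF_PACKET
 * socket family, the pernet operations and the netdevice notifier;
 * packet_exit() unregisters them in the reverse order.
 */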
2755static void __exit packet_exit(void)
2756{
2757 unregister_netdevice_notifier(&packet_netdev_notifier);
2758 unregister_pernet_subsys(&packet_net_ops);
2759 sock_unregister(PF_PACKET);
2760 proto_unregister(&packet_proto);
2761}
2762
2763static int __init packet_init(void)
2764{
2765 int rc = proto_register(&packet_proto, 0);
2766
2767 if (rc != 0)
2768 goto out;
2769
2770 sock_register(&packet_family_ops);
2771 register_pernet_subsys(&packet_net_ops);
2772 register_netdevice_notifier(&packet_netdev_notifier);
2773out:
2774 return rc;
2775}
2776
2777module_init(packet_init);
2778module_exit(packet_exit);
2779MODULE_LICENSE("GPL");
2780MODULE_ALIAS_NETPROTO(PF_PACKET);