/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		PACKET - implements raw packet sockets.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() now used correctly
 *		Alan Cox	:	new skbuff lists, look ma no backlogs!
 *		Alan Cox	:	tidied skbuff lists.
 *		Alan Cox	:	Now uses generic datagram routines I
 *					added. Also fixed the peek/read crash
 *					from all old Linux datagram code.
 *		Alan Cox	:	Uses the improved datagram code.
 *		Alan Cox	:	Added NULL's for socket options.
 *		Alan Cox	:	Re-commented the code.
 *		Alan Cox	:	Use new kernel side addressing
 *		Rob Janssen	:	Correct MTU usage.
 *		Dave Platt	:	Counter leaks caused by incorrect
 *					interrupt locking and some slightly
 *					dubious gcc output. Can you read
 *					compiler: it said _VOLATILE_
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	New buffers. Use sk->mac.raw.
 *		Alan Cox	:	sendmsg/recvmsg support.
 *		Alan Cox	:	Protocol setting support
 *		Alexey Kuznetsov:	Untied from IPv4 stack.
 *		Cyrus Durgin	:	Fixed kerneld for kmod.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Ulises Alonso	:	Frame number limit removal and
 *					packet_set_ring memory leak.
 *		Eric Biederman	:	Allow for > 8 byte hardware addresses.
 *					The convention is that longer addresses
 *					will simply extend the hardware address
 *					byte arrays at the end of sockaddr_ll
 *					and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

/*
   Assumptions:
   - if device has no dev->hard_header routine, it adds and removes ll header
     inside itself. In this case ll header is invisible outside of device,
     but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit in the reserved space (tunnel); other ones are silly
     (PPP).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to ll
		 header. PPP does this, which is wrong, because it
		 introduces asymmetry between rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Summary
   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position;
   the packet classifier depends on it.
 */

/* Private packet socket structures. */

struct packet_mclist {
	struct packet_mclist	*next;
	int			ifindex;
	int			count;
	unsigned short		type;
	unsigned short		alen;
	unsigned char		addr[MAX_ADDR_LEN];
};
/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
			   int closing, int tx_ring);

#define PGV_FROM_VMALLOC 1
struct pgv {
	char *buffer;
	unsigned char flags;
};

struct packet_ring_buffer {
	struct pgv		*pg_vec;
	unsigned int		head;
	unsigned int		frames_per_block;
	unsigned int		frame_size;
	unsigned int		frame_max;

	unsigned int		pg_vec_order;
	unsigned int		pg_vec_pages;
	unsigned int		pg_vec_len;

	atomic_t		pending;
};

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);

static void packet_flush_mclist(struct sock *sk);

struct packet_sock {
	/* struct sock has to be the first member of packet_sock */
	struct sock		sk;
	struct tpacket_stats	stats;
	struct packet_ring_buffer	rx_ring;
	struct packet_ring_buffer	tx_ring;
	int			copy_thresh;
	spinlock_t		bind_lock;
	struct mutex		pg_vec_lock;
	unsigned int		running:1,	/* prot_hook is attached */
				auxdata:1,
				origdev:1,
				has_vnet_hdr:1;
	int			ifindex;	/* bound device */
	__be16			num;
	struct packet_mclist	*mclist;
	atomic_t		mapped;
	enum tpacket_versions	tp_version;
	unsigned int		tp_hdrlen;
	unsigned int		tp_reserve;
	unsigned int		tp_loss:1;
	unsigned int		tp_tstamp;
	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
};

struct packet_skb_cb {
	unsigned int origlen;
	union {
		struct sockaddr_pkt pkt;
		struct sockaddr_ll ll;
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

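/*
 * PACKET_SKB_CB overlays struct packet_skb_cb on the skb->cb scratch
 * area, so the receive path can stash the sender address and original
 * length without any extra allocation.  The BUILD_BUG_ON() in
 * packet_rcv() verifies that the union, extended to a maximal hardware
 * address, still fits in sizeof(skb->cb).
 */
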
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(virt_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(virt_to_page(&h.h2->tp_status));
		break;
	default:
		pr_err("TPACKET version not supported\n");
		BUG();
	}

	smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(virt_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(virt_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	default:
		pr_err("TPACKET version not supported\n");
		BUG();
		return 0;
	}
}
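
/*
 * The tp_status word doubles as the ownership flag for a frame that
 * userspace has mmap()ed: TP_STATUS_KERNEL frames belong to the kernel,
 * anything else belongs to userspace.  The smp_wmb()/smp_rmb() pair
 * orders the status accesses against the surrounding frame payload
 * accesses, and the flush_dcache_page() calls keep the user mapping
 * coherent on architectures with aliasing data caches.
 */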

static void *packet_lookup_frame(struct packet_sock *po,
				 struct packet_ring_buffer *rb,
				 unsigned int position,
				 int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos].buffer +
		(frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static inline void *packet_current_frame(struct packet_sock *po,
					 struct packet_ring_buffer *rb,
					 int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static inline void *packet_previous_frame(struct packet_sock *po,
					  struct packet_ring_buffer *rb,
					  int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static inline void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}
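
/*
 * Ring geometry, for illustration: frames are packed whole into blocks,
 * so frames_per_block = tp_block_size / tp_frame_size.  With
 * tp_block_size = 4096, tp_frame_size = 2048 and tp_frame_nr = 8,
 * frame #5 lives in block 5/2 = 2 at byte offset (5 % 2) * 2048.
 * frame_max holds the highest valid index (tp_frame_nr - 1), hence
 * the wrap back to 0 in packet_increment_head() above.
 */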

static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	return (struct packet_sock *)sk;
}

static void packet_sock_destruct(struct sock *sk)
{
	skb_queue_purge(&sk->sk_error_queue);

	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}


static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so that this procedure is a no-op.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets from using up all the memory.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}


/*
 * Output a raw packet to a device layer. This bypasses all the other
 * protocol layers and you must therefore supply it with a complete frame.
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
	struct sk_buff *skb = NULL;
	struct net_device *dev;
	__be16 proto = 0;
	int err;

	/*
	 *	Get and verify the address.
	 */

	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[13] = 0;
retry:
	rcu_read_lock();
	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len)
		goto out_unlock;

	if (!skb) {
		size_t reserved = LL_RESERVED_SPACE(dev);
		unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

		rcu_read_unlock();
		skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
		if (skb == NULL)
			return -ENOBUFS;
		/* FIXME: Save some space for broken drivers that write a hard
		 * header at transmission time by themselves. PPP is the notable
		 * one here. This should really be fixed at the driver level.
		 */
		skb_reserve(skb, reserved);
		skb_reset_network_header(skb);

		/* Try to align data part correctly */
		if (hhlen) {
			skb->data -= hhlen;
			skb->tail -= hhlen;
			if (len < hhlen)
				skb_reset_network_header(skb);
		}
		err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
		if (err)
			goto out_free;
		goto retry;
	}


	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
	if (err < 0)
		goto out_unlock;

	dev_queue_xmit(skb);
	rcu_read_unlock();
	return len;

out_unlock:
	rcu_read_unlock();
out_free:
	kfree_skb(skb);
	return err;
}
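
/*
 * Note on the retry: label above: sock_wmalloc() is called with
 * GFP_KERNEL and may sleep, so the function drops the RCU read lock
 * before allocating, then loops back and repeats the device lookup
 * from scratch once the buffer exists.
 */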

static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
				      unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock_bh();
	filter = rcu_dereference_bh(sk->sk_filter);
	if (filter != NULL)
		res = sk_run_filter(skb, filter->insns);
	rcu_read_unlock_bh();

	return res;
}
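
/*
 * A socket filter's return value is a byte count, not a boolean: 0
 * means drop the packet, anything else caps how much of it is kept.
 * The callers below use it exactly that way - a res of 0 drops the
 * packet, a smaller res trims snaplen.
 */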

/*
   This function does lazy skb cloning in the hope that most packets
   are discarded by BPF.

   Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
   and skb->cb are mangled. It works because (and until) packets
   falling here are owned by the current CPU. Output packets are cloned
   by dev_queue_xmit_nit(), input packets are processed by net_bh
   sequentially, so that if we return the skb to its original state on
   exit, we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	skb->dev = dev;

	if (dev->header_ops) {
		/* The device has an explicit notion of ll header,
		   exported to higher levels.

		   Otherwise, the device hides details of its frame
		   structure, so that the corresponding packet head is
		   never delivered to the user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf)
		goto drop_n_acct;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		kfree_skb(skb);
		skb = nskb;
	}

	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
		     sizeof(skb->cb));

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

	PACKET_SKB_CB(skb)->origlen = skb->len;

	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_packets++;
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk, skb->len);
	return 0;

drop_n_acct:
	po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	consume_skb(skb);
	return 0;
}
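
/*
 * packet_rcv() is the non-mmap receive path: the (possibly trimmed)
 * skb is queued on sk_receive_queue and copied to userspace later, in
 * packet_recvmsg().  tpacket_rcv() below is its ring-buffer twin and
 * copies the frame into the shared ring right here, in softirq context.
 */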

static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct packet_sock *po;
	struct sockaddr_ll *sll;
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;
	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
	unsigned short macoff, netoff, hdrlen;
	struct sk_buff *copy_skb = NULL;
	struct timeval tv;
	struct timespec ts;
	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (!net_eq(dev_net(dev), sock_net(sk)))
		goto drop;

	if (dev->header_ops) {
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		status |= TP_STATUS_CSUMNOTREADY;

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (sk->sk_type == SOCK_DGRAM) {
		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
				  po->tp_reserve;
	} else {
		unsigned maclen = skb_network_offset(skb);
		netoff = TPACKET_ALIGN(po->tp_hdrlen +
				       (maclen < 16 ? 16 : maclen)) +
			po->tp_reserve;
		macoff = netoff - maclen;
	}

	if (macoff + snaplen > po->rx_ring.frame_size) {
		if (po->copy_thresh &&
		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
		    (unsigned)sk->sk_rcvbuf) {
			if (skb_shared(skb)) {
				copy_skb = skb_clone(skb, GFP_ATOMIC);
			} else {
				copy_skb = skb_get(skb);
				skb_head = skb->data;
			}
			if (copy_skb)
				skb_set_owner_r(copy_skb, sk);
		}
		snaplen = po->rx_ring.frame_size - macoff;
		if ((int)snaplen < 0)
			snaplen = 0;
	}

	spin_lock(&sk->sk_receive_queue.lock);
	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
	if (!h.raw)
		goto ring_is_full;
	packet_increment_head(&po->rx_ring);
	po->stats.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	if (!po->stats.tp_drops)
		status &= ~TP_STATUS_LOSING;
	spin_unlock(&sk->sk_receive_queue.lock);

	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_len = skb->len;
		h.h1->tp_snaplen = snaplen;
		h.h1->tp_mac = macoff;
		h.h1->tp_net = netoff;
		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
				&& shhwtstamps->syststamp.tv64)
			tv = ktime_to_timeval(shhwtstamps->syststamp);
		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
				&& shhwtstamps->hwtstamp.tv64)
			tv = ktime_to_timeval(shhwtstamps->hwtstamp);
		else if (skb->tstamp.tv64)
			tv = ktime_to_timeval(skb->tstamp);
		else
			do_gettimeofday(&tv);
		h.h1->tp_sec = tv.tv_sec;
		h.h1->tp_usec = tv.tv_usec;
		hdrlen = sizeof(*h.h1);
		break;
	case TPACKET_V2:
		h.h2->tp_len = skb->len;
		h.h2->tp_snaplen = snaplen;
		h.h2->tp_mac = macoff;
		h.h2->tp_net = netoff;
		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
				&& shhwtstamps->syststamp.tv64)
			ts = ktime_to_timespec(shhwtstamps->syststamp);
		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
				&& shhwtstamps->hwtstamp.tv64)
			ts = ktime_to_timespec(shhwtstamps->hwtstamp);
		else if (skb->tstamp.tv64)
			ts = ktime_to_timespec(skb->tstamp);
		else
			getnstimeofday(&ts);
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
		hdrlen = sizeof(*h.h2);
		break;
	default:
		BUG();
	}

	sll = h.raw + TPACKET_ALIGN(hdrlen);
	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	__packet_set_status(po, h.raw, status);
	smp_mb();
	{
		struct page *p_start, *p_end;
		u8 *h_end = h.raw + macoff + snaplen - 1;

		p_start = virt_to_page(h.raw);
		p_end = virt_to_page(h_end);
		while (p_start <= p_end) {
			flush_dcache_page(p_start);
			p_start++;
		}
	}

	sk->sk_data_ready(sk, 0);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;

ring_is_full:
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

	sk->sk_data_ready(sk, 0);
	kfree_skb(copy_skb);
	goto drop_n_restore;
}
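
/*
 * Userspace view of the RX ring (illustrative sketch only, assuming
 * TPACKET_V1 and tp_block_size a multiple of tp_frame_size, so frames
 * are contiguous in the mapping):
 *
 *	struct tpacket_hdr *hdr = ring + i * frame_size;
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);
 *	handle((char *)hdr + hdr->tp_mac, hdr->tp_snaplen);
 *	hdr->tp_status = TP_STATUS_KERNEL;	// hand the frame back
 *	i = (i + 1) % frame_nr;
 *
 * i.e. the tp_status word written by __packet_set_status() above is
 * the only synchronization between kernel and user.
 */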

static void tpacket_destruct_skb(struct sk_buff *skb)
{
	struct packet_sock *po = pkt_sk(skb->sk);
	void *ph;

	BUG_ON(skb == NULL);

	if (likely(po->tx_ring.pg_vec)) {
		ph = skb_shinfo(skb)->destructor_arg;
		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
		atomic_dec(&po->tx_ring.pending);
		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
	}

	sock_wfree(skb);
}
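
/*
 * tpacket_destruct_skb() runs when the driver finally frees the skb,
 * which is the earliest point at which the TX ring frame may be handed
 * back to userspace as TP_STATUS_AVAILABLE; tx_ring.pending tracks how
 * many frames are still out with the driver.
 */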

static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
		void *frame, struct net_device *dev, int size_max,
		__be16 proto, unsigned char *addr)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} ph;
	int to_write, offset, len, tp_len, nr_frags, len_max;
	struct socket *sock = po->sk.sk_socket;
	struct page *page;
	void *data;
	int err;

	ph.raw = frame;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = po->sk.sk_priority;
	skb->mark = po->sk.sk_mark;
	skb_shinfo(skb)->destructor_arg = ph.raw;

	switch (po->tp_version) {
	case TPACKET_V2:
		tp_len = ph.h2->tp_len;
		break;
	default:
		tp_len = ph.h1->tp_len;
		break;
	}
	if (unlikely(tp_len > size_max)) {
		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
		return -EMSGSIZE;
	}

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_reset_network_header(skb);

	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
	to_write = tp_len;

	if (sock->type == SOCK_DGRAM) {
		err = dev_hard_header(skb, dev, ntohs(proto), addr,
				NULL, tp_len);
		if (unlikely(err < 0))
			return -EINVAL;
	} else if (dev->hard_header_len) {
		/* net device doesn't like empty head */
		if (unlikely(tp_len <= dev->hard_header_len)) {
			pr_err("packet size is too short (%d < %d)\n",
			       tp_len, dev->hard_header_len);
			return -EINVAL;
		}

		skb_push(skb, dev->hard_header_len);
		err = skb_store_bits(skb, 0, data,
				dev->hard_header_len);
		if (unlikely(err))
			return err;

		data += dev->hard_header_len;
		to_write -= dev->hard_header_len;
	}

	err = -EFAULT;
	page = virt_to_page(data);
	offset = offset_in_page(data);
	len_max = PAGE_SIZE - offset;
	len = ((to_write > len_max) ? len_max : to_write);

	skb->data_len = to_write;
	skb->len += to_write;
	skb->truesize += to_write;
	atomic_add(to_write, &po->sk.sk_wmem_alloc);

	while (likely(to_write)) {
		nr_frags = skb_shinfo(skb)->nr_frags;

		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
			pr_err("Packet exceed the number of skb frags(%lu)\n",
			       MAX_SKB_FRAGS);
			return -EFAULT;
		}

		flush_dcache_page(page);
		get_page(page);
		skb_fill_page_desc(skb,
				nr_frags,
				page++, offset, len);
		to_write -= len;
		offset = 0;
		len_max = PAGE_SIZE;
		len = ((to_write > len_max) ? len_max : to_write);
	}

	return tp_len;
}
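
/*
 * tpacket_fill_skb() builds the skb without copying the payload: the
 * ring pages holding the frame are attached as page fragments (with
 * their refcounts raised), so the device DMAs straight out of the
 * user-visible ring.  Only a hard header that the device insists on
 * is copied into the linear area.
 */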

static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
	struct socket *sock;
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	int ifindex, err, reserve = 0;
	void *ph;
	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
	int tp_len, size_max;
	unsigned char *addr;
	int len_sum = 0;
	int status = 0;

	sock = po->sk.sk_socket;

	mutex_lock(&po->pg_vec_lock);

	err = -EBUSY;
	if (saddr == NULL) {
		ifindex	= po->ifindex;
		proto	= po->num;
		addr	= NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen
					+ offsetof(struct sockaddr_ll,
						sll_addr)))
			goto out;
		ifindex	= saddr->sll_ifindex;
		proto	= saddr->sll_protocol;
		addr	= saddr->sll_addr;
	}

	dev = dev_get_by_index(sock_net(&po->sk), ifindex);
	err = -ENXIO;
	if (unlikely(dev == NULL))
		goto out;

	reserve = dev->hard_header_len;

	err = -ENETDOWN;
	if (unlikely(!(dev->flags & IFF_UP)))
		goto out_put;

	size_max = po->tx_ring.frame_size
		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));

	if (size_max > dev->mtu + reserve)
		size_max = dev->mtu + reserve;

	do {
		ph = packet_current_frame(po, &po->tx_ring,
				TP_STATUS_SEND_REQUEST);

		if (unlikely(ph == NULL)) {
			schedule();
			continue;
		}

		status = TP_STATUS_SEND_REQUEST;
		skb = sock_alloc_send_skb(&po->sk,
				LL_ALLOCATED_SPACE(dev)
				+ sizeof(struct sockaddr_ll),
				0, &err);

		if (unlikely(skb == NULL))
			goto out_status;

		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
				addr);

		if (unlikely(tp_len < 0)) {
			if (po->tp_loss) {
				__packet_set_status(po, ph,
						TP_STATUS_AVAILABLE);
				packet_increment_head(&po->tx_ring);
				kfree_skb(skb);
				continue;
			} else {
				status = TP_STATUS_WRONG_FORMAT;
				err = tp_len;
				goto out_status;
			}
		}

		skb->destructor = tpacket_destruct_skb;
		__packet_set_status(po, ph, TP_STATUS_SENDING);
		atomic_inc(&po->tx_ring.pending);

		status = TP_STATUS_SEND_REQUEST;
		err = dev_queue_xmit(skb);
		if (unlikely(err > 0)) {
			err = net_xmit_errno(err);
			if (err && __packet_get_status(po, ph) ==
				   TP_STATUS_AVAILABLE) {
				/* skb was destructed already */
				skb = NULL;
				goto out_status;
			}
			/*
			 * skb was dropped but not destructed yet;
			 * let's treat it like congestion or err < 0
			 */
			err = 0;
		}
		packet_increment_head(&po->tx_ring);
		len_sum += tp_len;
	} while (likely((ph != NULL) ||
			((!(msg->msg_flags & MSG_DONTWAIT)) &&
			 (atomic_read(&po->tx_ring.pending))))
		);

	err = len_sum;
	goto out_put;

out_status:
	__packet_set_status(po, ph, status);
	kfree_skb(skb);
out_put:
	dev_put(dev);
out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
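
/*
 * Userspace view of the TX ring (illustrative sketch only, TPACKET_V1,
 * contiguous frames as above): fill the payload at the fixed offset,
 * publish the frame, then kick the kernel with a plain send():
 *
 *	struct tpacket_hdr *hdr = ring + i * frame_size;
 *	memcpy((char *)hdr + TPACKET_HDRLEN - sizeof(struct sockaddr_ll),
 *	       pkt, pkt_len);
 *	hdr->tp_len = pkt_len;
 *	hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *	send(fd, NULL, 0, 0);	// drives the do/while loop above
 */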

static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
					       size_t reserve, size_t len,
					       size_t linear, int noblock,
					       int *err)
{
	struct sk_buff *skb;

	/* Under a page? Don't bother with paged skb. */
	if (prepad + len < PAGE_SIZE || !linear)
		linear = len;

	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
				   err);
	if (!skb)
		return NULL;

	skb_reserve(skb, reserve);
	skb_put(skb, linear);
	skb->data_len = len - linear;
	skb->len += len - linear;

	return skb;
}
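
/*
 * The 'linear' argument is the GSO header-length hint taken from the
 * virtio_net header in packet_snd(): that much of the packet goes in
 * the linear area (where the stack expects to find protocol headers)
 * and the rest is left to paged fragments.  Small packets skip the
 * split entirely.
 */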

static int packet_snd(struct socket *sock,
		      struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	unsigned char *addr;
	int ifindex, err, reserve = 0;
	struct virtio_net_hdr vnet_hdr = { 0 };
	int offset = 0;
	int vnet_hdr_len;
	struct packet_sock *po = pkt_sk(sk);
	unsigned short gso_type = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr == NULL) {
		ifindex	= po->ifindex;
		proto	= po->num;
		addr	= NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
			goto out;
		ifindex	= saddr->sll_ifindex;
		proto	= saddr->sll_protocol;
		addr	= saddr->sll_addr;
	}


	dev = dev_get_by_index(sock_net(sk), ifindex);
	err = -ENXIO;
	if (dev == NULL)
		goto out_unlock;
	if (sock->type == SOCK_RAW)
		reserve = dev->hard_header_len;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	if (po->has_vnet_hdr) {
		vnet_hdr_len = sizeof(vnet_hdr);

		err = -EINVAL;
		if (len < vnet_hdr_len)
			goto out_unlock;

		len -= vnet_hdr_len;

		err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
				       vnet_hdr_len);
		if (err < 0)
			goto out_unlock;

		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
		    (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
		      vnet_hdr.hdr_len))
			vnet_hdr.hdr_len = vnet_hdr.csum_start +
						 vnet_hdr.csum_offset + 2;

		err = -EINVAL;
		if (vnet_hdr.hdr_len > len)
			goto out_unlock;

		if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
			switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
			case VIRTIO_NET_HDR_GSO_TCPV4:
				gso_type = SKB_GSO_TCPV4;
				break;
			case VIRTIO_NET_HDR_GSO_TCPV6:
				gso_type = SKB_GSO_TCPV6;
				break;
			case VIRTIO_NET_HDR_GSO_UDP:
				gso_type = SKB_GSO_UDP;
				break;
			default:
				goto out_unlock;
			}

			if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
				gso_type |= SKB_GSO_TCP_ECN;

			if (vnet_hdr.gso_size == 0)
				goto out_unlock;

		}
	}

	err = -EMSGSIZE;
	if (!gso_type && (len > dev->mtu+reserve))
		goto out_unlock;

	err = -ENOBUFS;
	skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
			       LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
			       msg->msg_flags & MSG_DONTWAIT, &err);
	if (skb == NULL)
		goto out_unlock;

	skb_set_network_header(skb, reserve);

	err = -EINVAL;
	if (sock->type == SOCK_DGRAM &&
	    (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
		goto out_free;

	/* Returns -EFAULT on error */
	err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
	if (err)
		goto out_free;
	err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
	if (err < 0)
		goto out_free;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	if (po->has_vnet_hdr) {
		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
			if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
						  vnet_hdr.csum_offset)) {
				err = -EINVAL;
				goto out_free;
			}
		}

		skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
		skb_shinfo(skb)->gso_type = gso_type;

		/* Header must be checked, and gso_segs computed. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
		skb_shinfo(skb)->gso_segs = 0;

		len += vnet_hdr_len;
	}

	/*
	 *	Now send it
	 */

	err = dev_queue_xmit(skb);
	if (err > 0 && (err = net_xmit_errno(err)) != 0)
		goto out_unlock;

	dev_put(dev);

	return len;

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
out:
	return err;
}

static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
		struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	if (po->tx_ring.pg_vec)
		return tpacket_snd(po, msg);
	else
		return packet_snd(sock, msg, len);
}

/*
 *	Close a PACKET socket. This is fairly simple. We immediately go
 *	to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po;
	struct net *net;
	struct tpacket_req req;

	if (!sk)
		return 0;

	net = sock_net(sk);
	po = pkt_sk(sk);

	spin_lock_bh(&net->packet.sklist_lock);
	sk_del_node_init_rcu(sk);
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	spin_unlock_bh(&net->packet.sklist_lock);

	spin_lock(&po->bind_lock);
	if (po->running) {
		/*
		 * Remove from protocol table
		 */
		po->running = 0;
		po->num = 0;
		__dev_remove_pack(&po->prot_hook);
		__sock_put(sk);
	}
	spin_unlock(&po->bind_lock);

	packet_flush_mclist(sk);

	memset(&req, 0, sizeof(req));

	if (po->rx_ring.pg_vec)
		packet_set_ring(sk, &req, 1, 0);

	if (po->tx_ring.pg_vec)
		packet_set_ring(sk, &req, 1, 1);

	synchronize_net();
	/*
	 * Now the socket is dead. No more input will appear.
	 */
	sock_orphan(sk);
	sock->sk = NULL;

	/* Purge queues */

	skb_queue_purge(&sk->sk_receive_queue);
	sk_refcnt_debug_release(sk);

	sock_put(sk);
	return 0;
}

/*
 *	Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
	struct packet_sock *po = pkt_sk(sk);
	/*
	 *	Detach an existing hook if present.
	 */

	lock_sock(sk);

	spin_lock(&po->bind_lock);
	if (po->running) {
		__sock_put(sk);
		po->running = 0;
		po->num = 0;
		spin_unlock(&po->bind_lock);
		dev_remove_pack(&po->prot_hook);
		spin_lock(&po->bind_lock);
	}

	po->num = protocol;
	po->prot_hook.type = protocol;
	po->prot_hook.dev = dev;

	po->ifindex = dev ? dev->ifindex : 0;

	if (protocol == 0)
		goto out_unlock;

	if (!dev || (dev->flags & IFF_UP)) {
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	} else {
		sk->sk_err = ENETDOWN;
		if (!sock_flag(sk, SOCK_DEAD))
			sk->sk_error_report(sk);
	}

out_unlock:
	spin_unlock(&po->bind_lock);
	release_sock(sk);
	return 0;
}

/*
 *	Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
			    int addr_len)
{
	struct sock *sk = sock->sk;
	char name[15];
	struct net_device *dev;
	int err = -ENODEV;

	/*
	 *	Check legality
	 */

	if (addr_len != sizeof(struct sockaddr))
		return -EINVAL;
	strlcpy(name, uaddr->sa_data, sizeof(name));

	dev = dev_get_by_name(sock_net(sk), name);
	if (dev) {
		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
		dev_put(dev);
	}
	return err;
}

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
	struct sock *sk = sock->sk;
	struct net_device *dev = NULL;
	int err;


	/*
	 *	Check legality
	 */

	if (addr_len < sizeof(struct sockaddr_ll))
		return -EINVAL;
	if (sll->sll_family != AF_PACKET)
		return -EINVAL;

	if (sll->sll_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
		if (dev == NULL)
			goto out;
	}
	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
	if (dev)
		dev_put(dev);

out:
	return err;
}
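
/*
 * Userspace usage (illustrative sketch, not part of this file): the
 * packet_bind() path above serves a plain bind() on an AF_PACKET
 * socket, e.g.
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct sockaddr_ll sll = {
 *		.sll_family	= AF_PACKET,
 *		.sll_protocol	= htons(ETH_P_ALL),
 *		.sll_ifindex	= if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 *
 * A zero sll_protocol keeps the socket's existing protocol, and a zero
 * sll_ifindex binds to all devices, as handled in packet_do_bind().
 */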

static struct proto packet_proto = {
	.name	  = "PACKET",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct packet_sock),
};

/*
 *	Create a packet socket of type SOCK_PACKET.
 */

static int packet_create(struct net *net, struct socket *sock, int protocol,
			 int kern)
{
	struct sock *sk;
	struct packet_sock *po;
	__be16 proto = (__force __be16)protocol; /* weird, but documented */
	int err;

	if (!capable(CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
	    sock->type != SOCK_PACKET)
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
	if (sk == NULL)
		goto out;

	sock->ops = &packet_ops;
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;

	sock_init_data(sock, sk);

	po = pkt_sk(sk);
	sk->sk_family = PF_PACKET;
	po->num = proto;

	sk->sk_destruct = packet_sock_destruct;
	sk_refcnt_debug_inc(sk);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	mutex_init(&po->pg_vec_lock);
	po->prot_hook.func = packet_rcv;

	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;

	po->prot_hook.af_packet_priv = sk;

	if (proto) {
		po->prot_hook.type = proto;
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	}

	spin_lock_bh(&net->packet.sklist_lock);
	sk_add_node_rcu(sk, &net->packet.sklist);
	sock_prot_inuse_add(net, &packet_proto, 1);
	spin_unlock_bh(&net->packet.sklist_lock);

	return 0;
out:
	return err;
}

static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb, *skb2;
	int copied, err;

	err = -EAGAIN;
	skb = skb_dequeue(&sk->sk_error_queue);
	if (skb == NULL)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
		 sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

	/* Reset and regenerate socket error */
	spin_lock_bh(&sk->sk_error_queue.lock);
	sk->sk_err = 0;
	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
		spin_unlock_bh(&sk->sk_error_queue.lock);
		sk->sk_error_report(sk);
	} else
		spin_unlock_bh(&sk->sk_error_queue.lock);

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}

/*
 *	Pull a packet from our receive queue and hand it to the user.
 *	If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int copied, err;
	struct sockaddr_ll *sll;
	int vnet_hdr_len = 0;

	err = -EINVAL;
	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
		goto out;

#if 0
	/* What error should we return now? EUNATTACH? */
	if (pkt_sk(sk)->ifindex < 0)
		return -ENODEV;
#endif

	if (flags & MSG_ERRQUEUE) {
		err = packet_recv_error(sk, msg, len);
		goto out;
	}

	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	It will return ENETDOWN if the device has just gone down,
	 *	but then it will block.
	 */

	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

	/*
	 *	If an error occurred, return it. Because skb_recv_datagram()
	 *	handles the blocking, we don't need to see or worry about
	 *	blocking retries.
	 */

	if (skb == NULL)
		goto out;

	if (pkt_sk(sk)->has_vnet_hdr) {
		struct virtio_net_hdr vnet_hdr = { 0 };

		err = -EINVAL;
		vnet_hdr_len = sizeof(vnet_hdr);
		if (len < vnet_hdr_len)
			goto out_free;

		len -= vnet_hdr_len;

		if (skb_is_gso(skb)) {
			struct skb_shared_info *sinfo = skb_shinfo(skb);

			/* This is a hint as to how much should be linear. */
			vnet_hdr.hdr_len = skb_headlen(skb);
			vnet_hdr.gso_size = sinfo->gso_size;
			if (sinfo->gso_type & SKB_GSO_TCPV4)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
			else if (sinfo->gso_type & SKB_GSO_TCPV6)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
			else if (sinfo->gso_type & SKB_GSO_UDP)
				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
			else if (sinfo->gso_type & SKB_GSO_FCOE)
				goto out_free;
			else
				BUG();
			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
				vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
		} else
			vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;

		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
			vnet_hdr.csum_start = skb->csum_start -
							skb_headroom(skb);
			vnet_hdr.csum_offset = skb->csum_offset;
		} /* else everything is zero */

		err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
				     vnet_hdr_len);
		if (err < 0)
			goto out_free;
	}

	/*
	 *	If the address length field is there to be filled in, we fill
	 *	it in now.
	 */

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	if (sock->type == SOCK_PACKET)
		msg->msg_namelen = sizeof(struct sockaddr_pkt);
	else
		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);

	/*
	 *	You lose any data beyond the buffer you gave. If it worries a
	 *	user program they can ask the device for its MTU anyway.
	 */

	copied = skb->len;
	if (copied > len) {
		copied = len;
		msg->msg_flags |= MSG_TRUNC;
	}

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
	if (err)
		goto out_free;

	sock_recv_ts_and_drops(msg, sk, skb);

	if (msg->msg_name)
		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
		       msg->msg_namelen);

	if (pkt_sk(sk)->auxdata) {
		struct tpacket_auxdata aux;

		aux.tp_status = TP_STATUS_USER;
		if (skb->ip_summed == CHECKSUM_PARTIAL)
			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
		aux.tp_snaplen = skb->len;
		aux.tp_mac = 0;
		aux.tp_net = skb_network_offset(skb);
		aux.tp_vlan_tci = vlan_tx_tag_get(skb);

		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
	}

	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);

out_free:
	skb_free_datagram(sk, skb);
out:
	return err;
}

static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
			       int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk	= sock->sk;

	if (peer)
		return -EOPNOTSUPP;

	uaddr->sa_family = AF_PACKET;
	rcu_read_lock();
	dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
	if (dev)
		strncpy(uaddr->sa_data, dev->name, 14);
	else
		memset(uaddr->sa_data, 0, 14);
	rcu_read_unlock();
	*uaddr_len = sizeof(*uaddr);

	return 0;
}

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
			  int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);

	if (peer)
		return -EOPNOTSUPP;

	sll->sll_family = AF_PACKET;
	sll->sll_ifindex = po->ifindex;
	sll->sll_protocol = po->num;
	sll->sll_pkttype = 0;
	rcu_read_lock();
	dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
	if (dev) {
		sll->sll_hatype = dev->type;
		sll->sll_halen = dev->addr_len;
		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
	} else {
		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
		sll->sll_halen = 0;
	}
	rcu_read_unlock();
	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;

	return 0;
}

static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
			 int what)
{
	switch (i->type) {
	case PACKET_MR_MULTICAST:
		if (i->alen != dev->addr_len)
			return -EINVAL;
		if (what > 0)
			return dev_mc_add(dev, i->addr);
		else
			return dev_mc_del(dev, i->addr);
		break;
	case PACKET_MR_PROMISC:
		return dev_set_promiscuity(dev, what);
		break;
	case PACKET_MR_ALLMULTI:
		return dev_set_allmulti(dev, what);
		break;
	case PACKET_MR_UNICAST:
		if (i->alen != dev->addr_len)
			return -EINVAL;
		if (what > 0)
			return dev_uc_add(dev, i->addr);
		else
			return dev_uc_del(dev, i->addr);
		break;
	default:
		break;
	}
	return 0;
}

static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
{
	for ( ; i; i = i->next) {
		if (i->ifindex == dev->ifindex)
			packet_dev_mc(dev, i, what);
	}
}

static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml, *i;
	struct net_device *dev;
	int err;

	rtnl_lock();

	err = -ENODEV;
	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
	if (!dev)
		goto done;

	err = -EINVAL;
	if (mreq->mr_alen > dev->addr_len)
		goto done;

	err = -ENOBUFS;
	i = kmalloc(sizeof(*i), GFP_KERNEL);
	if (i == NULL)
		goto done;

	err = 0;
	for (ml = po->mclist; ml; ml = ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			ml->count++;
			/* Free the new element ... */
			kfree(i);
			goto done;
		}
	}

	i->type = mreq->mr_type;
	i->ifindex = mreq->mr_ifindex;
	i->alen = mreq->mr_alen;
	memcpy(i->addr, mreq->mr_address, i->alen);
	i->count = 1;
	i->next = po->mclist;
	po->mclist = i;
	err = packet_dev_mc(dev, i, 1);
	if (err) {
		po->mclist = i->next;
		kfree(i);
	}

done:
	rtnl_unlock();
	return err;
}

static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_mclist *ml, **mlp;

	rtnl_lock();

	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			if (--ml->count == 0) {
				struct net_device *dev;
				*mlp = ml->next;
				dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
				if (dev)
					packet_dev_mc(dev, ml, -1);
				kfree(ml);
			}
			rtnl_unlock();
			return 0;
		}
	}
	rtnl_unlock();
	return -EADDRNOTAVAIL;
}

static void packet_flush_mclist(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml;

	if (!po->mclist)
		return;

	rtnl_lock();
	while ((ml = po->mclist) != NULL) {
		struct net_device *dev;

		po->mclist = ml->next;
		dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
		if (dev != NULL)
			packet_dev_mc(dev, ml, -1);
		kfree(ml);
	}
	rtnl_unlock();
}
1da177e4
LT
1913
static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch (optname) {
	case PACKET_ADD_MEMBERSHIP:
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq_max mreq;
		int len = optlen;
		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))
			return -EINVAL;
		if (len > sizeof(mreq))
			len = sizeof(mreq);
		if (copy_from_user(&mreq, optval, len))
			return -EFAULT;
		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
			return -EINVAL;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}

	case PACKET_RX_RING:
	case PACKET_TX_RING:
	{
		struct tpacket_req req;

		if (optlen < sizeof(req))
			return -EINVAL;
		if (pkt_sk(sk)->has_vnet_hdr)
			return -EINVAL;
		if (copy_from_user(&req, optval, sizeof(req)))
			return -EFAULT;
		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
	}
	case PACKET_COPY_THRESH:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
	case PACKET_VERSION:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
		case TPACKET_V2:
			po->tp_version = val;
			return 0;
		default:
			return -EINVAL;
		}
	}
	case PACKET_RESERVE:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_reserve = val;
		return 0;
	}
	case PACKET_LOSS:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_loss = !!val;
		return 0;
	}
	case PACKET_AUXDATA:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->auxdata = !!val;
		return 0;
	}
	case PACKET_ORIGDEV:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->origdev = !!val;
		return 0;
	}
	case PACKET_VNET_HDR:
	{
		int val;

		if (sock->type != SOCK_RAW)
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->has_vnet_hdr = !!val;
		return 0;
	}
	case PACKET_TIMESTAMP:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->tp_tstamp = val;
		return 0;
	}
	default:
		return -ENOPROTOOPT;
	}
}
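
/*
 * Illustrative user-space sketch (not part of this file): the option
 * ordering enforced by the checks above.  PACKET_VERSION and
 * PACKET_RESERVE return -EBUSY once a ring exists, so they must be set
 * before PACKET_RX_RING.  The sizes below are example values only.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int setup_rx_ring(int fd)
{
	int version = TPACKET_V2;
	struct tpacket_req req;

	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
		       &version, sizeof(version)) < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.tp_block_size = 4096;	/* must be a multiple of PAGE_SIZE */
	req.tp_frame_size = 2048;	/* must be TPACKET_ALIGNMENT aligned */
	req.tp_block_nr = 64;
	req.tp_frame_nr = (req.tp_block_size / req.tp_frame_size)
			  * req.tp_block_nr;
	return setsockopt(fd, SOL_PACKET, PACKET_RX_RING,
			  &req, sizeof(req));
}
#endif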

static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	int len;
	int val;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	void *data;
	struct tpacket_stats st;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case PACKET_STATISTICS:
		if (len > sizeof(struct tpacket_stats))
			len = sizeof(struct tpacket_stats);
		spin_lock_bh(&sk->sk_receive_queue.lock);
		st = po->stats;
		memset(&po->stats, 0, sizeof(st));
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		st.tp_packets += st.tp_drops;

		data = &st;
		break;
	case PACKET_AUXDATA:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->auxdata;

		data = &val;
		break;
	case PACKET_ORIGDEV:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->origdev;

		data = &val;
		break;
	case PACKET_VNET_HDR:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->has_vnet_hdr;

		data = &val;
		break;
	case PACKET_VERSION:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->tp_version;
		data = &val;
		break;
	case PACKET_HDRLEN:
		if (len > sizeof(int))
			len = sizeof(int);
		if (copy_from_user(&val, optval, len))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
			val = sizeof(struct tpacket_hdr);
			break;
		case TPACKET_V2:
			val = sizeof(struct tpacket2_hdr);
			break;
		default:
			return -EINVAL;
		}
		data = &val;
		break;
	case PACKET_RESERVE:
		if (len > sizeof(unsigned int))
			len = sizeof(unsigned int);
		val = po->tp_reserve;
		data = &val;
		break;
	case PACKET_LOSS:
		if (len > sizeof(unsigned int))
			len = sizeof(unsigned int);
		val = po->tp_loss;
		data = &val;
		break;
	case PACKET_TIMESTAMP:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->tp_tstamp;
		data = &val;
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, data, len))
		return -EFAULT;
	return 0;
}
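
/*
 * Illustrative user-space sketch (not part of this file): reading the
 * statistics handled above.  Note that the kernel zeroes the counters
 * on each read and folds tp_drops into tp_packets before copying out.
 */
#if 0
#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void dump_stats(int fd)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
		printf("received %u, dropped %u\n",
		       st.tp_packets, st.tp_drops);
}
#endif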


static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
{
	struct sock *sk;
	struct hlist_node *node;
	struct net_device *dev = data;
	struct net *net = dev_net(dev);

	rcu_read_lock();
	sk_for_each_rcu(sk, node, &net->packet.sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
			if (po->mclist)
				packet_dev_mclist(dev, po->mclist, -1);
			/* fallthrough */

		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->running) {
					__dev_remove_pack(&po->prot_hook);
					__sock_put(sk);
					po->running = 0;
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					po->ifindex = -1;
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->num && !po->running) {
					dev_add_pack(&po->prot_hook);
					sock_hold(sk);
					po->running = 1;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		}
	}
	rcu_read_unlock();
	return NOTIFY_DONE;
}


static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
	case SIOCOUTQ:
	{
		int amount = sk_wmem_alloc_get(sk);

		return put_user(amount, (int __user *)arg);
	}
	case SIOCINQ:
	{
		struct sk_buff *skb;
		int amount = 0;

		spin_lock_bh(&sk->sk_receive_queue.lock);
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		return put_user(amount, (int __user *)arg);
	}
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);

#ifdef CONFIG_INET
	case SIOCADDRT:
	case SIOCDELRT:
	case SIOCDARP:
	case SIOCGARP:
	case SIOCSARP:
	case SIOCGIFADDR:
	case SIOCSIFADDR:
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCSIFFLAGS:
		return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
	return 0;
}
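
/*
 * Illustrative user-space sketch (not part of this file): SIOCINQ on a
 * packet socket reports the length of the next queued packet (the code
 * above peeks a single skb), not the total number of queued bytes.
 */
#if 0
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>

static void next_packet_len(int fd)
{
	int n = 0;

	if (ioctl(fd, SIOCINQ, &n) == 0)
		printf("next packet: %d bytes\n", n);
}
#endif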

static unsigned int packet_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->rx_ring.pg_vec) {
		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	spin_lock_bh(&sk->sk_write_queue.lock);
	if (po->tx_ring.pg_vec) {
		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
			mask |= POLLOUT | POLLWRNORM;
	}
	spin_unlock_bh(&sk->sk_write_queue.lock);
	return mask;
}
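
/*
 * Illustrative user-space sketch (not part of this file): waiting for
 * ring frames with poll().  POLLIN is raised by the code above when the
 * previous rx ring slot is no longer owned by the kernel.
 */
#if 0
#include <poll.h>

static int wait_for_frame(int fd)
{
	struct pollfd pfd = {
		.fd = fd,
		.events = POLLIN | POLLRDNORM,
	};

	return poll(&pfd, 1, -1);	/* block until a frame is ready */
}
#endif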


/* Dirty? Well, I still have not learned a better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&pkt_sk(sk)->mapped);
}

static const struct vm_operations_struct packet_mmap_ops = {
	.open	= packet_mm_open,
	.close	= packet_mm_close,
};

static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
			unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i].buffer)) {
			if (pg_vec[i].flags & PGV_FROM_VMALLOC)
				vfree(pg_vec[i].buffer);
			else
				free_pages((unsigned long)pg_vec[i].buffer,
					   order);
			pg_vec[i].buffer = NULL;
		}
	}
	kfree(pg_vec);
}

static inline char *alloc_one_pg_vec_page(unsigned long order,
					  unsigned char *flags)
{
	char *buffer = NULL;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

	buffer = (char *) __get_free_pages(gfp_flags, order);

	if (buffer)
		return buffer;

	/*
	 * __get_free_pages failed, fall back to vmalloc
	 */
	*flags |= PGV_FROM_VMALLOC;
	buffer = vmalloc((1 << order) * PAGE_SIZE);

	if (buffer)
		return buffer;

	/*
	 * vmalloc failed, let's dig into swap here
	 */
	*flags = 0;
	gfp_flags &= ~__GFP_NORETRY;
	buffer = (char *)__get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/*
	 * complete and utter failure
	 */
	return NULL;
}

static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
	unsigned int block_nr = req->tp_block_nr;
	struct pgv *pg_vec;
	int i;

	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
	if (unlikely(!pg_vec))
		goto out;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i].buffer = alloc_one_pg_vec_page(order,
							 &pg_vec[i].flags);
		if (unlikely(!pg_vec[i].buffer))
			goto out_free_pgvec;
	}

out:
	return pg_vec;

out_free_pgvec:
	/* free_pg_vec() already frees pg_vec itself; calling kfree() on
	 * it again here would be a double free.
	 */
	free_pg_vec(pg_vec, order, block_nr);
	pg_vec = NULL;
	goto out;
}

static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
			   int closing, int tx_ring)
{
	struct pgv *pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;
	__be16 num;
	int err;

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	err = -EBUSY;
	if (!closing) {
		if (atomic_read(&po->mapped))
			goto out;
		if (atomic_read(&rb->pending))
			goto out;
	}

	if (req->tp_block_nr) {
		/* Sanity tests and some calculations */
		err = -EBUSY;
		if (unlikely(rb->pg_vec))
			goto out;

		switch (po->tp_version) {
		case TPACKET_V1:
			po->tp_hdrlen = TPACKET_HDRLEN;
			break;
		case TPACKET_V2:
			po->tp_hdrlen = TPACKET2_HDRLEN;
			break;
		}

		err = -EINVAL;
		if (unlikely((int)req->tp_block_size <= 0))
			goto out;
		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
			goto out;
		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
					po->tp_reserve))
			goto out;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			goto out;

		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
		if (unlikely(rb->frames_per_block <= 0))
			goto out;
		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
					req->tp_frame_nr))
			goto out;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;
	}
	/* Done */
	else {
		err = -EINVAL;
		if (unlikely(req->tp_frame_nr))
			goto out;
	}

	lock_sock(sk);

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		__dev_remove_pack(&po->prot_hook);
		po->num = 0;
		po->running = 0;
		__sock_put(sk);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	mutex_lock(&po->pg_vec_lock);
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
		spin_lock_bh(&rb_queue->lock);
		pg_vec = XC(rb->pg_vec, pg_vec);
		rb->frame_max = (req->tp_frame_nr - 1);
		rb->head = 0;
		rb->frame_size = req->tp_frame_size;
		spin_unlock_bh(&rb_queue->lock);

		order = XC(rb->pg_vec_order, order);
		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);

		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
		skb_queue_purge(rb_queue);
#undef XC
		if (atomic_read(&po->mapped))
			pr_err("packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));
	}
	mutex_unlock(&po->pg_vec_lock);

	spin_lock(&po->bind_lock);
	if (was_running && !po->running) {
		sock_hold(sk);
		po->running = 1;
		po->num = num;
		dev_add_pack(&po->prot_hook);
	}
	spin_unlock(&po->bind_lock);

	release_sock(sk);

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}
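
/*
 * Illustrative user-space sketch (not part of this file): the geometry
 * invariants packet_set_ring() enforces, checked on the caller's side.
 * E.g. tp_block_size 4096 / tp_frame_size 2048 gives 2 frames per
 * block, so tp_frame_nr must be exactly 2 * tp_block_nr.
 */
#if 0
#include <unistd.h>
#include <linux/if_packet.h>

static int req_is_valid(const struct tpacket_req *req, unsigned int hdrlen)
{
	long page_size = sysconf(_SC_PAGESIZE);
	unsigned int fpb;

	if (req->tp_block_size % page_size)
		return 0;
	if (req->tp_frame_size < hdrlen ||
	    req->tp_frame_size % TPACKET_ALIGNMENT)
		return 0;
	fpb = req->tp_block_size / req->tp_frame_size;
	return fpb > 0 && fpb * req->tp_block_nr == req->tp_frame_nr;
}
#endif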

static int packet_mmap(struct file *file, struct socket *sock,
		       struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&po->pg_vec_lock);

	expected_size = 0;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec) {
			expected_size += rb->pg_vec_len
						* rb->pg_vec_pages
						* PAGE_SIZE;
		}
	}

	if (expected_size == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)
		goto out;

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)
			continue;

		for (i = 0; i < rb->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = rb->pg_vec[i].buffer;
			int pg_num;

			for (pg_num = 0; pg_num < rb->pg_vec_pages;
					pg_num++) {
				if (rb->pg_vec[i].flags & PGV_FROM_VMALLOC)
					page = vmalloc_to_page(kaddr);
				else
					page = virt_to_page(kaddr);

				err = vm_insert_page(vma, start, page);
				if (unlikely(err))
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
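
/*
 * Illustrative user-space sketch (not part of this file): mapping the
 * rings.  The code above rejects any non-zero offset and requires the
 * mapping length to equal the combined size of the rx and tx rings, so
 * both rings are mapped with a single call, rx first.
 */
#if 0
#include <stddef.h>
#include <sys/mman.h>

static void *map_rings(int fd, size_t rx_size, size_t tx_size)
{
	void *base = mmap(NULL, rx_size + tx_size,
			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	return base == MAP_FAILED ? NULL : base;
}
#endif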

static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner =	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call =	packet_notifier,
};

#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net = seq_file_net(seq);

	rcu_read_lock();
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   sock_i_uid(s),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif

static int __net_init packet_net_init(struct net *net)
{
	spin_lock_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
		return -ENOMEM;

	return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
	proc_net_remove(net, "packet");
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};


static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	sock_register(&packet_family_ops);
	register_pernet_subsys(&packet_net_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);