]> bbs.cooldavid.org Git - net-next-2.6.git/blame - drivers/net/macvtap.c
net/macvtap: fix reference counting
[net-next-2.6.git] / drivers / net / macvtap.c
CommitLineData
20d29d7a
AB
1#include <linux/etherdevice.h>
2#include <linux/if_macvlan.h>
3#include <linux/interrupt.h>
4#include <linux/nsproxy.h>
5#include <linux/compat.h>
6#include <linux/if_tun.h>
7#include <linux/module.h>
8#include <linux/skbuff.h>
9#include <linux/cache.h>
10#include <linux/sched.h>
11#include <linux/types.h>
12#include <linux/init.h>
13#include <linux/wait.h>
14#include <linux/cdev.h>
15#include <linux/fs.h>
16
17#include <net/net_namespace.h>
18#include <net/rtnetlink.h>
19#include <net/sock.h>
20
21/*
22 * A macvtap queue is the central object of this driver, it connects
23 * an open character device to a macvlan interface. There can be
24 * multiple queues on one interface, which map back to queues
25 * implemented in hardware on the underlying device.
26 *
27 * macvtap_proto is used to allocate queues through the sock allocation
28 * mechanism.
29 *
30 * TODO: multiqueue support is currently not implemented, even though
31 * macvtap is basically prepared for that. We will need to add this
32 * here as well as in virtio-net and qemu to get line rate on 10gbit
33 * adapters from a guest.
34 */
35struct macvtap_queue {
36 struct sock sk;
37 struct socket sock;
38 struct macvlan_dev *vlan;
39 struct file *file;
40};
41
42static struct proto macvtap_proto = {
43 .name = "macvtap",
44 .owner = THIS_MODULE,
45 .obj_size = sizeof (struct macvtap_queue),
46};
47
48/*
49 * Minor number matches netdev->ifindex, so need a potentially
50 * large value. This also makes it possible to split the
51 * tap functionality out again in the future by offering it
52 * from other drivers besides macvtap. As long as every device
53 * only has one tap, the interface numbers assure that the
54 * device nodes are unique.
55 */
56static unsigned int macvtap_major;
57#define MACVTAP_NUM_DEVS 65536
58static struct class *macvtap_class;
59static struct cdev macvtap_cdev;
60
61/*
62 * RCU usage:
63 * The macvtap_queue is referenced both from the chardev struct file
64 * and from the struct macvlan_dev using rcu_read_lock.
65 *
66 * We never actually update the contents of a macvtap_queue atomically
67 * with RCU but it is used for race-free destruction of a queue when
68 * either the file or the macvlan_dev goes away. Pointers back to
69 * the dev and the file are implicitly valid as long as the queue
70 * exists.
71 *
72 * The callbacks from macvlan are always done with rcu_read_lock held
564517e8
AB
73 * already. For calls from file_operations, we use the rcu_read_lock_bh
74 * to get a reference count on the socket and the device.
20d29d7a
AB
75 *
76 * When destroying a queue, we remove the pointers from the file and
77 * from the dev and then synchronize_rcu to make sure no thread is
78 * still using the queue. There may still be references to the struct
79 * sock inside of the queue from outbound SKBs, but these never
80 * reference back to the file or the dev. The data structure is freed
81 * through __sk_free when both our references and any pending SKBs
82 * are gone.
83 *
84 * macvtap_lock is only used to prevent multiple concurrent open()
85 * calls to assign a new vlan->tap pointer. It could be moved into
86 * the macvlan_dev itself but is extremely rarely used.
87 */
/* Taken by macvtap_set_queue() and macvtap_del_queue(). */
static DEFINE_SPINLOCK(macvtap_lock);
89
90/*
91 * Choose the next free queue, for now there is only one
92 */
93static int macvtap_set_queue(struct net_device *dev, struct file *file,
94 struct macvtap_queue *q)
95{
96 struct macvlan_dev *vlan = netdev_priv(dev);
97 int err = -EBUSY;
98
99 spin_lock(&macvtap_lock);
100 if (rcu_dereference(vlan->tap))
101 goto out;
102
103 err = 0;
104 q->vlan = vlan;
105 rcu_assign_pointer(vlan->tap, q);
106
107 q->file = file;
108 rcu_assign_pointer(file->private_data, q);
109
110out:
111 spin_unlock(&macvtap_lock);
112 return err;
113}
114
115/*
116 * We must destroy each queue exactly once, when either
117 * the netdev or the file go away.
118 *
119 * Using the spinlock makes sure that we don't get
120 * to the queue again after destroying it.
121 *
122 * synchronize_rcu serializes with the packet flow
123 * that uses rcu_read_lock.
124 */
125static void macvtap_del_queue(struct macvtap_queue **qp)
126{
127 struct macvtap_queue *q;
128
129 spin_lock(&macvtap_lock);
130 q = rcu_dereference(*qp);
131 if (!q) {
132 spin_unlock(&macvtap_lock);
133 return;
134 }
135
136 rcu_assign_pointer(q->vlan->tap, NULL);
137 rcu_assign_pointer(q->file->private_data, NULL);
138 spin_unlock(&macvtap_lock);
139
140 synchronize_rcu();
141 sock_put(&q->sk);
142}
143
144/*
145 * Since we only support one queue, just dereference the pointer.
146 */
147static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
148 struct sk_buff *skb)
149{
150 struct macvlan_dev *vlan = netdev_priv(dev);
151
152 return rcu_dereference(vlan->tap);
153}
154
155static void macvtap_del_queues(struct net_device *dev)
156{
157 struct macvlan_dev *vlan = netdev_priv(dev);
158 macvtap_del_queue(&vlan->tap);
159}
160
161static inline struct macvtap_queue *macvtap_file_get_queue(struct file *file)
162{
564517e8 163 struct macvtap_queue *q;
20d29d7a 164 rcu_read_lock_bh();
564517e8
AB
165 q = rcu_dereference(file->private_data);
166 if (q) {
167 sock_hold(&q->sk);
168 dev_hold(q->vlan->dev);
169 }
170 rcu_read_unlock_bh();
171 return q;
20d29d7a
AB
172}
173
564517e8 174static inline void macvtap_file_put_queue(struct macvtap_queue *q)
20d29d7a 175{
564517e8
AB
176 sock_put(&q->sk);
177 dev_put(q->vlan->dev);
20d29d7a
AB
178}
179
180/*
181 * Forward happens for data that gets sent from one macvlan
182 * endpoint to another one in bridge mode. We just take
183 * the skb and put it into the receive queue.
184 */
185static int macvtap_forward(struct net_device *dev, struct sk_buff *skb)
186{
187 struct macvtap_queue *q = macvtap_get_queue(dev, skb);
188 if (!q)
189 return -ENOLINK;
190
191 skb_queue_tail(&q->sk.sk_receive_queue, skb);
192 wake_up(q->sk.sk_sleep);
193 return 0;
194}
195
196/*
197 * Receive is for data from the external interface (lowerdev),
198 * in case of macvtap, we can treat that the same way as
199 * forward, which macvlan cannot.
200 */
201static int macvtap_receive(struct sk_buff *skb)
202{
203 skb_push(skb, ETH_HLEN);
204 return macvtap_forward(skb->dev, skb);
205}
206
207static int macvtap_newlink(struct net *src_net,
208 struct net_device *dev,
209 struct nlattr *tb[],
210 struct nlattr *data[])
211{
212 struct device *classdev;
213 dev_t devt;
214 int err;
215
216 err = macvlan_common_newlink(src_net, dev, tb, data,
217 macvtap_receive, macvtap_forward);
218 if (err)
219 goto out;
220
221 devt = MKDEV(MAJOR(macvtap_major), dev->ifindex);
222
223 classdev = device_create(macvtap_class, &dev->dev, devt,
224 dev, "tap%d", dev->ifindex);
225 if (IS_ERR(classdev)) {
226 err = PTR_ERR(classdev);
227 macvtap_del_queues(dev);
228 }
229
230out:
231 return err;
232}
233
234static void macvtap_dellink(struct net_device *dev,
235 struct list_head *head)
236{
237 device_destroy(macvtap_class,
238 MKDEV(MAJOR(macvtap_major), dev->ifindex));
239
240 macvtap_del_queues(dev);
241 macvlan_dellink(dev, head);
242}
243
244static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
245 .kind = "macvtap",
246 .newlink = macvtap_newlink,
247 .dellink = macvtap_dellink,
248};
249
250
251static void macvtap_sock_write_space(struct sock *sk)
252{
253 if (!sock_writeable(sk) ||
254 !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
255 return;
256
257 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
258 wake_up_interruptible_sync(sk->sk_sleep);
259}
260
261static int macvtap_open(struct inode *inode, struct file *file)
262{
263 struct net *net = current->nsproxy->net_ns;
264 struct net_device *dev = dev_get_by_index(net, iminor(inode));
265 struct macvtap_queue *q;
266 int err;
267
268 err = -ENODEV;
269 if (!dev)
270 goto out;
271
272 /* check if this is a macvtap device */
273 err = -EINVAL;
274 if (dev->rtnl_link_ops != &macvtap_link_ops)
275 goto out;
276
277 err = -ENOMEM;
278 q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
279 &macvtap_proto);
280 if (!q)
281 goto out;
282
283 init_waitqueue_head(&q->sock.wait);
284 q->sock.type = SOCK_RAW;
285 q->sock.state = SS_CONNECTED;
286 sock_init_data(&q->sock, &q->sk);
287 q->sk.sk_allocation = GFP_ATOMIC; /* for now */
288 q->sk.sk_write_space = macvtap_sock_write_space;
289
290 err = macvtap_set_queue(dev, file, q);
291 if (err)
292 sock_put(&q->sk);
293
294out:
295 if (dev)
296 dev_put(dev);
297
298 return err;
299}
300
301static int macvtap_release(struct inode *inode, struct file *file)
302{
303 macvtap_del_queue((struct macvtap_queue **)&file->private_data);
304 return 0;
305}
306
307static unsigned int macvtap_poll(struct file *file, poll_table * wait)
308{
309 struct macvtap_queue *q = macvtap_file_get_queue(file);
310 unsigned int mask = POLLERR;
311
312 if (!q)
313 goto out;
314
315 mask = 0;
316 poll_wait(file, &q->sock.wait, wait);
317
318 if (!skb_queue_empty(&q->sk.sk_receive_queue))
319 mask |= POLLIN | POLLRDNORM;
320
321 if (sock_writeable(&q->sk) ||
322 (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) &&
323 sock_writeable(&q->sk)))
324 mask |= POLLOUT | POLLWRNORM;
325
564517e8 326 macvtap_file_put_queue(q);
20d29d7a 327out:
20d29d7a
AB
328 return mask;
329}
330
331/* Get packet from user space buffer */
332static ssize_t macvtap_get_user(struct macvtap_queue *q,
333 const struct iovec *iv, size_t count,
334 int noblock)
335{
336 struct sk_buff *skb;
337 size_t len = count;
338 int err;
339
340 if (unlikely(len < ETH_HLEN))
341 return -EINVAL;
342
343 skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err);
344
345 if (!skb) {
346 macvlan_count_rx(q->vlan, 0, false, false);
347 return err;
348 }
349
350 skb_reserve(skb, NET_IP_ALIGN);
351 skb_put(skb, count);
352
353 if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) {
354 macvlan_count_rx(q->vlan, 0, false, false);
355 kfree_skb(skb);
356 return -EFAULT;
357 }
358
359 skb_set_network_header(skb, ETH_HLEN);
360
361 macvlan_start_xmit(skb, q->vlan->dev);
362
363 return count;
364}
365
366static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
367 unsigned long count, loff_t pos)
368{
369 struct file *file = iocb->ki_filp;
370 ssize_t result = -ENOLINK;
371 struct macvtap_queue *q = macvtap_file_get_queue(file);
372
373 if (!q)
374 goto out;
375
376 result = macvtap_get_user(q, iv, iov_length(iv, count),
377 file->f_flags & O_NONBLOCK);
564517e8 378 macvtap_file_put_queue(q);
20d29d7a 379out:
20d29d7a
AB
380 return result;
381}
382
383/* Put packet to the user space buffer */
384static ssize_t macvtap_put_user(struct macvtap_queue *q,
385 const struct sk_buff *skb,
386 const struct iovec *iv, int len)
387{
388 struct macvlan_dev *vlan = q->vlan;
389 int ret;
390
391 len = min_t(int, skb->len, len);
392
393 ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len);
394
395 macvlan_count_rx(vlan, len, ret == 0, 0);
396
397 return ret ? ret : len;
398}
399
400static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
401 unsigned long count, loff_t pos)
402{
403 struct file *file = iocb->ki_filp;
404 struct macvtap_queue *q = macvtap_file_get_queue(file);
405
406 DECLARE_WAITQUEUE(wait, current);
407 struct sk_buff *skb;
408 ssize_t len, ret = 0;
409
564517e8
AB
410 if (!q)
411 return -ENOLINK;
20d29d7a
AB
412
413 len = iov_length(iv, count);
414 if (len < 0) {
415 ret = -EINVAL;
416 goto out;
417 }
418
419 add_wait_queue(q->sk.sk_sleep, &wait);
420 while (len) {
421 current->state = TASK_INTERRUPTIBLE;
422
423 /* Read frames from the queue */
424 skb = skb_dequeue(&q->sk.sk_receive_queue);
425 if (!skb) {
426 if (file->f_flags & O_NONBLOCK) {
427 ret = -EAGAIN;
428 break;
429 }
430 if (signal_pending(current)) {
431 ret = -ERESTARTSYS;
432 break;
433 }
434 /* Nothing to read, let's sleep */
435 schedule();
436 continue;
437 }
438 ret = macvtap_put_user(q, skb, iv, len);
439 kfree_skb(skb);
440 break;
441 }
442
443 current->state = TASK_RUNNING;
444 remove_wait_queue(q->sk.sk_sleep, &wait);
445
446out:
564517e8 447 macvtap_file_put_queue(q);
20d29d7a
AB
448 return ret;
449}
450
451/*
452 * provide compatibility with generic tun/tap interface
453 */
454static long macvtap_ioctl(struct file *file, unsigned int cmd,
455 unsigned long arg)
456{
457 struct macvtap_queue *q;
458 void __user *argp = (void __user *)arg;
459 struct ifreq __user *ifr = argp;
460 unsigned int __user *up = argp;
461 unsigned int u;
462 char devname[IFNAMSIZ];
463
464 switch (cmd) {
465 case TUNSETIFF:
466 /* ignore the name, just look at flags */
467 if (get_user(u, &ifr->ifr_flags))
468 return -EFAULT;
469 if (u != (IFF_TAP | IFF_NO_PI))
470 return -EINVAL;
471 return 0;
472
473 case TUNGETIFF:
474 q = macvtap_file_get_queue(file);
475 if (!q)
476 return -ENOLINK;
477 memcpy(devname, q->vlan->dev->name, sizeof(devname));
564517e8 478 macvtap_file_put_queue(q);
20d29d7a
AB
479
480 if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) ||
481 put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags))
482 return -EFAULT;
483 return 0;
484
485 case TUNGETFEATURES:
486 if (put_user((IFF_TAP | IFF_NO_PI), up))
487 return -EFAULT;
488 return 0;
489
490 case TUNSETSNDBUF:
491 if (get_user(u, up))
492 return -EFAULT;
493
494 q = macvtap_file_get_queue(file);
564517e8
AB
495 if (!q)
496 return -ENOLINK;
20d29d7a 497 q->sk.sk_sndbuf = u;
564517e8 498 macvtap_file_put_queue(q);
20d29d7a
AB
499 return 0;
500
501 case TUNSETOFFLOAD:
502 /* let the user check for future flags */
503 if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
504 TUN_F_TSO_ECN | TUN_F_UFO))
505 return -EINVAL;
506
507 /* TODO: add support for these, so far we don't
508 support any offload */
509 if (arg & (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
510 TUN_F_TSO_ECN | TUN_F_UFO))
511 return -EINVAL;
512
513 return 0;
514
515 default:
516 return -EINVAL;
517 }
518}
519
#ifdef CONFIG_COMPAT
/*
 * compat ioctl entry point: convert the argument with compat_ptr() and
 * reuse the native handler.
 */
static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
				 unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return macvtap_ioctl(file, cmd, native_arg);
}
#endif
527
528static const struct file_operations macvtap_fops = {
529 .owner = THIS_MODULE,
530 .open = macvtap_open,
531 .release = macvtap_release,
532 .aio_read = macvtap_aio_read,
533 .aio_write = macvtap_aio_write,
534 .poll = macvtap_poll,
535 .llseek = no_llseek,
536 .unlocked_ioctl = macvtap_ioctl,
537#ifdef CONFIG_COMPAT
538 .compat_ioctl = macvtap_compat_ioctl,
539#endif
540};
541
542static int macvtap_init(void)
543{
544 int err;
545
546 err = alloc_chrdev_region(&macvtap_major, 0,
547 MACVTAP_NUM_DEVS, "macvtap");
548 if (err)
549 goto out1;
550
551 cdev_init(&macvtap_cdev, &macvtap_fops);
552 err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
553 if (err)
554 goto out2;
555
556 macvtap_class = class_create(THIS_MODULE, "macvtap");
557 if (IS_ERR(macvtap_class)) {
558 err = PTR_ERR(macvtap_class);
559 goto out3;
560 }
561
562 err = macvlan_link_register(&macvtap_link_ops);
563 if (err)
564 goto out4;
565
566 return 0;
567
568out4:
569 class_unregister(macvtap_class);
570out3:
571 cdev_del(&macvtap_cdev);
572out2:
573 unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
574out1:
575 return err;
576}
577module_init(macvtap_init);
578
579static void macvtap_exit(void)
580{
581 rtnl_link_unregister(&macvtap_link_ops);
582 class_unregister(macvtap_class);
583 cdev_del(&macvtap_cdev);
584 unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
585}
586module_exit(macvtap_exit);
587
588MODULE_ALIAS_RTNL_LINK("macvtap");
589MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
590MODULE_LICENSE("GPL");