]> bbs.cooldavid.org Git - net-next-2.6.git/blame - drivers/net/macvtap.c
macvtap: rework object lifetime rules
[net-next-2.6.git] / drivers / net / macvtap.c
CommitLineData
20d29d7a
AB
1#include <linux/etherdevice.h>
2#include <linux/if_macvlan.h>
3#include <linux/interrupt.h>
4#include <linux/nsproxy.h>
5#include <linux/compat.h>
6#include <linux/if_tun.h>
7#include <linux/module.h>
8#include <linux/skbuff.h>
9#include <linux/cache.h>
10#include <linux/sched.h>
11#include <linux/types.h>
12#include <linux/init.h>
13#include <linux/wait.h>
14#include <linux/cdev.h>
15#include <linux/fs.h>
16
17#include <net/net_namespace.h>
18#include <net/rtnetlink.h>
19#include <net/sock.h>
20
21/*
22 * A macvtap queue is the central object of this driver, it connects
23 * an open character device to a macvlan interface. There can be
24 * multiple queues on one interface, which map back to queues
25 * implemented in hardware on the underlying device.
26 *
27 * macvtap_proto is used to allocate queues through the sock allocation
28 * mechanism.
29 *
30 * TODO: multiqueue support is currently not implemented, even though
31 * macvtap is basically prepared for that. We will need to add this
32 * here as well as in virtio-net and qemu to get line rate on 10gbit
33 * adapters from a guest.
34 */
35struct macvtap_queue {
36 struct sock sk;
37 struct socket sock;
38 struct macvlan_dev *vlan;
39 struct file *file;
40};
41
42static struct proto macvtap_proto = {
43 .name = "macvtap",
44 .owner = THIS_MODULE,
45 .obj_size = sizeof (struct macvtap_queue),
46};
47
48/*
49 * Minor number matches netdev->ifindex, so need a potentially
50 * large value. This also makes it possible to split the
51 * tap functionality out again in the future by offering it
52 * from other drivers besides macvtap. As long as every device
53 * only has one tap, the interface numbers assure that the
54 * device nodes are unique.
55 */
56static unsigned int macvtap_major;
57#define MACVTAP_NUM_DEVS 65536
58static struct class *macvtap_class;
59static struct cdev macvtap_cdev;
60
/*
 * RCU usage:
 * The macvtap_queue and the macvlan_dev are loosely coupled; the
 * pointers from one to the other can only be read while rcu_read_lock
 * or macvtap_lock is held.
 *
 * Both the file and the macvlan_dev hold a reference on the
 * macvtap_queue through sock_hold(&q->sk). When the macvlan_dev goes
 * away first, q->vlan becomes inaccessible. When the file gets closed,
 * macvtap_get_queue() fails.
 *
 * There may still be references to the struct sock inside of the
 * queue from outbound SKBs, but these never reference back to the
 * file or the dev. The data structure is freed through __sk_free
 * when both our references and any pending SKBs are gone.
 */
static DEFINE_SPINLOCK(macvtap_lock);
78
79/*
80 * Choose the next free queue, for now there is only one
81 */
82static int macvtap_set_queue(struct net_device *dev, struct file *file,
83 struct macvtap_queue *q)
84{
85 struct macvlan_dev *vlan = netdev_priv(dev);
86 int err = -EBUSY;
87
88 spin_lock(&macvtap_lock);
89 if (rcu_dereference(vlan->tap))
90 goto out;
91
92 err = 0;
02df55d2 93 rcu_assign_pointer(q->vlan, vlan);
20d29d7a 94 rcu_assign_pointer(vlan->tap, q);
02df55d2 95 sock_hold(&q->sk);
20d29d7a
AB
96
97 q->file = file;
02df55d2 98 file->private_data = q;
20d29d7a
AB
99
100out:
101 spin_unlock(&macvtap_lock);
102 return err;
103}
104
105/*
02df55d2
AB
106 * The file owning the queue got closed, give up both
107 * the reference that the files holds as well as the
108 * one from the macvlan_dev if that still exists.
20d29d7a
AB
109 *
110 * Using the spinlock makes sure that we don't get
111 * to the queue again after destroying it.
20d29d7a 112 */
02df55d2 113static void macvtap_put_queue(struct macvtap_queue *q)
20d29d7a 114{
02df55d2 115 struct macvlan_dev *vlan;
20d29d7a
AB
116
117 spin_lock(&macvtap_lock);
02df55d2
AB
118 vlan = rcu_dereference(q->vlan);
119 if (vlan) {
120 rcu_assign_pointer(vlan->tap, NULL);
121 rcu_assign_pointer(q->vlan, NULL);
122 sock_put(&q->sk);
20d29d7a
AB
123 }
124
20d29d7a
AB
125 spin_unlock(&macvtap_lock);
126
127 synchronize_rcu();
128 sock_put(&q->sk);
129}
130
131/*
132 * Since we only support one queue, just dereference the pointer.
133 */
134static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
135 struct sk_buff *skb)
136{
137 struct macvlan_dev *vlan = netdev_priv(dev);
138
139 return rcu_dereference(vlan->tap);
140}
141
02df55d2
AB
142/*
143 * The net_device is going away, give up the reference
144 * that it holds on the queue (all the queues one day)
145 * and safely set the pointer from the queues to NULL.
146 */
20d29d7a
AB
147static void macvtap_del_queues(struct net_device *dev)
148{
149 struct macvlan_dev *vlan = netdev_priv(dev);
564517e8 150 struct macvtap_queue *q;
02df55d2
AB
151
152 spin_lock(&macvtap_lock);
153 q = rcu_dereference(vlan->tap);
154 if (!q) {
155 spin_unlock(&macvtap_lock);
156 return;
564517e8 157 }
20d29d7a 158
02df55d2
AB
159 rcu_assign_pointer(vlan->tap, NULL);
160 rcu_assign_pointer(q->vlan, NULL);
161 spin_unlock(&macvtap_lock);
162
163 synchronize_rcu();
564517e8 164 sock_put(&q->sk);
20d29d7a
AB
165}
166
167/*
168 * Forward happens for data that gets sent from one macvlan
169 * endpoint to another one in bridge mode. We just take
170 * the skb and put it into the receive queue.
171 */
172static int macvtap_forward(struct net_device *dev, struct sk_buff *skb)
173{
174 struct macvtap_queue *q = macvtap_get_queue(dev, skb);
175 if (!q)
176 return -ENOLINK;
177
178 skb_queue_tail(&q->sk.sk_receive_queue, skb);
179 wake_up(q->sk.sk_sleep);
180 return 0;
181}
182
183/*
184 * Receive is for data from the external interface (lowerdev),
185 * in case of macvtap, we can treat that the same way as
186 * forward, which macvlan cannot.
187 */
188static int macvtap_receive(struct sk_buff *skb)
189{
190 skb_push(skb, ETH_HLEN);
191 return macvtap_forward(skb->dev, skb);
192}
193
194static int macvtap_newlink(struct net *src_net,
195 struct net_device *dev,
196 struct nlattr *tb[],
197 struct nlattr *data[])
198{
199 struct device *classdev;
200 dev_t devt;
201 int err;
202
203 err = macvlan_common_newlink(src_net, dev, tb, data,
204 macvtap_receive, macvtap_forward);
205 if (err)
206 goto out;
207
208 devt = MKDEV(MAJOR(macvtap_major), dev->ifindex);
209
210 classdev = device_create(macvtap_class, &dev->dev, devt,
211 dev, "tap%d", dev->ifindex);
212 if (IS_ERR(classdev)) {
213 err = PTR_ERR(classdev);
214 macvtap_del_queues(dev);
215 }
216
217out:
218 return err;
219}
220
221static void macvtap_dellink(struct net_device *dev,
222 struct list_head *head)
223{
224 device_destroy(macvtap_class,
225 MKDEV(MAJOR(macvtap_major), dev->ifindex));
226
227 macvtap_del_queues(dev);
228 macvlan_dellink(dev, head);
229}
230
231static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
232 .kind = "macvtap",
233 .newlink = macvtap_newlink,
234 .dellink = macvtap_dellink,
235};
236
237
238static void macvtap_sock_write_space(struct sock *sk)
239{
240 if (!sock_writeable(sk) ||
241 !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
242 return;
243
244 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
245 wake_up_interruptible_sync(sk->sk_sleep);
246}
247
248static int macvtap_open(struct inode *inode, struct file *file)
249{
250 struct net *net = current->nsproxy->net_ns;
251 struct net_device *dev = dev_get_by_index(net, iminor(inode));
252 struct macvtap_queue *q;
253 int err;
254
255 err = -ENODEV;
256 if (!dev)
257 goto out;
258
259 /* check if this is a macvtap device */
260 err = -EINVAL;
261 if (dev->rtnl_link_ops != &macvtap_link_ops)
262 goto out;
263
264 err = -ENOMEM;
265 q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
266 &macvtap_proto);
267 if (!q)
268 goto out;
269
270 init_waitqueue_head(&q->sock.wait);
271 q->sock.type = SOCK_RAW;
272 q->sock.state = SS_CONNECTED;
273 sock_init_data(&q->sock, &q->sk);
20d29d7a
AB
274 q->sk.sk_write_space = macvtap_sock_write_space;
275
276 err = macvtap_set_queue(dev, file, q);
277 if (err)
278 sock_put(&q->sk);
279
280out:
281 if (dev)
282 dev_put(dev);
283
284 return err;
285}
286
287static int macvtap_release(struct inode *inode, struct file *file)
288{
02df55d2
AB
289 struct macvtap_queue *q = file->private_data;
290 macvtap_put_queue(q);
20d29d7a
AB
291 return 0;
292}
293
294static unsigned int macvtap_poll(struct file *file, poll_table * wait)
295{
02df55d2 296 struct macvtap_queue *q = file->private_data;
20d29d7a
AB
297 unsigned int mask = POLLERR;
298
299 if (!q)
300 goto out;
301
302 mask = 0;
303 poll_wait(file, &q->sock.wait, wait);
304
305 if (!skb_queue_empty(&q->sk.sk_receive_queue))
306 mask |= POLLIN | POLLRDNORM;
307
308 if (sock_writeable(&q->sk) ||
309 (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) &&
310 sock_writeable(&q->sk)))
311 mask |= POLLOUT | POLLWRNORM;
312
313out:
20d29d7a
AB
314 return mask;
315}
316
317/* Get packet from user space buffer */
318static ssize_t macvtap_get_user(struct macvtap_queue *q,
319 const struct iovec *iv, size_t count,
320 int noblock)
321{
322 struct sk_buff *skb;
02df55d2 323 struct macvlan_dev *vlan;
20d29d7a
AB
324 size_t len = count;
325 int err;
326
327 if (unlikely(len < ETH_HLEN))
328 return -EINVAL;
329
330 skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err);
02df55d2
AB
331 if (!skb)
332 goto err;
20d29d7a
AB
333
334 skb_reserve(skb, NET_IP_ALIGN);
335 skb_put(skb, count);
336
02df55d2
AB
337 err = skb_copy_datagram_from_iovec(skb, 0, iv, 0, len);
338 if (err)
339 goto err;
20d29d7a
AB
340
341 skb_set_network_header(skb, ETH_HLEN);
02df55d2
AB
342 rcu_read_lock_bh();
343 vlan = rcu_dereference(q->vlan);
344 if (vlan)
345 macvlan_start_xmit(skb, vlan->dev);
346 else
347 kfree_skb(skb);
348 rcu_read_unlock_bh();
20d29d7a
AB
349
350 return count;
02df55d2
AB
351
352err:
353 rcu_read_lock_bh();
354 vlan = rcu_dereference(q->vlan);
355 if (vlan)
356 macvlan_count_rx(q->vlan, 0, false, false);
357 rcu_read_unlock_bh();
358
359 kfree_skb(skb);
360
361 return err;
20d29d7a
AB
362}
363
364static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
365 unsigned long count, loff_t pos)
366{
367 struct file *file = iocb->ki_filp;
368 ssize_t result = -ENOLINK;
02df55d2 369 struct macvtap_queue *q = file->private_data;
20d29d7a
AB
370
371 result = macvtap_get_user(q, iv, iov_length(iv, count),
372 file->f_flags & O_NONBLOCK);
20d29d7a
AB
373 return result;
374}
375
376/* Put packet to the user space buffer */
377static ssize_t macvtap_put_user(struct macvtap_queue *q,
378 const struct sk_buff *skb,
379 const struct iovec *iv, int len)
380{
02df55d2 381 struct macvlan_dev *vlan;
20d29d7a
AB
382 int ret;
383
384 len = min_t(int, skb->len, len);
385
386 ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len);
387
02df55d2
AB
388 rcu_read_lock_bh();
389 vlan = rcu_dereference(q->vlan);
20d29d7a 390 macvlan_count_rx(vlan, len, ret == 0, 0);
02df55d2 391 rcu_read_unlock_bh();
20d29d7a
AB
392
393 return ret ? ret : len;
394}
395
396static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
397 unsigned long count, loff_t pos)
398{
399 struct file *file = iocb->ki_filp;
02df55d2 400 struct macvtap_queue *q = file->private_data;
20d29d7a
AB
401
402 DECLARE_WAITQUEUE(wait, current);
403 struct sk_buff *skb;
404 ssize_t len, ret = 0;
405
02df55d2
AB
406 if (!q) {
407 ret = -ENOLINK;
408 goto out;
409 }
20d29d7a
AB
410
411 len = iov_length(iv, count);
412 if (len < 0) {
413 ret = -EINVAL;
414 goto out;
415 }
416
417 add_wait_queue(q->sk.sk_sleep, &wait);
418 while (len) {
419 current->state = TASK_INTERRUPTIBLE;
420
421 /* Read frames from the queue */
422 skb = skb_dequeue(&q->sk.sk_receive_queue);
423 if (!skb) {
424 if (file->f_flags & O_NONBLOCK) {
425 ret = -EAGAIN;
426 break;
427 }
428 if (signal_pending(current)) {
429 ret = -ERESTARTSYS;
430 break;
431 }
432 /* Nothing to read, let's sleep */
433 schedule();
434 continue;
435 }
436 ret = macvtap_put_user(q, skb, iv, len);
437 kfree_skb(skb);
438 break;
439 }
440
441 current->state = TASK_RUNNING;
442 remove_wait_queue(q->sk.sk_sleep, &wait);
443
444out:
20d29d7a
AB
445 return ret;
446}
447
448/*
449 * provide compatibility with generic tun/tap interface
450 */
451static long macvtap_ioctl(struct file *file, unsigned int cmd,
452 unsigned long arg)
453{
02df55d2
AB
454 struct macvtap_queue *q = file->private_data;
455 struct macvlan_dev *vlan;
20d29d7a
AB
456 void __user *argp = (void __user *)arg;
457 struct ifreq __user *ifr = argp;
458 unsigned int __user *up = argp;
459 unsigned int u;
02df55d2 460 int ret;
20d29d7a
AB
461
462 switch (cmd) {
463 case TUNSETIFF:
464 /* ignore the name, just look at flags */
465 if (get_user(u, &ifr->ifr_flags))
466 return -EFAULT;
467 if (u != (IFF_TAP | IFF_NO_PI))
468 return -EINVAL;
469 return 0;
470
471 case TUNGETIFF:
02df55d2
AB
472 rcu_read_lock_bh();
473 vlan = rcu_dereference(q->vlan);
474 if (vlan)
475 dev_hold(vlan->dev);
476 rcu_read_unlock_bh();
477
478 if (!vlan)
20d29d7a 479 return -ENOLINK;
20d29d7a 480
02df55d2 481 ret = 0;
20d29d7a
AB
482 if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) ||
483 put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags))
02df55d2
AB
484 ret = -EFAULT;
485 dev_put(vlan->dev);
486 return ret;
20d29d7a
AB
487
488 case TUNGETFEATURES:
489 if (put_user((IFF_TAP | IFF_NO_PI), up))
490 return -EFAULT;
491 return 0;
492
493 case TUNSETSNDBUF:
494 if (get_user(u, up))
495 return -EFAULT;
496
20d29d7a 497 q->sk.sk_sndbuf = u;
20d29d7a
AB
498 return 0;
499
500 case TUNSETOFFLOAD:
501 /* let the user check for future flags */
502 if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
503 TUN_F_TSO_ECN | TUN_F_UFO))
504 return -EINVAL;
505
506 /* TODO: add support for these, so far we don't
507 support any offload */
508 if (arg & (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
509 TUN_F_TSO_ECN | TUN_F_UFO))
510 return -EINVAL;
511
512 return 0;
513
514 default:
515 return -EINVAL;
516 }
517}
518
#ifdef CONFIG_COMPAT
/*
 * compat_ioctl() handler: convert the argument with compat_ptr()
 * and forward everything to macvtap_ioctl().
 */
static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
				 unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return macvtap_ioctl(file, cmd, native_arg);
}
#endif
526
527static const struct file_operations macvtap_fops = {
528 .owner = THIS_MODULE,
529 .open = macvtap_open,
530 .release = macvtap_release,
531 .aio_read = macvtap_aio_read,
532 .aio_write = macvtap_aio_write,
533 .poll = macvtap_poll,
534 .llseek = no_llseek,
535 .unlocked_ioctl = macvtap_ioctl,
536#ifdef CONFIG_COMPAT
537 .compat_ioctl = macvtap_compat_ioctl,
538#endif
539};
540
541static int macvtap_init(void)
542{
543 int err;
544
545 err = alloc_chrdev_region(&macvtap_major, 0,
546 MACVTAP_NUM_DEVS, "macvtap");
547 if (err)
548 goto out1;
549
550 cdev_init(&macvtap_cdev, &macvtap_fops);
551 err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
552 if (err)
553 goto out2;
554
555 macvtap_class = class_create(THIS_MODULE, "macvtap");
556 if (IS_ERR(macvtap_class)) {
557 err = PTR_ERR(macvtap_class);
558 goto out3;
559 }
560
561 err = macvlan_link_register(&macvtap_link_ops);
562 if (err)
563 goto out4;
564
565 return 0;
566
567out4:
568 class_unregister(macvtap_class);
569out3:
570 cdev_del(&macvtap_cdev);
571out2:
572 unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
573out1:
574 return err;
575}
576module_init(macvtap_init);
577
578static void macvtap_exit(void)
579{
580 rtnl_link_unregister(&macvtap_link_ops);
581 class_unregister(macvtap_class);
582 cdev_del(&macvtap_cdev);
583 unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
584}
585module_exit(macvtap_exit);
586
587MODULE_ALIAS_RTNL_LINK("macvtap");
588MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
589MODULE_LICENSE("GPL");