/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *	Paul Rusty Russell	:	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <linux/if_bridge.h>
#include <linux/if_macvlan.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/wext.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <linux/pci.h>

#include "net-sysfs.h"
/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
/*
 *	The list of packet types we will receive (as opposed to discard)
 *	and the routines to invoke.
 *
 *	Why 16. Because with 16 the only overlap we get on a hash of the
 *	low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *	NOTE:  That is no longer true with the addition of VLAN tags.  Not
 *		sure which should go first, but I bet it won't make much
 *		difference if we are running VLANs.  The good news is that
 *		this protocol won't be in the list unless compiled in, so
 *		the average user (w/out VLANs) will not be adversely affected.
 */

#define PTYPE_HASH_SIZE	(16)
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;	/* Taps */
/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
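
/*
 * Illustrative sketch (not part of the original file): the pure-reader
 * pattern described above. The function name is hypothetical; a reader
 * may equally use rcu_read_lock() with the _rcu list iterators.
 */
#if 0
static int example_count_devices(struct net *net)
{
	struct net_device *dev;
	int count = 0;

	read_lock(&dev_base_lock);	/* pure reader, see comment above */
	for_each_netdev(net, dev)
		count++;
	read_unlock(&dev_base_lock);
	return count;
}
#endif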
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}
/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);
	return 0;
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);
}
static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
	 ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
	 "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif
/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if a protocol handler that mangles packets is
 *	first on the list, it cannot sense that the packet is cloned and
 *	should be copied-on-write; it will change the packet and subsequent
 *	readers will get a broken packet.
 */
/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot guarantee that all
 *	CPUs that are in the middle of receiving packets will see the new
 *	packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	int hash;

	spin_lock_bh(&ptype_lock);
	if (pt->type == htons(ETH_P_ALL))
		list_add_rcu(&pt->list, &ptype_all);
	else {
		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
		list_add_rcu(&pt->list, &ptype_base[hash]);
	}
	spin_unlock_bh(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
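
/*
 * Illustrative sketch (not part of the original file): a minimal
 * packet_type for IPv4 traffic. The handler and variable names are
 * hypothetical; a real handler must consume or free the skb.
 */
#if 0
static int example_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	/* ... inspect skb ... */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_ptype __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),
	.func = example_rcv,
};

/* dev_add_pack(&example_ptype); later: dev_remove_pack(&example_ptype); */
#endif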
/**
 *	__dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head;
	struct packet_type *pt1;

	spin_lock_bh(&ptype_lock);

	if (pt->type == htons(ETH_P_ALL))
		head = &ptype_all;
	else
		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
out:
	spin_unlock_bh(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);
/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
/*******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine to
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}
/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq	= s[i].map.irq;
			dev->base_addr	= s[i].map.base_addr;
			dev->mem_start	= s[i].map.mem_start;
			dev->mem_end	= s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);
/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}
/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
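
/*
 * Illustrative example (values hypothetical): with the parsing above, a
 * kernel command line entry such as
 *
 *	netdev=5,0x300,0,0,eth0
 *
 * is split by get_options() into irq=5 and base_addr=0x300 for "eth0";
 * netdev_boot_setup_check() applies those settings during probing.
 */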
/*******************************************************************************

			Device Interface Subroutines

*******************************************************************************/
/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);
/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);
/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
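
/*
 * Illustrative sketch (not part of the original file): the reference
 * taken by dev_get_by_name() must be dropped with dev_put(). The
 * function name and interface name are hypothetical.
 */
#if 0
static void example_lookup(struct net *net)
{
	struct net_device *dev = dev_get_by_name(net, "eth0");

	if (dev) {
		/* ... use dev ... */
		dev_put(dev);	/* release the reference when done */
	}
}
#endif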
/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);
/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);
/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);
/**
 *	dev_getbyhwaddr - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device. The caller must hold the
 *	rtnl semaphore. The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking.
 *
 *	BUGS:
 *	If the API was consistent this would be __dev_get_by_hwaddr
 */

struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
{
	struct net_device *dev;

	for_each_netdev(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr);
struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);
/**
 *	dev_get_by_flags_rcu - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rcu_read_lock(), and result refcount is unchanged.
 */

struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
					unsigned short mask)
{
	struct net_device *dev, *ret;

	ret = NULL;
	for_each_netdev_rcu(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(dev_get_by_flags_rcu);
/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
int dev_valid_name(const char *name)
{
	if (*name == '\0')
		return 0;
	if (strlen(name) >= IFNAMSIZ)
		return 0;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return 0;

	while (*name) {
		if (*name == '/' || isspace(*name))
			return 0;
		name++;
	}
	return 1;
}
EXPORT_SYMBOL(dev_valid_name);
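
/*
 * Illustrative examples: "eth0" and "bond0" pass the checks above;
 * "", ".", "..", "a/b", names containing whitespace, and names of
 * IFNAMSIZ or more characters are all rejected.
 */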
/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}
/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
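
/*
 * Illustrative sketch (not part of the original file): asking for the
 * first free "eth%d" slot. On success dev->name holds e.g. "eth0" and
 * the unit number is returned; the error label is hypothetical.
 */
#if 0
	err = dev_alloc_name(dev, "eth%d");
	if (err < 0)
		goto failed;	/* hypothetical error path */
#endif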
static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
{
	struct net *net;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (fmt && strchr(name, '%'))
		return dev_alloc_name(dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device; a format string such as "eth%d" can be
 *	passed for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);

	if (dev->flags & IFF_UP)
		return -EBUSY;

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
		return 0;

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(dev, newname, 1);
	if (err < 0)
		return err;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		return ret;
	}

	write_lock_bh(&dev_base_lock);
	hlist_del(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			printk(KERN_ERR
			       "%s: name change rollback failed: %d.\n",
			       dev->name, ret);
		}
	}

	return err;
}
/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		if (dev->ifalias) {
			kfree(dev->ifalias);
			dev->ifalias = NULL;
		}
		return 0;
	}

	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!dev->ifalias)
		return -ENOMEM;

	strlcpy(dev->ifalias, alias, len+1);
	return 0;
}
/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifiers(NETDEV_CHANGE, dev);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
	}
}
EXPORT_SYMBOL(netdev_state_change);

int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	return call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);
/**
 *	dev_load	- load a network module
 *	@net: the applicable net namespace
 *	@name: name of interface
 *
 *	If a network interface is not present and the process has suitable
 *	privileges this function loads the module. If module loading is not
 *	available in this kernel then it becomes a nop.
 */

void dev_load(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	rcu_read_unlock();

	if (!dev && capable(CAP_NET_ADMIN))
		request_module("%s", name);
}
EXPORT_SYMBOL(dev_load);
static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	/*
	 *	Is it even present?
	 */
	if (!netif_device_present(dev))
		return -ENODEV;

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	/*
	 *	Call device private open method
	 */
	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	/*
	 *	If it went open OK then:
	 */
	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		dev->flags |= IFF_UP;

		net_dmaengine_get();

		/*
		 *	Initialize multicasting status
		 */
		dev_set_rx_mode(dev);

		/*
		 *	Wakeup transmit queue engine
		 */
		dev_activate(dev);
	}

	return ret;
}
/**
 *	dev_open	- prepare an interface for use.
 *	@dev:	device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	/*
	 *	Is it already up?
	 */
	if (dev->flags & IFF_UP)
		return 0;

	/*
	 *	Open device
	 */
	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	/*
	 *	... and announce new interface.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);
static int __dev_close(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	/*
	 *	Tell people we are going down, so that they can
	 *	prepare for death while the device is still operating.
	 */
	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

	clear_bit(__LINK_STATE_START, &dev->state);

	/* Synchronize to scheduled poll. We cannot touch poll list,
	 * it can be even on different cpu. So just clear netif_running().
	 *
	 * dev->stop() will invoke napi_disable() on all of its
	 * napi_struct instances on this device.
	 */
	smp_mb__after_clear_bit(); /* Commit netif_running(). */

	dev_deactivate(dev);

	/*
	 *	Call the device specific close. This cannot fail.
	 *	Only if device is UP
	 *
	 *	We allow it to be called even after a DETACH hot-plug
	 *	event.
	 */
	if (ops->ndo_stop)
		ops->ndo_stop(dev);

	/*
	 *	Device is now down.
	 */
	dev->flags &= ~IFF_UP;

	/*
	 *	Shutdown NET_DMA
	 */
	net_dmaengine_put();

	return 0;
}
/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	if (!(dev->flags & IFF_UP))
		return 0;

	__dev_close(dev);

	/*
	 *	Tell people we are down
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_DOWN, dev);

	return 0;
}
EXPORT_SYMBOL(dev_close);
/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
	    dev->ethtool_ops->set_flags) {
		u32 flags = dev->ethtool_ops->get_flags(dev);
		if (flags & ETH_FLAG_LRO) {
			flags &= ~ETH_FLAG_LRO;
			dev->ethtool_ops->set_flags(dev, flags);
		}
	}
	WARN_ON(dev->features & NETIF_F_LRO);
}
EXPORT_SYMBOL(dev_disable_lro);
static int dev_boot_phase = 1;

/*
 *	Device change register/unregister. These are not inline or static
 *	as we export them to the world.
 */
/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 *	When registered, all registration and up events are replayed
 *	to the new notifier to allow it to have a race-free
 *	view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;
			if (!(dev->flags & IFF_UP))
				continue;
			nb->notifier_call(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				break;
			if (dev->flags & IFF_UP) {
				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
				nb->notifier_call(nb, NETDEV_DOWN, dev);
			}
			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
		}
	}

	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
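
/*
 * Illustrative sketch (not part of the original file): a notifier that
 * logs NETDEV_UP events. Names are hypothetical; in this kernel the
 * notifier's void pointer is the struct net_device itself.
 */
#if 0
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_UP)
		printk(KERN_INFO "%s is up\n", dev->name);
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier = {
	.notifier_call = example_netdev_event,
};

/* register_netdevice_notifier(&example_netdev_notifier); */
#endif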
/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);
/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	return raw_notifier_call_chain(&netdev_chain, val, dev);
}
/* When > 0 there are consumers of rx skb time stamps */
static atomic_t netstamp_needed = ATOMIC_INIT(0);

void net_enable_timestamp(void)
{
	atomic_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
	atomic_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);

static inline void net_timestamp_set(struct sk_buff *skb)
{
	if (atomic_read(&netstamp_needed))
		__net_timestamp(skb);
	else
		skb->tstamp.tv64 = 0;
}

static inline void net_timestamp_check(struct sk_buff *skb)
{
	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
		__net_timestamp(skb);
}
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	skb_orphan(skb);

	if (!(dev->flags & IFF_UP) ||
	    (skb->len > (dev->mtu + dev->hard_header_len))) {
		kfree_skb(skb);
		return NET_RX_DROP;
	}
	skb_set_dev(skb, dev);
	skb->tstamp.tv64 = 0;
	skb->pkt_type = PACKET_HOST;
	skb->protocol = eth_type_trans(skb, dev);
	return netif_rx(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
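
/*
 * Illustrative sketch (not part of the original file): a virtual
 * driver's start_xmit handing the frame to its peer device, as
 * veth-style drivers do. example_get_peer() is hypothetical.
 */
#if 0
static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct net_device *peer = example_get_peer(dev); /* hypothetical */

	dev_forward_skb(peer, skb);	/* consumes skb either way */
	return NETDEV_TX_OK;
}
#endif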
/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;

#ifdef CONFIG_NET_CLS_ACT
	if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
		net_timestamp_set(skb);
#else
	net_timestamp_set(skb);
#endif

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if ((ptype->dev == dev || !ptype->dev) &&
		    (ptype->af_packet_priv == NULL ||
		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
			if (!skb2)
				break;

			/* skb->nh should be correctly
			 * set by sender, so that the second statement is
			 * just protection against buggy protocols.
			 */
			skb_reset_mac_header(skb2);

			if (skb_network_header(skb2) < skb2->data ||
			    skb2->network_header > skb2->tail) {
				if (net_ratelimit())
					printk(KERN_CRIT "protocol %04x is "
					       "buggy, dev %s\n",
					       skb2->protocol, dev->name);
				skb_reset_network_header(skb2);
			}

			skb2->transport_header = skb2->network_header;
			skb2->pkt_type = PACKET_OUTGOING;
			ptype->func(skb2, skb->dev, ptype, skb->dev);
		}
	}
	rcu_read_unlock();
}
static inline void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = &__get_cpu_var(softnet_data);
	q->next_sched = NULL;
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}

void __netif_schedule(struct Qdisc *q)
{
	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
		__netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);
void dev_kfree_skb_irq(struct sk_buff *skb)
{
	if (!skb->destructor)
		dev_kfree_skb(skb);
	else if (atomic_dec_and_test(&skb->users)) {
		struct softnet_data *sd;
		unsigned long flags;

		local_irq_save(flags);
		sd = &__get_cpu_var(softnet_data);
		skb->next = sd->completion_queue;
		sd->completion_queue = skb;
		raise_softirq_irqoff(NET_TX_SOFTIRQ);
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL(dev_kfree_skb_irq);

void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled())
		dev_kfree_skb_irq(skb);
	else
		dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
/**
 * netif_device_detach - mark device as removed
 * @dev: network device
 *
 * Mark device as removed from system and therefore no longer available.
 */
void netif_device_detach(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_stop_all_queues(dev);
	}
}
EXPORT_SYMBOL(netif_device_detach);

/**
 * netif_device_attach - mark device as attached
 * @dev: network device
 *
 * Mark device as attached to the system and restart it if needed.
 */
void netif_device_attach(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_wake_all_queues(dev);
		__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_device_attach);
static bool can_checksum_protocol(unsigned long features, __be16 protocol)
{
	return ((features & NETIF_F_GEN_CSUM) ||
		((features & NETIF_F_IP_CSUM) &&
		 protocol == htons(ETH_P_IP)) ||
		((features & NETIF_F_IPV6_CSUM) &&
		 protocol == htons(ETH_P_IPV6)) ||
		((features & NETIF_F_FCOE_CRC) &&
		 protocol == htons(ETH_P_FCOE)));
}

static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
{
	if (can_checksum_protocol(dev->features, skb->protocol))
		return true;

	if (skb->protocol == htons(ETH_P_8021Q)) {
		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
		if (can_checksum_protocol(dev->features & dev->vlan_features,
					  veh->h_vlan_encapsulated_proto))
			return true;
	}

	return false;
}
/**
 * skb_dev_set -- assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * If an skb is owned by a device already, we have to reset
 * all data private to the namespace a device belongs to
 * before assigning it a new device.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	skb_dst_drop(skb);
	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}
	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		/* Let GSO fix up the checksum. */
		goto out_set_summed;
	}

	offset = skb->csum_start - skb_headroom(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
/**
 *	skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	int err;

	skb_reset_mac_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	__skb_pull(skb, skb->mac_len);

	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
		struct net_device *dev = skb->dev;
		struct ethtool_drvinfo info = {};

		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
			dev->ethtool_ops->get_drvinfo(dev, &info);

		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
			"ip_summed=%d\n",
		     info.driver, dev ? dev->features : 0L,
		     skb->sk ? skb->sk->sk_route_caps : 0L,
		     skb->len, skb->data_len, skb->ip_summed);

		if (skb_header_cloned(skb) &&
		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			return ERR_PTR(err);
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				err = ptype->gso_send_check(skb);
				segs = ERR_PTR(err);
				if (err || skb_gso_ok(skb, features))
					break;
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_gso_segment);
/* Take action when hardware reception checksum errors are detected. */
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (net_ratelimit()) {
		printk(KERN_ERR "%s: hw csum failure.\n",
			dev ? dev->name : "<unknown>");
		dump_stack();
	}
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
/* Actually, we should eliminate this check as soon as we know that:
 * 1. The IOMMU is present and allows us to map all the memory.
 * 2. No high memory really exists on this machine.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
				return 1;
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *pdev = dev->dev.parent;

		if (!pdev)
			return 0;
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)

static void dev_gso_skb_destructor(struct sk_buff *skb)
{
	struct dev_gso_cb *cb;

	do {
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;
		kfree_skb(nskb);
	} while (skb->next);

	cb = DEV_GSO_CB(skb);
	if (cb->destructor)
		cb->destructor(skb);
}
/**
 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
 *	@skb: buffer to segment
 *
 *	This function segments the given skb and stores the list of segments
 *	in skb->next.
 */
static int dev_gso_segment(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct sk_buff *segs;
	int features = dev->features & ~(illegal_highdma(dev, skb) ?
					 NETIF_F_SG : 0);

	segs = skb_gso_segment(skb, features);

	/* Verifying header integrity only. */
	if (!segs)
		return 0;

	if (IS_ERR(segs))
		return PTR_ERR(segs);

	skb->next = segs;
	DEV_GSO_CB(skb)->destructor = skb->destructor;
	skb->destructor = dev_gso_skb_destructor;

	return 0;
}
/*
 * Try to orphan skb early, right before transmission by the device.
 * We cannot orphan skb if tx timestamp is requested, since
 * drivers need to call skb_tstamp_tx() to send the timestamp.
 */
static inline void skb_orphan_try(struct sk_buff *skb)
{
	if (!skb_tx(skb)->flags)
		skb_orphan(skb);
}
/*
 * Returns true if either:
 *	1. skb has frag_list and the device doesn't support FRAGLIST, or
 *	2. skb is fragmented and the device does not support SG, or if
 *	   at least one of fragments is in highmem and device does not
 *	   support DMA from it.
 */
static inline int skb_needs_linearize(struct sk_buff *skb,
				      struct net_device *dev)
{
	return skb_is_nonlinear(skb) &&
	       ((skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
		(skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
					       illegal_highdma(dev, skb))));
}
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int rc = NETDEV_TX_OK;

	if (likely(!skb->next)) {
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		/*
		 * If the device doesn't need skb->dst, release it right now
		 * while it's hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(skb);

		skb_orphan_try(skb);

		if (netif_needs_gso(dev, skb)) {
			if (unlikely(dev_gso_segment(skb)))
				goto out_kfree_skb;
			if (skb->next)
				goto gso;
		} else {
			if (skb_needs_linearize(skb, dev) &&
			    __skb_linearize(skb))
				goto out_kfree_skb;

			/* If packet is not checksummed and device does not
			 * support checksumming for this protocol, complete
			 * checksumming here.
			 */
			if (skb->ip_summed == CHECKSUM_PARTIAL) {
				skb_set_transport_header(skb, skb->csum_start -
					      skb_headroom(skb));
				if (!dev_can_checksum(dev, skb) &&
				     skb_checksum_help(skb))
					goto out_kfree_skb;
			}
		}

		rc = ops->ndo_start_xmit(skb, dev);
		if (rc == NETDEV_TX_OK)
			txq_trans_update(txq);
		return rc;
	}

gso:
	do {
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;

		/*
		 * If the device doesn't need nskb->dst, release it right now
		 * while it's hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(nskb);

		rc = ops->ndo_start_xmit(nskb, dev);
		if (unlikely(rc != NETDEV_TX_OK)) {
			if (rc & ~NETDEV_TX_MASK)
				goto out_kfree_gso_skb;
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		txq_trans_update(txq);
		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

out_kfree_gso_skb:
	if (likely(skb->next == NULL))
		skb->destructor = DEV_GSO_CB(skb)->destructor;
out_kfree_skb:
	kfree_skb(skb);
	return rc;
}
static u32 hashrnd __read_mostly;

u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
{
	u32 hash;

	if (skb_rx_queue_recorded(skb)) {
		hash = skb_get_rx_queue(skb);
		while (unlikely(hash >= dev->real_num_tx_queues))
			hash -= dev->real_num_tx_queues;
		return hash;
	}

	if (skb->sk && skb->sk->sk_hash)
		hash = skb->sk->sk_hash;
	else
		hash = (__force u16) skb->protocol;

	hash = jhash_1word(hash, hashrnd);

	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
}
EXPORT_SYMBOL(skb_tx_hash);
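
/*
 * Illustrative arithmetic: with real_num_tx_queues == 4, a 32-bit hash
 * of 0x80000000 maps to ((u64)0x80000000 * 4) >> 32 == 2, i.e. queue 2.
 * The multiply-shift spreads hash values evenly over [0, n) without a
 * modulo operation.
 */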
static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
{
	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
		if (net_ratelimit()) {
			pr_warning("%s selects TX queue %d, but "
				   "real number of TX queues is %d\n",
				   dev->name, queue_index,
				   dev->real_num_tx_queues);
		}
		return 0;
	}
	return queue_index;
}
static struct netdev_queue *dev_pick_tx(struct net_device *dev,
					struct sk_buff *skb)
{
	u16 queue_index;
	struct sock *sk = skb->sk;

	if (sk_tx_queue_recorded(sk)) {
		queue_index = sk_tx_queue_get(sk);
	} else {
		const struct net_device_ops *ops = dev->netdev_ops;

		if (ops->ndo_select_queue) {
			queue_index = ops->ndo_select_queue(dev, skb);
			queue_index = dev_cap_txqueue(dev, queue_index);
		} else {
			queue_index = 0;
			if (dev->real_num_tx_queues > 1)
				queue_index = skb_tx_hash(dev, skb);

			if (sk) {
				struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);

				if (dst && skb_dst(skb) == dst)
					sk_tx_queue_set(sk, queue_index);
			}
		}
	}

	skb_set_queue_mapping(skb, queue_index);
	return netdev_get_tx_queue(dev, queue_index);
}
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	bool contended = qdisc_is_running(q);
	int rc;

	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
	 * and dequeue packets faster.
	 */
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		kfree_skb(skb);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */
		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
			skb_dst_force(skb);
		__qdisc_update_bstats(q, skb->len);
		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		skb_dst_force(skb);
		rc = qdisc_enqueue_root(skb, q);
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	txq = dev_pick_tx(dev, skb);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and take the lock. It is not prone to deadlocks.
	   Or take the noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		if (txq->xmit_lock_owner != cpu) {

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_tx_queue_stopped(txq)) {
				rc = dev_hard_start_xmit(skb, dev, txq);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			if (net_ratelimit())
				printk(KERN_CRIT "Virtual device %s asks to "
				       "queue packet!\n", dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
			if (net_ratelimit())
				printk(KERN_CRIT "Dead loop on virtual device "
				       "%s, fix it urgently!\n", dev->name);
		}
	}

	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit);
/*=======================================================================
			Receiver routines
  =======================================================================*/

int netdev_max_backlog __read_mostly = 1000;
int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;	/* old backlog weight */
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
2242 /* One global table that all flow-based protocols share. */
2243 struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
2244 EXPORT_SYMBOL(rps_sock_flow_table);
2247 * get_rps_cpu is called from netif_receive_skb and returns the target
2248 * CPU from the RPS map of the receiving queue for a given skb.
2249 * rcu_read_lock must be held on entry.
2251 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2252 struct rps_dev_flow **rflowp)
2254 struct ipv6hdr *ip6;
2256 struct netdev_rx_queue *rxqueue;
2257 struct rps_map *map;
2258 struct rps_dev_flow_table *flow_table;
2259 struct rps_sock_flow_table *sock_flow_table;
2263 u32 addr1, addr2, ihl;
2269 if (skb_rx_queue_recorded(skb)) {
2270 u16 index = skb_get_rx_queue(skb);
2271 if (unlikely(index >= dev->num_rx_queues)) {
2272 WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
2273 "on queue %u, but number of RX queues is %u\n",
2274 dev->name, index, dev->num_rx_queues);
2277 rxqueue = dev->_rx + index;
2281 if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
2285 goto got_hash; /* Skip hash computation on packet header */
2287 switch (skb->protocol) {
2288 case __constant_htons(ETH_P_IP):
2289 if (!pskb_may_pull(skb, sizeof(*ip)))
2292 ip = (struct iphdr *) skb->data;
2293 ip_proto = ip->protocol;
2294 addr1 = (__force u32) ip->saddr;
2295 addr2 = (__force u32) ip->daddr;
2298 case __constant_htons(ETH_P_IPV6):
2299 if (!pskb_may_pull(skb, sizeof(*ip6)))
2302 ip6 = (struct ipv6hdr *) skb->data;
2303 ip_proto = ip6->nexthdr;
2304 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2305 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2318 case IPPROTO_UDPLITE:
2319 if (pskb_may_pull(skb, (ihl * 4) + 4)) {
2320 ports.v32 = * (__force u32 *) (skb->data + (ihl * 4));
2321 if (ports.v16[1] < ports.v16[0])
2322 swap(ports.v16[0], ports.v16[1]);
2330 /* get a consistent hash (same value on both flow directions) */
2333 skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2338 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2339 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2340 if (flow_table && sock_flow_table) {
2342 struct rps_dev_flow *rflow;
2344 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2347 next_cpu = sock_flow_table->ents[skb->rxhash &
2348 sock_flow_table->mask];
2351 * If the desired CPU (where last recvmsg was done) is
2352 * different from current CPU (one in the rx-queue flow
2353 * table entry), switch if one of the following holds:
2354 * - Current CPU is unset (equal to RPS_NO_CPU).
2355 * - Current CPU is offline.
2356 * - The current CPU's queue tail has advanced beyond the
2357 * last packet that was enqueued using this table entry.
2358 * This guarantees that all previous packets for the flow
2359 * have been dequeued, thus preserving in-order delivery.
2361 if (unlikely(tcpu != next_cpu) &&
2362 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2363 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2364 rflow->last_qtail)) >= 0)) {
2365 tcpu = rflow->cpu = next_cpu;
2366 if (tcpu != RPS_NO_CPU)
2367 rflow->last_qtail = per_cpu(softnet_data,
2368 tcpu).input_queue_head;
2370 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2377 map = rcu_dereference(rxqueue->rps_map);
2379 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2381 if (cpu_online(tcpu)) {
2391 /* Called from hardirq (IPI) context */
2392 static void rps_trigger_softirq(void *data)
2394 struct softnet_data *sd = data;
2396 ____napi_schedule(sd, &sd->backlog);
2400 #endif /* CONFIG_RPS */
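/*
 * Example (illustrative sketch; "demo_flow_hash" is a hypothetical helper,
 * not kernel API): the rxhash computed in get_rps_cpu() is made
 * direction-independent by ordering the two ports before hashing, so both
 * directions of a flow feed jhash_3words() the same words.  Ordering the
 * addresses the same way (assumed here for symmetry) completes the idea:
 *
 *	static u32 demo_flow_hash(u32 addr1, u32 addr2, u16 p1, u16 p2, u32 rnd)
 *	{
 *		union { u32 v32; u16 v16[2]; } ports;
 *
 *		ports.v16[0] = p1;
 *		ports.v16[1] = p2;
 *		if (ports.v16[1] < ports.v16[0])	// sort the port pair
 *			swap(ports.v16[0], ports.v16[1]);
 *		if (addr2 < addr1)			// sort the address pair
 *			swap(addr1, addr2);
 *		return jhash_3words(addr1, addr2, ports.v32, rnd);
 *	}
 */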
2403 * Check if this softnet_data structure belongs to another CPU.
2404 * If so, queue it on our IPI list and return 1; otherwise return 0.
2407 static int rps_ipi_queued(struct softnet_data *sd)
2410 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2413 sd->rps_ipi_next = mysd->rps_ipi_list;
2414 mysd->rps_ipi_list = sd;
2416 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2419 #endif /* CONFIG_RPS */
2424 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2425 * queue (may be a remote CPU queue).
2427 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2428 unsigned int *qtail)
2430 struct softnet_data *sd;
2431 unsigned long flags;
2433 sd = &per_cpu(softnet_data, cpu);
2435 local_irq_save(flags);
2438 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2439 if (skb_queue_len(&sd->input_pkt_queue)) {
2441 __skb_queue_tail(&sd->input_pkt_queue, skb);
2442 input_queue_tail_incr_save(sd, qtail);
2444 local_irq_restore(flags);
2445 return NET_RX_SUCCESS;
2448 /* Schedule NAPI for the backlog device.
2449 * We can use a non-atomic operation since we own the queue lock.
2451 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2452 if (!rps_ipi_queued(sd))
2453 ____napi_schedule(sd, &sd->backlog);
2461 local_irq_restore(flags);
2468 * netif_rx - post buffer to the network code
2469 * @skb: buffer to post
2471 * This function receives a packet from a device driver and queues it for
2472 * the upper (protocol) levels to process. It always succeeds. The buffer
2473 * may be dropped during processing for congestion control or by the
2474 * protocol layers.
2476 * return values:
2477 * NET_RX_SUCCESS (no congestion)
2478 * NET_RX_DROP (packet was dropped)
2482 int netif_rx(struct sk_buff *skb)
2486 /* if netpoll wants it, pretend we never saw it */
2487 if (netpoll_rx(skb))
2490 if (netdev_tstamp_prequeue)
2491 net_timestamp_check(skb);
2495 struct rps_dev_flow voidflow, *rflow = &voidflow;
2500 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2502 cpu = smp_processor_id();
2504 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2511 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2517 EXPORT_SYMBOL(netif_rx);
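/*
 * Example (illustrative sketch; "my_isr_rx" and "my_priv" are hypothetical):
 * a classic non-NAPI driver calls netif_rx() from its receive interrupt
 * after filling in the protocol field.
 *
 *	static void my_isr_rx(struct my_priv *priv, struct sk_buff *skb)
 *	{
 *		skb->protocol = eth_type_trans(skb, priv->dev);
 *		netif_rx(skb);			// queue to the per-CPU backlog
 *	}
 *
 * From process context (no IRQ/BH protection), netif_rx_ni() below should
 * be used instead so that pending softirqs get a chance to run.
 */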
2519 int netif_rx_ni(struct sk_buff *skb)
2524 err = netif_rx(skb);
2525 if (local_softirq_pending())
2526 do_softirq();
2531 EXPORT_SYMBOL(netif_rx_ni);
2533 static void net_tx_action(struct softirq_action *h)
2535 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2537 if (sd->completion_queue) {
2538 struct sk_buff *clist;
2540 local_irq_disable();
2541 clist = sd->completion_queue;
2542 sd->completion_queue = NULL;
2546 struct sk_buff *skb = clist;
2547 clist = clist->next;
2549 WARN_ON(atomic_read(&skb->users));
2554 if (sd->output_queue) {
2557 local_irq_disable();
2558 head = sd->output_queue;
2559 sd->output_queue = NULL;
2560 sd->output_queue_tailp = &sd->output_queue;
2564 struct Qdisc *q = head;
2565 spinlock_t *root_lock;
2567 head = head->next_sched;
2569 root_lock = qdisc_lock(q);
2570 if (spin_trylock(root_lock)) {
2571 smp_mb__before_clear_bit();
2572 clear_bit(__QDISC_STATE_SCHED,
2575 spin_unlock(root_lock);
2577 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2579 __netif_reschedule(q);
2581 smp_mb__before_clear_bit();
2582 clear_bit(__QDISC_STATE_SCHED,
2590 static inline int deliver_skb(struct sk_buff *skb,
2591 struct packet_type *pt_prev,
2592 struct net_device *orig_dev)
2594 atomic_inc(&skb->users);
2595 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2598 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2599 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2600 /* This hook is defined here for ATM LANE */
2601 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2602 unsigned char *addr) __read_mostly;
2603 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2606 #ifdef CONFIG_NET_CLS_ACT
2607 /* TODO: Maybe we should just force sch_ingress to be compiled in
2608 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for a useless compare
2609 * and two extra stores right now if we don't have it compiled in
2610 * but do have CONFIG_NET_CLS_ACT.
2611 * NOTE: This doesn't stop any functionality; if you don't have
2612 * the ingress scheduler, you just can't add policies on ingress.
2615 static int ing_filter(struct sk_buff *skb)
2617 struct net_device *dev = skb->dev;
2618 u32 ttl = G_TC_RTTL(skb->tc_verd);
2619 struct netdev_queue *rxq;
2620 int result = TC_ACT_OK;
2623 if (MAX_RED_LOOP < ttl++) {
2625 "Redir loop detected, dropping packet (%d->%d)\n",
2626 skb->skb_iif, dev->ifindex);
2630 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2631 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2633 rxq = &dev->rx_queue;
2636 if (q != &noop_qdisc) {
2637 spin_lock(qdisc_lock(q));
2638 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2639 result = qdisc_enqueue_root(skb, q);
2640 spin_unlock(qdisc_lock(q));
2646 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2647 struct packet_type **pt_prev,
2648 int *ret, struct net_device *orig_dev)
2650 if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2654 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2658 switch (ing_filter(skb)) {
2672 * netif_nit_deliver - deliver received packets to network taps
2675 * This function is used to deliver incoming packets to network
2676 * taps. It should be used when the normal netif_receive_skb path
2677 * is bypassed, for example because of VLAN acceleration.
2679 void netif_nit_deliver(struct sk_buff *skb)
2681 struct packet_type *ptype;
2683 if (list_empty(&ptype_all))
2686 skb_reset_network_header(skb);
2687 skb_reset_transport_header(skb);
2688 skb->mac_len = skb->network_header - skb->mac_header;
2691 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2692 if (!ptype->dev || ptype->dev == skb->dev)
2693 deliver_skb(skb, ptype, skb->dev);
2699 * netdev_rx_handler_register - register receive handler
2700 * @dev: device to register a handler for
2701 * @rx_handler: receive handler to register
2702 * @rx_handler_data: data pointer that is used by rx handler
2704 * Register a receive handler for a device. This handler will then be
2705 * called from __netif_receive_skb. A negative errno code is returned
2706 * on a failure.
2708 * The caller must hold the rtnl_mutex.
2710 int netdev_rx_handler_register(struct net_device *dev,
2711 rx_handler_func_t *rx_handler,
2712 void *rx_handler_data)
2716 if (dev->rx_handler)
2719 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
2720 rcu_assign_pointer(dev->rx_handler, rx_handler);
2724 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
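/*
 * Example (illustrative sketch; "my_port_attach" and "my_handle_frame" are
 * hypothetical): a bridge- or macvlan-style layer claims a port device by
 * registering an rx_handler under RTNL.
 *
 *	static struct sk_buff *my_handle_frame(struct sk_buff *skb)
 *	{
 *		// consume or redirect the skb here; return NULL if consumed,
 *		// or return the skb to let normal delivery continue
 *		return skb;
 *	}
 *
 *	static int my_port_attach(struct net_device *port, void *priv)
 *	{
 *		int err;
 *
 *		rtnl_lock();
 *		err = netdev_rx_handler_register(port, my_handle_frame, priv);
 *		rtnl_unlock();
 *		return err;
 *	}
 */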
2727 * netdev_rx_handler_unregister - unregister receive handler
2728 * @dev: device to unregister a handler from
2730 * Unregister a receive handler from a device.
2732 * The caller must hold the rtnl_mutex.
2734 void netdev_rx_handler_unregister(struct net_device *dev)
2738 rcu_assign_pointer(dev->rx_handler, NULL);
2739 rcu_assign_pointer(dev->rx_handler_data, NULL);
2741 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2743 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2744 struct net_device *master)
2746 if (skb->pkt_type == PACKET_HOST) {
2747 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2749 memcpy(dest, master->dev_addr, ETH_ALEN);
2753 /* On bonding slaves other than the currently active slave, suppress
2754 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2755 * ARP on active-backup slaves with arp_validate enabled.
2757 int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2759 struct net_device *dev = skb->dev;
2761 if (master->priv_flags & IFF_MASTER_ARPMON)
2762 dev->last_rx = jiffies;
2764 if ((master->priv_flags & IFF_MASTER_ALB) &&
2765 (master->priv_flags & IFF_BRIDGE_PORT)) {
2766 /* Do address unmangle. The local destination address
2767 * will be always the one master has. Provides the right
2768 * functionality in a bridge.
2770 skb_bond_set_mac_by_master(skb, master);
2773 if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2774 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2775 skb->protocol == __cpu_to_be16(ETH_P_ARP))
2778 if (master->priv_flags & IFF_MASTER_ALB) {
2779 if (skb->pkt_type != PACKET_BROADCAST &&
2780 skb->pkt_type != PACKET_MULTICAST)
2783 if (master->priv_flags & IFF_MASTER_8023AD &&
2784 skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2791 EXPORT_SYMBOL(__skb_bond_should_drop);
2793 static int __netif_receive_skb(struct sk_buff *skb)
2795 struct packet_type *ptype, *pt_prev;
2796 rx_handler_func_t *rx_handler;
2797 struct net_device *orig_dev;
2798 struct net_device *master;
2799 struct net_device *null_or_orig;
2800 struct net_device *orig_or_bond;
2801 int ret = NET_RX_DROP;
2804 if (!netdev_tstamp_prequeue)
2805 net_timestamp_check(skb);
2807 if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
2808 return NET_RX_SUCCESS;
2810 /* if we've gotten here through NAPI, check netpoll */
2811 if (netpoll_receive_skb(skb))
2815 skb->skb_iif = skb->dev->ifindex;
2818 * bonding note: skbs received on inactive slaves should only
2819 * be delivered to pkt handlers that are exact matches. Also
2820 * the deliver_no_wcard flag will be set. If packet handlers
2821 * are sensitive to duplicate packets these skbs will need to
2822 * be dropped at the handler. The vlan accel path may have
2823 * already set the deliver_no_wcard flag.
2825 null_or_orig = NULL;
2826 orig_dev = skb->dev;
2827 master = ACCESS_ONCE(orig_dev->master);
2828 if (skb->deliver_no_wcard)
2829 null_or_orig = orig_dev;
2831 if (skb_bond_should_drop(skb, master)) {
2832 skb->deliver_no_wcard = 1;
2833 null_or_orig = orig_dev; /* deliver only exact match */
2838 __this_cpu_inc(softnet_data.processed);
2839 skb_reset_network_header(skb);
2840 skb_reset_transport_header(skb);
2841 skb->mac_len = skb->network_header - skb->mac_header;
2847 #ifdef CONFIG_NET_CLS_ACT
2848 if (skb->tc_verd & TC_NCLS) {
2849 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2854 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2855 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2856 ptype->dev == orig_dev) {
2858 ret = deliver_skb(skb, pt_prev, orig_dev);
2863 #ifdef CONFIG_NET_CLS_ACT
2864 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2870 /* Handle special case of bridge or macvlan */
2871 rx_handler = rcu_dereference(skb->dev->rx_handler);
2874 ret = deliver_skb(skb, pt_prev, orig_dev);
2877 skb = rx_handler(skb);
2883 * Make sure frames received on VLAN interfaces stacked on
2884 * bonding interfaces still make their way to any base bonding
2885 * device that may have registered for a specific ptype. The
2886 * handler may have to adjust skb->dev and orig_dev.
2888 orig_or_bond = orig_dev;
2889 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
2890 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
2891 orig_or_bond = vlan_dev_real_dev(skb->dev);
2894 type = skb->protocol;
2895 list_for_each_entry_rcu(ptype,
2896 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2897 if (ptype->type == type && (ptype->dev == null_or_orig ||
2898 ptype->dev == skb->dev || ptype->dev == orig_dev ||
2899 ptype->dev == orig_or_bond)) {
2901 ret = deliver_skb(skb, pt_prev, orig_dev);
2907 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2910 /* Jamal, now you will not be able to escape explaining
2911 * to me how you were going to use this. :-)
2922 * netif_receive_skb - process receive buffer from network
2923 * @skb: buffer to process
2925 * netif_receive_skb() is the main receive data processing function.
2926 * It always succeeds. The buffer may be dropped during processing
2927 * for congestion control or by the protocol layers.
2929 * This function may only be called from softirq context and interrupts
2930 * should be enabled.
2932 * Return values (usually ignored):
2933 * NET_RX_SUCCESS: no congestion
2934 * NET_RX_DROP: packet was dropped
2936 int netif_receive_skb(struct sk_buff *skb)
2938 if (netdev_tstamp_prequeue)
2939 net_timestamp_check(skb);
2943 struct rps_dev_flow voidflow, *rflow = &voidflow;
2948 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2951 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2955 ret = __netif_receive_skb(skb);
2961 return __netif_receive_skb(skb);
2964 EXPORT_SYMBOL(netif_receive_skb);
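/*
 * Example (illustrative sketch; "my_poll" and "my_next_rx_frame" are
 * hypothetical stand-ins for a driver's ring-buffer logic): a NAPI driver
 * feeds frames to netif_receive_skb() from its ->poll() callback rather
 * than calling netif_rx() from the interrupt handler.
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct sk_buff *skb;
 *		int done = 0;
 *
 *		while (done < budget && (skb = my_next_rx_frame(napi))) {
 *			skb->protocol = eth_type_trans(skb, napi->dev);
 *			netif_receive_skb(skb);
 *			done++;
 *		}
 *		if (done < budget)
 *			napi_complete(napi);	// then re-enable device IRQs
 *		return done;
 *	}
 */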
2966 /* Network device is going away, flush any packets still pending
2967 * Called with irqs disabled.
2969 static void flush_backlog(void *arg)
2971 struct net_device *dev = arg;
2972 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2973 struct sk_buff *skb, *tmp;
2976 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
2977 if (skb->dev == dev) {
2978 __skb_unlink(skb, &sd->input_pkt_queue);
2980 input_queue_head_incr(sd);
2985 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
2986 if (skb->dev == dev) {
2987 __skb_unlink(skb, &sd->process_queue);
2989 input_queue_head_incr(sd);
2994 static int napi_gro_complete(struct sk_buff *skb)
2996 struct packet_type *ptype;
2997 __be16 type = skb->protocol;
2998 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3001 if (NAPI_GRO_CB(skb)->count == 1) {
3002 skb_shinfo(skb)->gso_size = 0;
3007 list_for_each_entry_rcu(ptype, head, list) {
3008 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3011 err = ptype->gro_complete(skb);
3017 WARN_ON(&ptype->list == head);
3019 return NET_RX_SUCCESS;
3023 return netif_receive_skb(skb);
3026 static void napi_gro_flush(struct napi_struct *napi)
3028 struct sk_buff *skb, *next;
3030 for (skb = napi->gro_list; skb; skb = next) {
3033 napi_gro_complete(skb);
3036 napi->gro_count = 0;
3037 napi->gro_list = NULL;
3040 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3042 struct sk_buff **pp = NULL;
3043 struct packet_type *ptype;
3044 __be16 type = skb->protocol;
3045 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3048 enum gro_result ret;
3050 if (!(skb->dev->features & NETIF_F_GRO))
3053 if (skb_is_gso(skb) || skb_has_frags(skb))
3057 list_for_each_entry_rcu(ptype, head, list) {
3058 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3061 skb_set_network_header(skb, skb_gro_offset(skb));
3062 mac_len = skb->network_header - skb->mac_header;
3063 skb->mac_len = mac_len;
3064 NAPI_GRO_CB(skb)->same_flow = 0;
3065 NAPI_GRO_CB(skb)->flush = 0;
3066 NAPI_GRO_CB(skb)->free = 0;
3068 pp = ptype->gro_receive(&napi->gro_list, skb);
3073 if (&ptype->list == head)
3076 same_flow = NAPI_GRO_CB(skb)->same_flow;
3077 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3080 struct sk_buff *nskb = *pp;
3084 napi_gro_complete(nskb);
3091 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3095 NAPI_GRO_CB(skb)->count = 1;
3096 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3097 skb->next = napi->gro_list;
3098 napi->gro_list = skb;
3102 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3103 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3105 BUG_ON(skb->end - skb->tail < grow);
3107 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3110 skb->data_len -= grow;
3112 skb_shinfo(skb)->frags[0].page_offset += grow;
3113 skb_shinfo(skb)->frags[0].size -= grow;
3115 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3116 put_page(skb_shinfo(skb)->frags[0].page);
3117 memmove(skb_shinfo(skb)->frags,
3118 skb_shinfo(skb)->frags + 1,
3119 --skb_shinfo(skb)->nr_frags);
3130 EXPORT_SYMBOL(dev_gro_receive);
3133 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3137 if (netpoll_rx_on(skb))
3140 for (p = napi->gro_list; p; p = p->next) {
3141 NAPI_GRO_CB(p)->same_flow =
3142 (p->dev == skb->dev) &&
3143 !compare_ether_header(skb_mac_header(p),
3144 skb_gro_mac_header(skb));
3145 NAPI_GRO_CB(p)->flush = 0;
3148 return dev_gro_receive(napi, skb);
3151 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3155 if (netif_receive_skb(skb))
3160 case GRO_MERGED_FREE:
3171 EXPORT_SYMBOL(napi_skb_finish);
3173 void skb_gro_reset_offset(struct sk_buff *skb)
3175 NAPI_GRO_CB(skb)->data_offset = 0;
3176 NAPI_GRO_CB(skb)->frag0 = NULL;
3177 NAPI_GRO_CB(skb)->frag0_len = 0;
3179 if (skb->mac_header == skb->tail &&
3180 !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3181 NAPI_GRO_CB(skb)->frag0 =
3182 page_address(skb_shinfo(skb)->frags[0].page) +
3183 skb_shinfo(skb)->frags[0].page_offset;
3184 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3187 EXPORT_SYMBOL(skb_gro_reset_offset);
3189 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3191 skb_gro_reset_offset(skb);
3193 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3195 EXPORT_SYMBOL(napi_gro_receive);
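/*
 * Example (illustrative sketch): a GRO-aware driver simply substitutes
 * napi_gro_receive() for netif_receive_skb() in its ->poll() loop; merging
 * and eventual delivery are handled here in the core.
 *
 *	skb->protocol = eth_type_trans(skb, napi->dev);
 *	napi_gro_receive(napi, skb);	// may merge, hold, or deliver the skb
 *
 * As with netif_rx(), the skb is consumed in all cases and must not be
 * touched by the driver afterwards.
 */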
3197 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3199 __skb_pull(skb, skb_headlen(skb));
3200 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3204 EXPORT_SYMBOL(napi_reuse_skb);
3206 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3208 struct sk_buff *skb = napi->skb;
3211 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3217 EXPORT_SYMBOL(napi_get_frags);
3219 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3225 skb->protocol = eth_type_trans(skb, skb->dev);
3227 if (ret == GRO_HELD)
3228 skb_gro_pull(skb, -ETH_HLEN);
3229 else if (netif_receive_skb(skb))
3234 case GRO_MERGED_FREE:
3235 napi_reuse_skb(napi, skb);
3244 EXPORT_SYMBOL(napi_frags_finish);
3246 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3248 struct sk_buff *skb = napi->skb;
3255 skb_reset_mac_header(skb);
3256 skb_gro_reset_offset(skb);
3258 off = skb_gro_offset(skb);
3259 hlen = off + sizeof(*eth);
3260 eth = skb_gro_header_fast(skb, off);
3261 if (skb_gro_header_hard(skb, hlen)) {
3262 eth = skb_gro_header_slow(skb, hlen, off);
3263 if (unlikely(!eth)) {
3264 napi_reuse_skb(napi, skb);
3270 skb_gro_pull(skb, sizeof(*eth));
3273 * This works because the only protocols we care about don't require
3274 * special handling. We'll fix it up properly at the end.
3276 skb->protocol = eth->h_proto;
3281 EXPORT_SYMBOL(napi_frags_skb);
3283 gro_result_t napi_gro_frags(struct napi_struct *napi)
3285 struct sk_buff *skb = napi_frags_skb(napi);
3290 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3292 EXPORT_SYMBOL(napi_gro_frags);
3295 * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
3296 * Note: called with local irq disabled, but exits with local irq enabled.
3298 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3301 struct softnet_data *remsd = sd->rps_ipi_list;
3304 sd->rps_ipi_list = NULL;
3308 /* Send pending IPIs to kick RPS processing on remote CPUs. */
3310 struct softnet_data *next = remsd->rps_ipi_next;
3312 if (cpu_online(remsd->cpu))
3313 __smp_call_function_single(remsd->cpu,
3322 static int process_backlog(struct napi_struct *napi, int quota)
3325 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3328 /* Check if we have pending IPIs; it's better to send them now
3329 * than to wait for net_rx_action() to end.
3331 if (sd->rps_ipi_list) {
3332 local_irq_disable();
3333 net_rps_action_and_irq_enable(sd);
3336 napi->weight = weight_p;
3337 local_irq_disable();
3338 while (work < quota) {
3339 struct sk_buff *skb;
3342 while ((skb = __skb_dequeue(&sd->process_queue))) {
3344 __netif_receive_skb(skb);
3345 local_irq_disable();
3346 input_queue_head_incr(sd);
3347 if (++work >= quota) {
3354 qlen = skb_queue_len(&sd->input_pkt_queue);
3356 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3357 &sd->process_queue);
3359 if (qlen < quota - work) {
3361 * Inline a custom version of __napi_complete().
3362 * Only the current cpu owns and manipulates this napi,
3363 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3364 * so we can use a plain write instead of clear_bit()
3365 * and we don't need an smp_mb() memory barrier.
3367 list_del(&napi->poll_list);
3370 quota = work + qlen;
3380 * __napi_schedule - schedule for receive
3381 * @n: entry to schedule
3383 * The entry's receive function will be scheduled to run
3385 void __napi_schedule(struct napi_struct *n)
3387 unsigned long flags;
3389 local_irq_save(flags);
3390 ____napi_schedule(&__get_cpu_var(softnet_data), n);
3391 local_irq_restore(flags);
3393 EXPORT_SYMBOL(__napi_schedule);
3395 void __napi_complete(struct napi_struct *n)
3397 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3398 BUG_ON(n->gro_list);
3400 list_del(&n->poll_list);
3401 smp_mb__before_clear_bit();
3402 clear_bit(NAPI_STATE_SCHED, &n->state);
3404 EXPORT_SYMBOL(__napi_complete);
3406 void napi_complete(struct napi_struct *n)
3408 unsigned long flags;
3411 * don't let napi dequeue from the cpu poll list
3412 * just in case it's running on a different cpu
3414 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3418 local_irq_save(flags);
3420 local_irq_restore(flags);
3422 EXPORT_SYMBOL(napi_complete);
3424 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3425 int (*poll)(struct napi_struct *, int), int weight)
3427 INIT_LIST_HEAD(&napi->poll_list);
3428 napi->gro_count = 0;
3429 napi->gro_list = NULL;
3432 napi->weight = weight;
3433 list_add(&napi->dev_list, &dev->napi_list);
3435 #ifdef CONFIG_NETPOLL
3436 spin_lock_init(&napi->poll_lock);
3437 napi->poll_owner = -1;
3439 set_bit(NAPI_STATE_SCHED, &napi->state);
3441 EXPORT_SYMBOL(netif_napi_add);
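/*
 * Example (illustrative sketch; "my_probe", "my_priv" and
 * "my_disable_rx_irq" are hypothetical): a driver registers its poll
 * routine once at probe time, then schedules it from the interrupt handler.
 *
 *	static int my_probe(struct my_priv *priv)
 *	{
 *		netif_napi_add(priv->dev, &priv->napi, my_poll, 64);
 *		return 0;
 *	}
 *
 *	static irqreturn_t my_interrupt(int irq, void *data)
 *	{
 *		struct my_priv *priv = data;
 *
 *		my_disable_rx_irq(priv);	// mask device RX interrupts
 *		napi_schedule(&priv->napi);	// ->poll() runs in NET_RX softirq
 *		return IRQ_HANDLED;
 *	}
 *
 * The weight (64 here, matching weight_p) bounds how many packets a single
 * ->poll() invocation may consume before yielding.
 */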
3443 void netif_napi_del(struct napi_struct *napi)
3445 struct sk_buff *skb, *next;
3447 list_del_init(&napi->dev_list);
3448 napi_free_frags(napi);
3450 for (skb = napi->gro_list; skb; skb = next) {
3456 napi->gro_list = NULL;
3457 napi->gro_count = 0;
3459 EXPORT_SYMBOL(netif_napi_del);
3461 static void net_rx_action(struct softirq_action *h)
3463 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3464 unsigned long time_limit = jiffies + 2;
3465 int budget = netdev_budget;
3468 local_irq_disable();
3470 while (!list_empty(&sd->poll_list)) {
3471 struct napi_struct *n;
3474 /* If the softirq window is exhausted then punt.
3475 * Allow this to run for 2 jiffies, which will allow
3476 * an average latency of 1.5/HZ.
3478 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3483 /* Even though interrupts have been re-enabled, this
3484 * access is safe because interrupts can only add new
3485 * entries to the tail of this list, and only ->poll()
3486 * calls can remove this head entry from the list.
3488 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3490 have = netpoll_poll_lock(n);
3494 /* This NAPI_STATE_SCHED test is for avoiding a race
3495 * with netpoll's poll_napi(). Only the entity which
3496 * obtains the lock and sees NAPI_STATE_SCHED set will
3497 * actually make the ->poll() call. Therefore we avoid
3498 * accidentally calling ->poll() when NAPI is not scheduled.
3501 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3502 work = n->poll(n, weight);
3506 WARN_ON_ONCE(work > weight);
3510 local_irq_disable();
3512 /* Drivers must not modify the NAPI state if they
3513 * consume the entire weight. In such cases this code
3514 * still "owns" the NAPI instance and therefore can
3515 * move the instance around on the list at will.
3517 if (unlikely(work == weight)) {
3518 if (unlikely(napi_disable_pending(n))) {
3521 local_irq_disable();
3523 list_move_tail(&n->poll_list, &sd->poll_list);
3526 netpoll_poll_unlock(have);
3529 net_rps_action_and_irq_enable(sd);
3531 #ifdef CONFIG_NET_DMA
3533 * There may not be any more sk_buffs coming right now, so push
3534 * any pending DMA copies to hardware
3536 dma_issue_pending_all();
3543 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3547 static gifconf_func_t *gifconf_list[NPROTO];
3550 * register_gifconf - register a SIOCGIF handler
3551 * @family: Address family
3552 * @gifconf: Function handler
3554 * Register protocol dependent address dumping routines. The handler
3555 * that is passed must not be freed or reused until it has been replaced
3556 * by another handler.
3558 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3560 if (family >= NPROTO)
3562 gifconf_list[family] = gifconf;
3565 EXPORT_SYMBOL(register_gifconf);
3569 * Map an interface index to its name (SIOCGIFNAME)
3573 * We need this ioctl for efficient implementation of the
3574 * if_indextoname() function required by the IPv6 API. Without
3575 * it, we would have to search all the interfaces to find a
3576 * match.
3579 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3581 struct net_device *dev;
3585 * Fetch the caller's info block.
3588 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3592 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3598 strcpy(ifr.ifr_name, dev->name);
3601 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3607 * Perform a SIOCGIFCONF call. This structure will change
3608 * size eventually, and there is nothing I can do about it.
3609 * Thus we will need a 'compatibility mode'.
3612 static int dev_ifconf(struct net *net, char __user *arg)
3615 struct net_device *dev;
3622 * Fetch the caller's info block.
3625 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3632 * Loop over the interfaces, and write an info block for each.
3636 for_each_netdev(net, dev) {
3637 for (i = 0; i < NPROTO; i++) {
3638 if (gifconf_list[i]) {
3641 done = gifconf_list[i](dev, NULL, 0);
3643 done = gifconf_list[i](dev, pos + total,
3653 * All done. Write the updated control block back to the caller.
3655 ifc.ifc_len = total;
3658 * Both BSD and Solaris return 0 here, so we do too.
3660 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3663 #ifdef CONFIG_PROC_FS
3665 * This is invoked by the /proc filesystem handler to display a device
3666 * in detail.
3668 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3671 struct net *net = seq_file_net(seq);
3673 struct net_device *dev;
3677 return SEQ_START_TOKEN;
3680 for_each_netdev_rcu(net, dev)
3687 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3689 struct net_device *dev = (v == SEQ_START_TOKEN) ?
3690 first_net_device(seq_file_net(seq)) :
3691 next_net_device((struct net_device *)v);
3694 return rcu_dereference(dev);
3697 void dev_seq_stop(struct seq_file *seq, void *v)
3703 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3705 const struct rtnl_link_stats64 *stats = dev_get_stats(dev);
3707 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3708 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
3709 dev->name, stats->rx_bytes, stats->rx_packets,
3711 stats->rx_dropped + stats->rx_missed_errors,
3712 stats->rx_fifo_errors,
3713 stats->rx_length_errors + stats->rx_over_errors +
3714 stats->rx_crc_errors + stats->rx_frame_errors,
3715 stats->rx_compressed, stats->multicast,
3716 stats->tx_bytes, stats->tx_packets,
3717 stats->tx_errors, stats->tx_dropped,
3718 stats->tx_fifo_errors, stats->collisions,
3719 stats->tx_carrier_errors +
3720 stats->tx_aborted_errors +
3721 stats->tx_window_errors +
3722 stats->tx_heartbeat_errors,
3723 stats->tx_compressed);
3727 * Called from the PROCfs module. This now uses the new arbitrary sized
3728 * /proc/net interface to create /proc/net/dev
3730 static int dev_seq_show(struct seq_file *seq, void *v)
3732 if (v == SEQ_START_TOKEN)
3733 seq_puts(seq, "Inter-| Receive "
3735 " face |bytes packets errs drop fifo frame "
3736 "compressed multicast|bytes packets errs "
3737 "drop fifo colls carrier compressed\n");
3739 dev_seq_printf_stats(seq, v);
3743 static struct softnet_data *softnet_get_online(loff_t *pos)
3745 struct softnet_data *sd = NULL;
3747 while (*pos < nr_cpu_ids)
3748 if (cpu_online(*pos)) {
3749 sd = &per_cpu(softnet_data, *pos);
3756 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3758 return softnet_get_online(pos);
3761 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3764 return softnet_get_online(pos);
3767 static void softnet_seq_stop(struct seq_file *seq, void *v)
3771 static int softnet_seq_show(struct seq_file *seq, void *v)
3773 struct softnet_data *sd = v;
3775 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3776 sd->processed, sd->dropped, sd->time_squeeze, 0,
3777 0, 0, 0, 0, /* was fastroute */
3778 sd->cpu_collision, sd->received_rps);
3782 static const struct seq_operations dev_seq_ops = {
3783 .start = dev_seq_start,
3784 .next = dev_seq_next,
3785 .stop = dev_seq_stop,
3786 .show = dev_seq_show,
3789 static int dev_seq_open(struct inode *inode, struct file *file)
3791 return seq_open_net(inode, file, &dev_seq_ops,
3792 sizeof(struct seq_net_private));
3795 static const struct file_operations dev_seq_fops = {
3796 .owner = THIS_MODULE,
3797 .open = dev_seq_open,
3799 .llseek = seq_lseek,
3800 .release = seq_release_net,
3803 static const struct seq_operations softnet_seq_ops = {
3804 .start = softnet_seq_start,
3805 .next = softnet_seq_next,
3806 .stop = softnet_seq_stop,
3807 .show = softnet_seq_show,
3810 static int softnet_seq_open(struct inode *inode, struct file *file)
3812 return seq_open(file, &softnet_seq_ops);
3815 static const struct file_operations softnet_seq_fops = {
3816 .owner = THIS_MODULE,
3817 .open = softnet_seq_open,
3819 .llseek = seq_lseek,
3820 .release = seq_release,
3823 static void *ptype_get_idx(loff_t pos)
3825 struct packet_type *pt = NULL;
3829 list_for_each_entry_rcu(pt, &ptype_all, list) {
3835 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3836 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3845 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3849 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3852 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3854 struct packet_type *pt;
3855 struct list_head *nxt;
3859 if (v == SEQ_START_TOKEN)
3860 return ptype_get_idx(0);
3863 nxt = pt->list.next;
3864 if (pt->type == htons(ETH_P_ALL)) {
3865 if (nxt != &ptype_all)
3868 nxt = ptype_base[0].next;
3870 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3872 while (nxt == &ptype_base[hash]) {
3873 if (++hash >= PTYPE_HASH_SIZE)
3875 nxt = ptype_base[hash].next;
3878 return list_entry(nxt, struct packet_type, list);
3881 static void ptype_seq_stop(struct seq_file *seq, void *v)
3887 static int ptype_seq_show(struct seq_file *seq, void *v)
3889 struct packet_type *pt = v;
3891 if (v == SEQ_START_TOKEN)
3892 seq_puts(seq, "Type Device Function\n");
3893 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3894 if (pt->type == htons(ETH_P_ALL))
3895 seq_puts(seq, "ALL ");
3897 seq_printf(seq, "%04x", ntohs(pt->type));
3899 seq_printf(seq, " %-8s %pF\n",
3900 pt->dev ? pt->dev->name : "", pt->func);
3906 static const struct seq_operations ptype_seq_ops = {
3907 .start = ptype_seq_start,
3908 .next = ptype_seq_next,
3909 .stop = ptype_seq_stop,
3910 .show = ptype_seq_show,
3913 static int ptype_seq_open(struct inode *inode, struct file *file)
3915 return seq_open_net(inode, file, &ptype_seq_ops,
3916 sizeof(struct seq_net_private));
3919 static const struct file_operations ptype_seq_fops = {
3920 .owner = THIS_MODULE,
3921 .open = ptype_seq_open,
3923 .llseek = seq_lseek,
3924 .release = seq_release_net,
3928 static int __net_init dev_proc_net_init(struct net *net)
3932 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3934 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3936 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3939 if (wext_proc_init(net))
3945 proc_net_remove(net, "ptype");
3947 proc_net_remove(net, "softnet_stat");
3949 proc_net_remove(net, "dev");
3953 static void __net_exit dev_proc_net_exit(struct net *net)
3955 wext_proc_exit(net);
3957 proc_net_remove(net, "ptype");
3958 proc_net_remove(net, "softnet_stat");
3959 proc_net_remove(net, "dev");
3962 static struct pernet_operations __net_initdata dev_proc_ops = {
3963 .init = dev_proc_net_init,
3964 .exit = dev_proc_net_exit,
3967 static int __init dev_proc_init(void)
3969 return register_pernet_subsys(&dev_proc_ops);
3972 #define dev_proc_init() 0
3973 #endif /* CONFIG_PROC_FS */
3977 * netdev_set_master - set up master/slave pair
3978 * @slave: slave device
3979 * @master: new master device
3981 * Changes the master device of the slave. Pass %NULL to break the
3982 * bonding. The caller must hold the RTNL semaphore. On a failure
3983 * a negative errno code is returned. On success the reference counts
3984 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3985 * function returns zero.
3987 int netdev_set_master(struct net_device *slave, struct net_device *master)
3989 struct net_device *old = slave->master;
3999 slave->master = master;
4006 slave->flags |= IFF_SLAVE;
4008 slave->flags &= ~IFF_SLAVE;
4010 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4013 EXPORT_SYMBOL(netdev_set_master);
4015 static void dev_change_rx_flags(struct net_device *dev, int flags)
4017 const struct net_device_ops *ops = dev->netdev_ops;
4019 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4020 ops->ndo_change_rx_flags(dev, flags);
4023 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4025 unsigned short old_flags = dev->flags;
4031 dev->flags |= IFF_PROMISC;
4032 dev->promiscuity += inc;
4033 if (dev->promiscuity == 0) {
4036 * If inc causes overflow, leave promiscuity untouched and return an error.
4039 dev->flags &= ~IFF_PROMISC;
4041 dev->promiscuity -= inc;
4042 printk(KERN_WARNING "%s: promiscuity counter overflowed; "
4043 "setting promiscuity failed, the promiscuity feature "
4044 "of the device might be broken.\n", dev->name);
4048 if (dev->flags != old_flags) {
4049 printk(KERN_INFO "device %s %s promiscuous mode\n",
4050 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4052 if (audit_enabled) {
4053 current_uid_gid(&uid, &gid);
4054 audit_log(current->audit_context, GFP_ATOMIC,
4055 AUDIT_ANOM_PROMISCUOUS,
4056 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4057 dev->name, (dev->flags & IFF_PROMISC),
4058 (old_flags & IFF_PROMISC),
4059 audit_get_loginuid(current),
4061 audit_get_sessionid(current));
4064 dev_change_rx_flags(dev, IFF_PROMISC);
4070 * dev_set_promiscuity - update promiscuity count on a device
4074 * Add or remove promiscuity from a device. While the count in the device
4075 * remains above zero the interface remains promiscuous. Once it hits zero
4076 * the device reverts back to normal filtering operation. A negative inc
4077 * value is used to drop promiscuity on the device.
4078 * Return 0 if successful or a negative errno code on error.
4080 int dev_set_promiscuity(struct net_device *dev, int inc)
4082 unsigned short old_flags = dev->flags;
4085 err = __dev_set_promiscuity(dev, inc);
4088 if (dev->flags != old_flags)
4089 dev_set_rx_mode(dev);
4092 EXPORT_SYMBOL(dev_set_promiscuity);
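/*
 * Example (illustrative sketch): because promiscuity is a counter rather
 * than a flag, users must pair each increment with a matching decrement,
 * e.g. around the lifetime of a packet-capture socket:
 *
 *	dev_set_promiscuity(dev, 1);	// capture starts
 *	...
 *	dev_set_promiscuity(dev, -1);	// capture ends
 *
 * The device stays promiscuous as long as any user still holds a count.
 */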
4095 * dev_set_allmulti - update allmulti count on a device
4099 * Add or remove reception of all multicast frames to a device. While the
4100 * count in the device remains above zero the interface remains listening
4101 * to all multicast frames. Once it hits zero the device reverts back to normal
4102 * filtering operation. A negative @inc value is used to drop the counter
4103 * when releasing a resource needing all multicasts.
4104 * Return 0 if successful or a negative errno code on error.
4107 int dev_set_allmulti(struct net_device *dev, int inc)
4109 unsigned short old_flags = dev->flags;
4113 dev->flags |= IFF_ALLMULTI;
4114 dev->allmulti += inc;
4115 if (dev->allmulti == 0) {
4118 * If inc causes overflow, leave allmulti untouched and return an error.
4121 dev->flags &= ~IFF_ALLMULTI;
4123 dev->allmulti -= inc;
4124 printk(KERN_WARNING "%s: allmulti counter overflowed; "
4125 "setting allmulti failed, the allmulti feature of the "
4126 "device might be broken.\n", dev->name);
4130 if (dev->flags ^ old_flags) {
4131 dev_change_rx_flags(dev, IFF_ALLMULTI);
4132 dev_set_rx_mode(dev);
4136 EXPORT_SYMBOL(dev_set_allmulti);
4139 * Upload unicast and multicast address lists to device and
4140 * configure RX filtering. When the device doesn't support unicast
4141 * filtering it is put in promiscuous mode while unicast addresses
4142 * are present.
4144 void __dev_set_rx_mode(struct net_device *dev)
4146 const struct net_device_ops *ops = dev->netdev_ops;
4148 /* dev_open will call this function so the list will stay sane. */
4149 if (!(dev->flags&IFF_UP))
4152 if (!netif_device_present(dev))
4155 if (ops->ndo_set_rx_mode)
4156 ops->ndo_set_rx_mode(dev);
4158 /* Unicast address changes may only happen under the rtnl,
4159 * therefore calling __dev_set_promiscuity here is safe.
4161 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4162 __dev_set_promiscuity(dev, 1);
4163 dev->uc_promisc = 1;
4164 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4165 __dev_set_promiscuity(dev, -1);
4166 dev->uc_promisc = 0;
4169 if (ops->ndo_set_multicast_list)
4170 ops->ndo_set_multicast_list(dev);
4174 void dev_set_rx_mode(struct net_device *dev)
4176 netif_addr_lock_bh(dev);
4177 __dev_set_rx_mode(dev);
4178 netif_addr_unlock_bh(dev);
4182 * dev_get_flags - get flags reported to userspace
4185 * Get the combination of flag bits exported through APIs to userspace.
4187 unsigned dev_get_flags(const struct net_device *dev)
4191 flags = (dev->flags & ~(IFF_PROMISC |
4196 (dev->gflags & (IFF_PROMISC |
4199 if (netif_running(dev)) {
4200 if (netif_oper_up(dev))
4201 flags |= IFF_RUNNING;
4202 if (netif_carrier_ok(dev))
4203 flags |= IFF_LOWER_UP;
4204 if (netif_dormant(dev))
4205 flags |= IFF_DORMANT;
4210 EXPORT_SYMBOL(dev_get_flags);
4212 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4214 int old_flags = dev->flags;
4220 * Set the flags on our device.
4223 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4224 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4226 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4230 * Load in the correct multicast list now that the flags have changed.
4233 if ((old_flags ^ flags) & IFF_MULTICAST)
4234 dev_change_rx_flags(dev, IFF_MULTICAST);
4236 dev_set_rx_mode(dev);
4239 * Have we downed the interface? We handle IFF_UP ourselves
4240 * according to user attempts to set it, rather than blindly
4241 * setting it.
4245 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
4246 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4249 dev_set_rx_mode(dev);
4252 if ((flags ^ dev->gflags) & IFF_PROMISC) {
4253 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4255 dev->gflags ^= IFF_PROMISC;
4256 dev_set_promiscuity(dev, inc);
4259 /* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4260 is important. Some (broken) drivers set IFF_PROMISC when
4261 IFF_ALLMULTI is requested, without asking us and without reporting it.
4263 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4264 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4266 dev->gflags ^= IFF_ALLMULTI;
4267 dev_set_allmulti(dev, inc);
4273 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4275 unsigned int changes = dev->flags ^ old_flags;
4277 if (changes & IFF_UP) {
4278 if (dev->flags & IFF_UP)
4279 call_netdevice_notifiers(NETDEV_UP, dev);
4281 call_netdevice_notifiers(NETDEV_DOWN, dev);
4284 if (dev->flags & IFF_UP &&
4285 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4286 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4290 * dev_change_flags - change device settings
4292 * @flags: device state flags
4294 * Change settings on device based state flags. The flags are
4295 * in the userspace exported format.
4297 int dev_change_flags(struct net_device *dev, unsigned flags)
4300 int old_flags = dev->flags;
4302 ret = __dev_change_flags(dev, flags);
4306 changes = old_flags ^ dev->flags;
4308 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4310 __dev_notify_flags(dev, old_flags);
4313 EXPORT_SYMBOL(dev_change_flags);
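/*
 * Example (illustrative sketch): bringing an interface administratively up
 * from kernel code while preserving its other flags.  Must run under RTNL.
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);
 *	rtnl_unlock();
 *
 * This follows the same path as "ip link set dev ... up": __dev_open() is
 * invoked via __dev_change_flags() and NETDEV_UP is sent to the notifiers.
 */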
4316 * dev_set_mtu - Change maximum transfer unit
4318 * @new_mtu: new transfer unit
4320 * Change the maximum transfer size of the network device.
4322 int dev_set_mtu(struct net_device *dev, int new_mtu)
4324 const struct net_device_ops *ops = dev->netdev_ops;
4327 if (new_mtu == dev->mtu)
4330 /* MTU must be positive. */
4334 if (!netif_device_present(dev))
4338 if (ops->ndo_change_mtu)
4339 err = ops->ndo_change_mtu(dev, new_mtu);
4343 if (!err && dev->flags & IFF_UP)
4344 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4347 EXPORT_SYMBOL(dev_set_mtu);
4350 * dev_set_mac_address - Change Media Access Control Address
4354 * Change the hardware (MAC) address of the device
4356 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4358 const struct net_device_ops *ops = dev->netdev_ops;
4361 if (!ops->ndo_set_mac_address)
4363 if (sa->sa_family != dev->type)
4365 if (!netif_device_present(dev))
4367 err = ops->ndo_set_mac_address(dev, sa);
4369 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4372 EXPORT_SYMBOL(dev_set_mac_address);
4375 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4377 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4380 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4386 case SIOCGIFFLAGS: /* Get interface flags */
4387 ifr->ifr_flags = (short) dev_get_flags(dev);
4390 case SIOCGIFMETRIC: /* Get the metric on the interface
4391 (currently unused) */
4392 ifr->ifr_metric = 0;
4395 case SIOCGIFMTU: /* Get the MTU of a device */
4396 ifr->ifr_mtu = dev->mtu;
4401 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4403 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4404 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4405 ifr->ifr_hwaddr.sa_family = dev->type;
4413 ifr->ifr_map.mem_start = dev->mem_start;
4414 ifr->ifr_map.mem_end = dev->mem_end;
4415 ifr->ifr_map.base_addr = dev->base_addr;
4416 ifr->ifr_map.irq = dev->irq;
4417 ifr->ifr_map.dma = dev->dma;
4418 ifr->ifr_map.port = dev->if_port;
4422 ifr->ifr_ifindex = dev->ifindex;
4426 ifr->ifr_qlen = dev->tx_queue_len;
4430 /* dev_ioctl() should ensure this case
4431 * is never reached
4442 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4444 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4447 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4448 const struct net_device_ops *ops;
4453 ops = dev->netdev_ops;
4456 case SIOCSIFFLAGS: /* Set interface flags */
4457 return dev_change_flags(dev, ifr->ifr_flags);
4459 case SIOCSIFMETRIC: /* Set the metric on the interface
4460 (currently unused) */
4463 case SIOCSIFMTU: /* Set the MTU of a device */
4464 return dev_set_mtu(dev, ifr->ifr_mtu);
4467 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4469 case SIOCSIFHWBROADCAST:
4470 if (ifr->ifr_hwaddr.sa_family != dev->type)
4472 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4473 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4474 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4478 if (ops->ndo_set_config) {
4479 if (!netif_device_present(dev))
4481 return ops->ndo_set_config(dev, &ifr->ifr_map);
4486 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4487 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4489 if (!netif_device_present(dev))
4491 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4494 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4495 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4497 if (!netif_device_present(dev))
4499 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4502 if (ifr->ifr_qlen < 0)
4504 dev->tx_queue_len = ifr->ifr_qlen;
4508 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4509 return dev_change_name(dev, ifr->ifr_newname);
4512 * Unknown or private ioctl
4515 if ((cmd >= SIOCDEVPRIVATE &&
4516 cmd <= SIOCDEVPRIVATE + 15) ||
4517 cmd == SIOCBONDENSLAVE ||
4518 cmd == SIOCBONDRELEASE ||
4519 cmd == SIOCBONDSETHWADDR ||
4520 cmd == SIOCBONDSLAVEINFOQUERY ||
4521 cmd == SIOCBONDINFOQUERY ||
4522 cmd == SIOCBONDCHANGEACTIVE ||
4523 cmd == SIOCGMIIPHY ||
4524 cmd == SIOCGMIIREG ||
4525 cmd == SIOCSMIIREG ||
4526 cmd == SIOCBRADDIF ||
4527 cmd == SIOCBRDELIF ||
4528 cmd == SIOCSHWTSTAMP ||
4529 cmd == SIOCWANDEV) {
4531 if (ops->ndo_do_ioctl) {
4532 if (netif_device_present(dev))
4533 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4545 * This function handles all "interface"-type I/O control requests. The actual
4546 * 'doing' part of this is dev_ifsioc above.
4550 * dev_ioctl - network device ioctl
4551 * @net: the applicable net namespace
4552 * @cmd: command to issue
4553 * @arg: pointer to a struct ifreq in user space
4555 * Issue ioctl functions to devices. This is normally called by the
4556 * user space syscall interfaces but can sometimes be useful for
4557 * other purposes. The return value is the return from the syscall if
4558 * positive or a negative errno code on error.
4561 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4567 /* One special case: SIOCGIFCONF takes ifconf argument
4568 and requires shared lock, because it sleeps writing
4569 the answer.
4572 if (cmd == SIOCGIFCONF) {
4574 ret = dev_ifconf(net, (char __user *) arg);
4578 if (cmd == SIOCGIFNAME)
4579 return dev_ifname(net, (struct ifreq __user *)arg);
4581 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4584 ifr.ifr_name[IFNAMSIZ-1] = 0;
4586 colon = strchr(ifr.ifr_name, ':');
4591 * See which interface the caller is talking about.
4596 * These ioctl calls:
4597 * - can be done by all.
4598 * - are atomic and do not require locking.
4609 dev_load(net, ifr.ifr_name);
4611 ret = dev_ifsioc_locked(net, &ifr, cmd);
4616 if (copy_to_user(arg, &ifr,
4617 sizeof(struct ifreq)))
4623 dev_load(net, ifr.ifr_name);
4625 ret = dev_ethtool(net, &ifr);
4630 if (copy_to_user(arg, &ifr,
4631 sizeof(struct ifreq)))
4637 * These ioctl calls:
4638 * - require superuser power.
4639 * - require strict serialization.
4645 if (!capable(CAP_NET_ADMIN))
4647 dev_load(net, ifr.ifr_name);
4649 ret = dev_ifsioc(net, &ifr, cmd);
4654 if (copy_to_user(arg, &ifr,
4655 sizeof(struct ifreq)))
4661 * These ioctl calls:
4662 * - require superuser power.
4663 * - require strict serialization.
4664 * - do not return a value
4674 case SIOCSIFHWBROADCAST:
4677 case SIOCBONDENSLAVE:
4678 case SIOCBONDRELEASE:
4679 case SIOCBONDSETHWADDR:
4680 case SIOCBONDCHANGEACTIVE:
4684 if (!capable(CAP_NET_ADMIN))
4687 case SIOCBONDSLAVEINFOQUERY:
4688 case SIOCBONDINFOQUERY:
4689 dev_load(net, ifr.ifr_name);
4691 ret = dev_ifsioc(net, &ifr, cmd);
4696 /* Get the per device memory space. We can add this but
4697 * currently do not support it */
4699 /* Set the per device memory buffer space.
4700 * Not applicable in our case */
4705 * Unknown or private ioctl.
4708 if (cmd == SIOCWANDEV ||
4709 (cmd >= SIOCDEVPRIVATE &&
4710 cmd <= SIOCDEVPRIVATE + 15)) {
4711 dev_load(net, ifr.ifr_name);
4713 ret = dev_ifsioc(net, &ifr, cmd);
4715 if (!ret && copy_to_user(arg, &ifr,
4716 sizeof(struct ifreq)))
4720 /* Take care of Wireless Extensions */
4721 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4722 return wext_handle_ioctl(net, &ifr, cmd, arg);
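/*
 * Example (illustrative sketch, userspace side): the requests handled above
 * arrive via ioctl() on any socket, e.g. reading an interface's MTU:
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ);
 *	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *		printf("mtu %d\n", ifr.ifr_mtu);
 *
 * SIOCGIFMTU is one of the "can be done by all" calls dispatched to
 * dev_ifsioc_locked() above; SIOCSIFMTU would additionally require
 * CAP_NET_ADMIN.
 */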
4729 * dev_new_index - allocate an ifindex
4730 * @net: the applicable net namespace
4732 * Returns a suitable unique value for a new device interface
4733 * number. The caller must hold the rtnl semaphore or the
4734 * dev_base_lock to be sure it remains unique.
4736 static int dev_new_index(struct net *net)
4742 if (!__dev_get_by_index(net, ifindex))
4747 /* Delayed registration/unregistration */
4748 static LIST_HEAD(net_todo_list);
4750 static void net_set_todo(struct net_device *dev)
4752 list_add_tail(&dev->todo_list, &net_todo_list);
4755 static void rollback_registered_many(struct list_head *head)
4757 struct net_device *dev, *tmp;
4759 BUG_ON(dev_boot_phase);
4762 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4763 /* Some devices call this without ever having registered,
4764 * to unwind a failed initialization. Remove those
4765 * devices and proceed with the remaining.
4767 if (dev->reg_state == NETREG_UNINITIALIZED) {
4768 pr_debug("unregister_netdevice: device %s/%p never "
4769 "was registered\n", dev->name, dev);
4772 list_del(&dev->unreg_list);
4776 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4778 /* If device is running, close it first. */
4781 /* And unlink it from device chain. */
4782 unlist_netdevice(dev);
4784 dev->reg_state = NETREG_UNREGISTERING;
4789 list_for_each_entry(dev, head, unreg_list) {
4790 /* Shutdown queueing discipline. */
4794 /* Notify protocols that we are about to destroy
4795 this device. They should clean up all of their state.
4797 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4799 if (!dev->rtnl_link_ops ||
4800 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4801 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4804 * Flush the unicast and multicast chains
4809 if (dev->netdev_ops->ndo_uninit)
4810 dev->netdev_ops->ndo_uninit(dev);
4812 /* Notifier chain MUST detach us from master device. */
4813 WARN_ON(dev->master);
4815 /* Remove entries from kobject tree */
4816 netdev_unregister_kobject(dev);
4819 /* Process any work delayed until the end of the batch */
4820 dev = list_first_entry(head, struct net_device, unreg_list);
4821 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
4825 list_for_each_entry(dev, head, unreg_list)
4829 static void rollback_registered(struct net_device *dev)
4833 list_add(&dev->unreg_list, &single);
4834 rollback_registered_many(&single);
4837 static void __netdev_init_queue_locks_one(struct net_device *dev,
4838 struct netdev_queue *dev_queue,
4841 spin_lock_init(&dev_queue->_xmit_lock);
4842 netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4843 dev_queue->xmit_lock_owner = -1;
4846 static void netdev_init_queue_locks(struct net_device *dev)
4848 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4849 __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4852 unsigned long netdev_fix_features(unsigned long features, const char *name)
4854 /* Fix illegal SG+CSUM combinations. */
4855 if ((features & NETIF_F_SG) &&
4856 !(features & NETIF_F_ALL_CSUM)) {
4858 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4859 "checksum feature.\n", name);
4860 features &= ~NETIF_F_SG;
4863 /* TSO requires that SG is present as well. */
4864 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4866 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4867 "SG feature.\n", name);
4868 features &= ~NETIF_F_TSO;
4871 if (features & NETIF_F_UFO) {
4872 if (!(features & NETIF_F_GEN_CSUM)) {
4874 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4875 "since no NETIF_F_HW_CSUM feature.\n",
4877 features &= ~NETIF_F_UFO;
4880 if (!(features & NETIF_F_SG)) {
4882 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4883 "since no NETIF_F_SG feature.\n", name);
4884 features &= ~NETIF_F_UFO;
4890 EXPORT_SYMBOL(netdev_fix_features);
4893 * netif_stacked_transfer_operstate - transfer operstate
4894 * @rootdev: the root or lower level device to transfer state from
4895 * @dev: the device to transfer operstate to
4897 * Transfer operational state from root to device. This is normally
4898 * called when a stacking relationship exists between the root
4899 * device and the device (a leaf device).
4901 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
4902 struct net_device *dev)
4904 if (rootdev->operstate == IF_OPER_DORMANT)
4905 netif_dormant_on(dev);
4907 netif_dormant_off(dev);
4909 if (netif_carrier_ok(rootdev)) {
4910 if (!netif_carrier_ok(dev))
4911 netif_carrier_on(dev);
4913 if (netif_carrier_ok(dev))
4914 netif_carrier_off(dev);
4917 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
4920 * register_netdevice - register a network device
4921 * @dev: device to register
4923 * Take a completed network device structure and add it to the kernel
4924 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4925 * chain. 0 is returned on success. A negative errno code is returned
4926 * on a failure to set up the device, or if the name is a duplicate.
4928 * Callers must hold the rtnl semaphore. You may want
4929 * register_netdev() instead of this.
4932 * The locking appears insufficient to guarantee two parallel registers
4933 * will not get the same name.
4936 int register_netdevice(struct net_device *dev)
4939 struct net *net = dev_net(dev);
4941 BUG_ON(dev_boot_phase);
4946 /* When net_devices are persistent, this will be fatal. */
4947 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
4950 spin_lock_init(&dev->addr_list_lock);
4951 netdev_set_addr_lockdep_class(dev);
4952 netdev_init_queue_locks(dev);
4957 if (!dev->num_rx_queues) {
4959 * Allocate a single RX queue if driver never called
4960 * alloc_netdev_mq
4963 dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
4969 dev->_rx->first = dev->_rx;
4970 atomic_set(&dev->_rx->count, 1);
4971 dev->num_rx_queues = 1;
4974 /* Init, if this function is available */
4975 if (dev->netdev_ops->ndo_init) {
4976 ret = dev->netdev_ops->ndo_init(dev);
4984 ret = dev_get_valid_name(dev, dev->name, 0);
4988 dev->ifindex = dev_new_index(net);
4989 if (dev->iflink == -1)
4990 dev->iflink = dev->ifindex;
4992 /* Fix illegal checksum combinations */
4993 if ((dev->features & NETIF_F_HW_CSUM) &&
4994 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4995 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4997 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5000 if ((dev->features & NETIF_F_NO_CSUM) &&
5001 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5002 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5004 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5007 dev->features = netdev_fix_features(dev->features, dev->name);
5009 /* Enable software GSO if SG is supported. */
5010 if (dev->features & NETIF_F_SG)
5011 dev->features |= NETIF_F_GSO;
5013 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5014 ret = notifier_to_errno(ret);
5018 ret = netdev_register_kobject(dev);
5021 dev->reg_state = NETREG_REGISTERED;
5024 * Default initial state at registry is that the
5025 * device is present.
5028 set_bit(__LINK_STATE_PRESENT, &dev->state);
5030 dev_init_scheduler(dev);
5032 list_netdevice(dev);
5034 /* Notify protocols, that a new device appeared. */
5035 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5036 ret = notifier_to_errno(ret);
5038 rollback_registered(dev);
5039 dev->reg_state = NETREG_UNREGISTERED;
5042 * Prevent userspace races by waiting until the network
5043 * device is fully set up before sending notifications.
5045 if (!dev->rtnl_link_ops ||
5046 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5047 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5053 if (dev->netdev_ops->ndo_uninit)
5054 dev->netdev_ops->ndo_uninit(dev);
5057 EXPORT_SYMBOL(register_netdevice);
5060 * init_dummy_netdev - init a dummy network device for NAPI
5061 * @dev: device to init
5063 * This takes a network device structure and initializes the minimum
5064 * amount of fields so it can be used to schedule NAPI polls without
5065 * registering a full blown interface. This is to be used by drivers
5066 * that need to tie several hardware interfaces to a single NAPI
5067 * poll scheduler due to HW limitations.
5069 int init_dummy_netdev(struct net_device *dev)
5071 /* Clear everything. Note we don't initialize spinlocks
5072 * as they aren't supposed to be taken by any of the
5073 * NAPI code and this dummy netdev is supposed to be
5074 * only ever used for NAPI polls
5076 memset(dev, 0, sizeof(struct net_device));
5078 /* make sure we BUG if trying to hit standard
5079 * register/unregister code path
5081 dev->reg_state = NETREG_DUMMY;
5083 /* initialize the ref count */
5084 atomic_set(&dev->refcnt, 1);
5086 /* NAPI wants this */
5087 INIT_LIST_HEAD(&dev->napi_list);
5089 /* a dummy interface is started by default */
5090 set_bit(__LINK_STATE_PRESENT, &dev->state);
5091 set_bit(__LINK_STATE_START, &dev->state);
5095 EXPORT_SYMBOL_GPL(init_dummy_netdev);
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
 *	and expands the device name if you passed a format string to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int err;

	rtnl_lock();
	/*
	 * If the name is a format string the caller wants us to do a
	 * name allocation.
	 */
	if (strchr(dev->name, '%')) {
		err = dev_alloc_name(dev, dev->name);
		if (err < 0)
			goto out;
	}

	err = register_netdevice(dev);
out:
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_netdev);
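
/*
 * Example (illustrative sketch, not part of this file): a typical driver
 * probe path. alloc_etherdev() uses the "eth%d" format string, which
 * register_netdev() expands via dev_alloc_name(). example_priv and
 * example_netdev_ops are hypothetical.
 */
#if 0
static struct net_device *example_probe(void)
{
	struct net_device *dev = alloc_etherdev(sizeof(struct example_priv));

	if (!dev)
		return NULL;
	dev->netdev_ops = &example_netdev_ops;	/* hypothetical ops table */

	if (register_netdev(dev)) {		/* takes rtnl_lock itself */
		free_netdev(dev);
		return NULL;
	}
	return dev;
}
#endif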
/**
 * netdev_wait_allrefs - wait until all references are gone.
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and clean up and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long rebroadcast_time, warning_time;

	linkwatch_forget_dev(dev);

	rebroadcast_time = warning_time = jiffies;
	while (atomic_read(&dev->refcnt) != 0) {
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
			 * should have already handled it the first time */
			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device.
				 */
				linkwatch_run_queue();
			}

			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		msleep(250);

		if (time_after(jiffies, warning_time + 10 * HZ)) {
			printk(KERN_EMERG "unregister_netdevice: "
			       "waiting for %s to become free. Usage "
			       "count = %d\n",
			       dev->name, atomic_read(&dev->refcnt));
			warning_time = jiffies;
		}
	}
}
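
/*
 * Example (illustrative sketch, not part of this file): a subsystem that
 * holds device references should drop them from a netdevice notifier so
 * netdev_wait_allrefs() can make progress. example_* names are
 * hypothetical.
 */
#if 0
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_UNREGISTER) {
		example_forget_device(dev);	/* hypothetical: drop cached state */
		dev_put(dev);			/* pairs with an earlier dev_hold() */
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_notifier = {
	.notifier_call = example_netdev_event,
};
/* registered once with register_netdevice_notifier(&example_notifier) */
#endif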
/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *	...
 *	rtnl_unlock();
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 */
void netdev_run_todo(void)
{
	struct list_head list;

	/* Snapshot list, allow later requests */
	list_replace_init(&net_todo_list, &list);

	__rtnl_unlock();

	while (!list_empty(&list)) {
		struct net_device *dev
			= list_first_entry(&list, struct net_device, todo_list);
		list_del(&dev->todo_list);

		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			printk(KERN_ERR "network todo '%s' but state %d\n",
			       dev->name, dev->reg_state);
			dump_stack();
			continue;
		}

		dev->reg_state = NETREG_UNREGISTERED;

		on_each_cpu(flush_backlog, dev, 1);

		netdev_wait_allrefs(dev);

		/* paranoia */
		BUG_ON(atomic_read(&dev->refcnt));
		WARN_ON(dev->ip_ptr);
		WARN_ON(dev->ip6_ptr);
		WARN_ON(dev->dn_ptr);

		if (dev->destructor)
			dev->destructor(dev);

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}
}
/**
 *	dev_txq_stats_fold - fold tx_queues stats
 *	@dev: device to get statistics from
 *	@stats: struct net_device_stats to hold results
 */
void dev_txq_stats_fold(const struct net_device *dev,
			struct net_device_stats *stats)
{
	unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
	unsigned int i;
	struct netdev_queue *txq;

	for (i = 0; i < dev->num_tx_queues; i++) {
		txq = netdev_get_tx_queue(dev, i);
		tx_bytes += txq->tx_bytes;
		tx_packets += txq->tx_packets;
		tx_dropped += txq->tx_dropped;
	}
	if (tx_bytes || tx_packets || tx_dropped) {
		stats->tx_bytes = tx_bytes;
		stats->tx_packets = tx_packets;
		stats->tx_dropped = tx_dropped;
	}
}
EXPORT_SYMBOL(dev_txq_stats_fold);
/**
 *	dev_get_stats	- get network device statistics
 *	@dev: device to get statistics from
 *
 *	Get network statistics from device. The device driver may provide
 *	its own method by setting dev->netdev_ops->get_stats64 or
 *	dev->netdev_ops->get_stats; otherwise the internal statistics
 *	structure is used.
 */
const struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_get_stats64)
		return ops->ndo_get_stats64(dev);
	if (ops->ndo_get_stats)
		return (struct rtnl_link_stats64 *)ops->ndo_get_stats(dev);

	dev_txq_stats_fold(dev, &dev->stats);
	return &dev->stats64;
}
EXPORT_SYMBOL(dev_get_stats);
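
/*
 * Usage sketch (illustrative, not part of this file): callers must treat
 * the result as read-only; it may point at dev->stats64 or at
 * driver-owned storage.
 */
#if 0
	const struct rtnl_link_stats64 *stats = dev_get_stats(dev);

	pr_info("%s: rx %llu tx %llu packets\n", dev->name,
		(unsigned long long)stats->rx_packets,
		(unsigned long long)stats->tx_packets);
#endif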
static void netdev_init_one_queue(struct net_device *dev,
				  struct netdev_queue *queue,
				  void *_unused)
{
	queue->dev = dev;
}

static void netdev_init_queues(struct net_device *dev)
{
	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
	spin_lock_init(&dev->tx_global_lock);
}
/**
 *	alloc_netdev_mq - allocate network device
 *	@sizeof_priv:	size of private data to allocate space for
 *	@name:		device name format string
 *	@setup:		callback to initialize device
 *	@queue_count:	the number of subqueues to allocate
 *
 *	Allocates a struct net_device with private data area for driver use
 *	and performs basic initialization.  Also allocates subqueue structs
 *	for each queue on the device at the end of the netdevice.
 */
struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
		void (*setup)(struct net_device *), unsigned int queue_count)
{
	struct netdev_queue *tx;
	struct net_device *dev;
	size_t alloc_size;
	struct net_device *p;
#ifdef CONFIG_RPS
	struct netdev_rx_queue *rx;
	int i;
#endif

	BUG_ON(strlen(name) >= sizeof(dev->name));

	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN - 1;

	p = kzalloc(alloc_size, GFP_KERNEL);
	if (!p) {
		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
		return NULL;
	}

	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
	if (!tx) {
		printk(KERN_ERR "alloc_netdev: Unable to allocate tx qdiscs.\n");
		goto free_p;
	}

#ifdef CONFIG_RPS
	rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
	if (!rx) {
		printk(KERN_ERR "alloc_netdev: Unable to allocate rx queues.\n");
		goto free_tx;
	}
	atomic_set(&rx->count, queue_count);
	/*
	 * Set a pointer to first element in the array which holds the
	 * reference count.
	 */
	for (i = 0; i < queue_count; i++)
		rx[i].first = rx;
#endif

	dev = PTR_ALIGN(p, NETDEV_ALIGN);
	dev->padded = (char *)dev - (char *)p;

	if (dev_addr_init(dev))
		goto free_rx;

	dev_mc_init(dev);
	dev_uc_init(dev);

	dev_net_set(dev, &init_net);

	dev->_tx = tx;
	dev->num_tx_queues = queue_count;
	dev->real_num_tx_queues = queue_count;
#ifdef CONFIG_RPS
	dev->_rx = rx;
	dev->num_rx_queues = queue_count;
#endif
	dev->gso_max_size = GSO_MAX_SIZE;

	netdev_init_queues(dev);

	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
	dev->ethtool_ntuple_list.count = 0;
	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	dev->priv_flags = IFF_XMIT_DST_RELEASE;
	setup(dev);
	strcpy(dev->name, name);
	return dev;

free_rx:
#ifdef CONFIG_RPS
	kfree(rx);
free_tx:
#endif
	kfree(tx);
free_p:
	kfree(p);
	return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mq);
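
/*
 * Example (illustrative sketch, not part of this file): allocating a
 * four-queue device and reaching its private area. example_priv and
 * example_setup are hypothetical; a real setup callback would fill in
 * netdev_ops, features and the like.
 */
#if 0
	struct net_device *dev;
	struct example_priv *priv;

	dev = alloc_netdev_mq(sizeof(struct example_priv), "ex%d",
			      example_setup, 4);
	if (!dev)
		return -ENOMEM;
	priv = netdev_priv(dev);	/* NETDEV_ALIGN-aligned private area */
#endif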
/**
 *	free_netdev - free network device
 *	@dev: device
 *
 *	This function does the last stage of destroying an allocated device
 *	interface. The reference to the device object is released.
 *	If this is the last reference then it will be freed.
 */
void free_netdev(struct net_device *dev)
{
	struct napi_struct *p, *n;

	release_net(dev_net(dev));

	kfree(dev->_tx);

	/* Flush device addresses */
	dev_addr_flush(dev);

	/* Clear ethtool n-tuple list */
	ethtool_ntuple_flush(dev);

	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
		netif_napi_del(p);

	/* Compatibility with error handling in drivers */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		kfree((char *)dev - dev->padded);
		return;
	}

	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
	dev->reg_state = NETREG_RELEASED;

	/* will free via device release */
	put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);
/**
 *	synchronize_net - Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 */
void synchronize_net(void)
{
	might_sleep();
	synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
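
/*
 * Usage sketch (illustrative, not part of this file): after unhooking a
 * receive handler with __dev_remove_pack(), synchronize_net() guarantees
 * no CPU is still executing it before its state is freed. The
 * example_* names are hypothetical.
 */
#if 0
	__dev_remove_pack(&example_packet_type);	/* hypothetical packet_type */
	synchronize_net();	/* no CPU can still be inside our handler */
	kfree(example_state);
#endif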
/**
 *	unregister_netdevice_queue - remove device from the kernel
 *	@dev: device
 *	@head: list
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *	If head not NULL, device is queued to be unregistered later.
 *
 *	Callers must hold the rtnl semaphore.  You may want
 *	unregister_netdev() instead of this.
 */
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
{
	ASSERT_RTNL();

	if (head) {
		list_move_tail(&dev->unreg_list, head);
	} else {
		rollback_registered(dev);
		/* Finish processing unregister after unlock */
		net_set_todo(dev);
	}
}
EXPORT_SYMBOL(unregister_netdevice_queue);
/**
 *	unregister_netdevice_many - unregister many devices
 *	@head: list of devices
 */
void unregister_netdevice_many(struct list_head *head)
{
	struct net_device *dev;

	if (!list_empty(head)) {
		rollback_registered_many(head);
		list_for_each_entry(dev, head, unreg_list)
			net_set_todo(dev);
	}
}
EXPORT_SYMBOL(unregister_netdevice_many);
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
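
/*
 * Usage sketch (illustrative, not part of this file): the usual driver
 * removal order -- the device must be unregistered before its memory is
 * released.
 */
#if 0
	unregister_netdev(dev);		/* takes rtnl_lock itself */
	free_netdev(dev);		/* drops the final reference */
#endif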
/**
 *	dev_change_net_namespace - move device to different network namespace
 *	@dev: device
 *	@net: network namespace
 *	@pat: If not NULL name pattern to try if the current device name
 *	      is already taken in the destination network namespace.
 *
 *	This function shuts down a device interface and moves it
 *	to a new network namespace. On success 0 is returned, on
 *	a failure a negative errno code is returned.
 *
 *	Callers must hold the rtnl semaphore.
 */
int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	int err;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Ensure the device has been registered */
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing to do */
	err = 0;
	if (net_eq(dev_net(dev), net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	if (__dev_get_by_name(net, dev->name)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		if (dev_get_valid_name(dev, pat, 1))
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice unregister_netdevice.
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	err = -ENODEV;
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols that we are about to destroy
	 * this device. They should clean up all their state.
	 */
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);

	/*
	 * Flush the unicast and multicast chains
	 */
	dev_uc_flush(dev);
	dev_mc_flush(dev);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);

	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex)) {
		int iflink = (dev->iflink == dev->ifindex);
		dev->ifindex = dev_new_index(net);
		if (iflink)
			dev->iflink = dev->ifindex;
	}

	/* Fixup kobjects */
	err = device_rename(&dev->dev, dev->name);
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	/*
	 * Prevent userspace races by waiting until the network
	 * device is fully setup before sending notifications.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);

	synchronize_net();
	err = 0;
out:
	return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx(skb);
		input_queue_head_incr(oldsd);
	}

	return NOTIFY_OK;
}
/**
 *	netdev_increment_features - increment feature set by one
 *	@all: current feature set
 *	@one: new feature set
 *	@mask: mask feature set
 *
 *	Computes a new feature set after adding a device with feature set
 *	@one to the master device with current feature set @all.  Will not
 *	enable anything that is off in @mask. Returns the new feature set.
 */
unsigned long netdev_increment_features(unsigned long all, unsigned long one,
					unsigned long mask)
{
	/* If device needs checksumming, downgrade to it. */
	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
	else if (mask & NETIF_F_ALL_CSUM) {
		/* If one device supports v4/v6 checksumming, set for all. */
		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
		    !(all & NETIF_F_GEN_CSUM)) {
			all &= ~NETIF_F_ALL_CSUM;
			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
		}

		/* If one device supports hw checksumming, set for all. */
		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
			all &= ~NETIF_F_ALL_CSUM;
			all |= NETIF_F_HW_CSUM;
		}
	}

	one |= NETIF_F_ALL_CSUM;

	one |= all & NETIF_F_ONE_FOR_ALL;
	all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
	all |= one & mask & NETIF_F_ONE_FOR_ALL;

	return all;
}
EXPORT_SYMBOL(netdev_increment_features);
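
/*
 * Example (illustrative sketch, not part of this file): how a
 * bonding-style master could fold its slaves' feature sets, in the
 * spirit of what drivers/net/bonding does. example_slaves and the slave
 * struct are hypothetical.
 */
#if 0
	unsigned long features = NETIF_F_ALL_CSUM | NETIF_F_SG;
	struct example_slave *s;

	list_for_each_entry(s, &example_slaves, list)
		features = netdev_increment_features(features,
						     s->dev->features,
						     master->features);
	master->features = netdev_fix_features(features, master->name);
#endif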
static struct hlist_head *netdev_create_hash(void)
{
	int i;
	struct hlist_head *hash;

	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
	if (hash != NULL)
		for (i = 0; i < NETDEV_HASHENTRIES; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}
/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	INIT_LIST_HEAD(&net->dev_base_head);

	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	return 0;

err_idx:
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}
/**
 *	netdev_drivername - network driver for the device
 *	@dev: network device
 *	@buffer: buffer for resulting name
 *	@len: size of buffer
 *
 *	Determine network driver for device.
 */
char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
{
	const struct device_driver *driver;
	const struct device *parent;

	if (len <= 0 || !buffer)
		return buffer;
	buffer[0] = 0;

	parent = dev->dev.parent;
	if (!parent)
		return buffer;

	driver = parent->driver;
	if (driver && driver->name)
		strlcpy(buffer, driver->name, len);
	return buffer;
}
static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;

	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmovable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
			       __func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}
static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace.  Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};
/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 */

/*
 *	This is called single threaded during boot, so no need
 *	to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */
	for_each_possible_cpu(i) {
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		memset(sd, 0, sizeof(*sd));
		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		sd->completion_queue = NULL;
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->csd.flags = 0;
		sd->cpu = i;
#endif
		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
		sd->backlog.gro_list = NULL;
		sd->backlog.gro_count = 0;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present too. Since we now dynamically allocate and free
	 * the loopback device, maintain this invariant by keeping the
	 * loopback device as the first device on the list of network
	 * devices: it is the first device to appear and the last
	 * network device to disappear.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);
static int __init initialize_hashrnd(void)
{
	get_random_bytes(&hashrnd, sizeof(hashrnd));
	return 0;
}

late_initcall_sync(initialize_hashrnd);