/*
 * 	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *		Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *		Paul Rusty Russell :	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <linux/if_bridge.h>
#include <linux/if_macvlan.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/wext.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <linux/pci.h>

#include "net-sysfs.h"
/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/*
 *	The list of packet types we will receive (as opposed to discard)
 *	and the routines to invoke.
 *
 *	Why 16. Because with 16 the only overlap we get on a hash of the
 *	low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *	NOTE:  That is no longer true with the addition of VLAN tags.  Not
 *		sure which should go first, but I bet it won't make much
 *		difference if we are running VLANs.  The good news is that
 *		this protocol won't be in the list unless compiled in, so
 *		the average user (w/out VLANs) will not be adversely affected.
 */

#define PTYPE_HASH_SIZE	(16)
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;	/* Taps */

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *queue)
{
#ifdef CONFIG_SMP
	spin_lock(&queue->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *queue)
{
#ifdef CONFIG_SMP
	spin_unlock(&queue->input_pkt_queue.lock);
#endif
}
/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);
	return 0;
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);
}
/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
	 ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
	 "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif
/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 */
/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	int hash;

	spin_lock_bh(&ptype_lock);
	if (pt->type == htons(ETH_P_ALL))
		list_add_rcu(&pt->list, &ptype_all);
	else {
		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
		list_add_rcu(&pt->list, &ptype_base[hash]);
	}
	spin_unlock_bh(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
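
/*
 * Illustrative sketch (not part of this file, hedged): a minimal protocol
 * tap registered with dev_add_pack().  The handler and variable names are
 * hypothetical; a real handler must consume or free the skb it is given.
 */
#if 0
static int example_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	/* Inspect the frame, then release our reference to it. */
	kfree_skb(skb);
	return 0;
}

static struct packet_type example_pt __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),	/* tap every protocol (ptype_all) */
	.func = example_rcv,
};

/* dev_add_pack(&example_pt) at init; dev_remove_pack(&example_pt) at exit. */
#endif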
/**
 *	__dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head;
	struct packet_type *pt1;

	spin_lock_bh(&ptype_lock);

	if (pt->type == htons(ETH_P_ALL))
		head = &ptype_all;
	else
		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
out:
	spin_unlock_bh(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine to
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}
/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq	= s[i].map.irq;
			dev->base_addr	= s[i].map.base_addr;
			dev->mem_start	= s[i].map.mem_start;
			dev->mem_end	= s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);
/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}
/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
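
/*
 * Worked example (hedged, illustrative only): given the parsing above, a
 * command line such as "netdev=5,0x300,eth0" should leave ints[0] == 2,
 * map.irq == 5 and map.base_addr == 0x300, with get_options() returning a
 * pointer to the remaining string "eth0", which becomes the entry name
 * passed to netdev_boot_setup_add().
 */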
/*******************************************************************************

		    Device Interface Subroutines

*******************************************************************************/

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);
/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
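
/*
 * Illustrative sketch (not part of this file): the refcounted lookup
 * pattern dev_get_by_name() implies.  The "eth0" name and the use of
 * init_net are hypothetical.
 */
#if 0
static void example_lookup(void)
{
	struct net_device *dev = dev_get_by_name(&init_net, "eth0");

	if (dev) {
		/* ... use dev ... */
		dev_put(dev);	/* drop the reference taken by the lookup */
	}
}
#endif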
/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);

/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);
/**
 *	dev_getbyhwaddr - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device. The caller must hold the
 *	rtnl semaphore. The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking
 *
 *	BUGS:
 *	If the API was consistent this would be __dev_get_by_hwaddr
 */

struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
{
	struct net_device *dev;

	ASSERT_RTNL();

	for_each_netdev(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();

	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);
/**
 *	dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
				    unsigned short mask)
{
	struct net_device *dev, *ret;

	ret = NULL;
	rcu_read_lock();
	for_each_netdev_rcu(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_get_by_flags);
/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
int dev_valid_name(const char *name)
{
	if (*name == '\0')
		return 0;
	if (strlen(name) >= IFNAMSIZ)
		return 0;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return 0;

	while (*name) {
		if (*name == '/' || isspace(*name))
			return 0;
		name++;
	}
	return 1;
}
EXPORT_SYMBOL(dev_valid_name);
/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
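
/*
 * Illustrative sketch (not part of this file): how a driver would ask for
 * the next free "eth%d" slot before registering.  The label and error
 * handling are hypothetical.
 */
#if 0
	err = dev_alloc_name(dev, "eth%d");	/* e.g. dev->name becomes "eth3" */
	if (err < 0)
		goto out_free;
#endif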
static int dev_get_valid_name(struct net *net, const char *name, char *buf,
			      bool fmt)
{
	if (!dev_valid_name(name))
		return -EINVAL;

	if (fmt && strchr(name, '%'))
		return __dev_alloc_name(net, name, buf);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (buf != name)
		strlcpy(buf, name, IFNAMSIZ);

	return 0;
}
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device; can pass format strings "eth%d"
 *	for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
		return 0;

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, newname, dev->name, 1);
	if (err < 0)
		return err;

rollback:
	/* For now only devices in the initial network namespace
	 * are in sysfs.
	 */
	if (net_eq(net, &init_net)) {
		ret = device_rename(&dev->dev, dev->name);
		if (ret) {
			memcpy(dev->name, oldname, IFNAMSIZ);
			return ret;
		}
	}

	write_lock_bh(&dev_base_lock);
	hlist_del(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			printk(KERN_ERR
			       "%s: name change rollback failed: %d.\n",
			       dev->name, ret);
		}
	}

	return err;
}
/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		if (dev->ifalias) {
			kfree(dev->ifalias);
			dev->ifalias = NULL;
		}
		return 0;
	}

	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!dev->ifalias)
		return -ENOMEM;

	strlcpy(dev->ifalias, alias, len+1);
	return 0;
}
/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifiers(NETDEV_CHANGE, dev);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
	}
}
EXPORT_SYMBOL(netdev_state_change);

int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	return call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);
/**
 *	dev_load 	- load a network module
 *	@net: the applicable net namespace
 *	@name: name of interface
 *
 *	If a network interface is not present and the process has suitable
 *	privileges this function loads the module. If module loading is not
 *	available in this kernel then it becomes a nop.
 */

void dev_load(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	rcu_read_unlock();

	if (!dev && capable(CAP_NET_ADMIN))
		request_module("%s", name);
}
EXPORT_SYMBOL(dev_load);
static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	/*
	 *	Is it even present?
	 */
	if (!netif_device_present(dev))
		return -ENODEV;

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	/*
	 *	Call device private open method
	 */
	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	/*
	 *	If it went open OK then:
	 */
	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		/*
		 *	Set the flags.
		 */
		dev->flags |= IFF_UP;

		/*
		 *	Enable NET_DMA
		 */
		net_dmaengine_get();

		/*
		 *	Initialize multicasting status
		 */
		dev_set_rx_mode(dev);

		/*
		 *	Wakeup transmit queue engine
		 */
		dev_activate(dev);
	}

	return ret;
}

/**
 *	dev_open	- prepare an interface for use.
 *	@dev:	device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	/*
	 *	Is it already up?
	 */
	if (dev->flags & IFF_UP)
		return 0;

	/*
	 *	Open device
	 */
	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	/*
	 *	... and announce new interface.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);
static int __dev_close(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	ASSERT_RTNL();
	might_sleep();

	/*
	 *	Tell people we are going down, so that they can
	 *	prepare for death, while the device is still operating.
	 */
	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

	clear_bit(__LINK_STATE_START, &dev->state);

	/* Synchronize to scheduled poll. We cannot touch poll list,
	 * it can be even on different cpu. So just clear netif_running().
	 *
	 * dev->stop() will invoke napi_disable() on all of its
	 * napi_struct instances on this device.
	 */
	smp_mb__after_clear_bit(); /* Commit netif_running(). */

	dev_deactivate(dev);

	/*
	 *	Call the device specific close. This cannot fail.
	 *	Only if device is UP
	 *
	 *	We allow it to be called even after a DETACH hot-plug
	 *	event.
	 */
	if (ops->ndo_stop)
		ops->ndo_stop(dev);

	/*
	 *	Device is now down.
	 */
	dev->flags &= ~IFF_UP;

	/*
	 *	Shutdown NET_DMA
	 */
	net_dmaengine_put();

	return 0;
}

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	if (!(dev->flags & IFF_UP))
		return 0;

	__dev_close(dev);

	/*
	 *	Tell people we are down
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_DOWN, dev);

	return 0;
}
EXPORT_SYMBOL(dev_close);
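
/*
 * Illustrative sketch (not part of this file): dev_open() and dev_close()
 * both assume the caller holds the RTNL lock, as in this hypothetical
 * helper that bounces an interface.
 */
#if 0
static int example_bounce(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_open(dev);
	if (!err)
		dev_close(dev);
	rtnl_unlock();
	return err;
}
#endif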
/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
	    dev->ethtool_ops->set_flags) {
		u32 flags = dev->ethtool_ops->get_flags(dev);
		if (flags & ETH_FLAG_LRO) {
			flags &= ~ETH_FLAG_LRO;
			dev->ethtool_ops->set_flags(dev, flags);
		}
	}
	WARN_ON(dev->features & NETIF_F_LRO);
}
EXPORT_SYMBOL(dev_disable_lro);
static int dev_boot_phase = 1;

/*
 *	Device change register/unregister. These are not inline or static
 *	as we export them to the world.
 */

/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 * 	When registered all registration and up events are replayed
 *	to the new notifier to allow device to have a race free
 *	view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			nb->notifier_call(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				break;

			if (dev->flags & IFF_UP) {
				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
				nb->notifier_call(nb, NETDEV_DOWN, dev);
			}
			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
		}
	}

	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
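
/*
 * Illustrative sketch (not part of this file): a minimal notifier that
 * logs NETDEV_UP events.  The names are hypothetical; note that in this
 * kernel the notifier payload pointer is the struct net_device itself.
 */
#if 0
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_UP)
		printk(KERN_INFO "%s is up\n", dev->name);
	return NOTIFY_DONE;
}

static struct notifier_block example_nb = {
	.notifier_call = example_netdev_event,
};

/* register_netdevice_notifier(&example_nb) replays REGISTER/UP events. */
#endif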
/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	return raw_notifier_call_chain(&netdev_chain, val, dev);
}
/* When > 0 there are consumers of rx skb time stamps */
static atomic_t netstamp_needed = ATOMIC_INIT(0);

void net_enable_timestamp(void)
{
	atomic_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
	atomic_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);

static inline void net_timestamp(struct sk_buff *skb)
{
	if (atomic_read(&netstamp_needed))
		__net_timestamp(skb);
	else
		skb->tstamp.tv64 = 0;
}
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP	(packet was dropped)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	skb_orphan(skb);

	if (!(dev->flags & IFF_UP))
		return NET_RX_DROP;

	if (skb->len > (dev->mtu + dev->hard_header_len))
		return NET_RX_DROP;

	skb_set_dev(skb, dev);
	skb->tstamp.tv64 = 0;
	skb->pkt_type = PACKET_HOST;
	skb->protocol = eth_type_trans(skb, dev);
	return netif_rx(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;

#ifdef CONFIG_NET_CLS_ACT
	if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
		net_timestamp(skb);
#else
	net_timestamp(skb);
#endif

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if ((ptype->dev == dev || !ptype->dev) &&
		    (ptype->af_packet_priv == NULL ||
		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
			if (!skb2)
				break;

			/* skb->nh should be correctly
			   set by sender, so that the second statement is
			   just protection against buggy protocols.
			 */
			skb_reset_mac_header(skb2);

			if (skb_network_header(skb2) < skb2->data ||
			    skb2->network_header > skb2->tail) {
				if (net_ratelimit())
					printk(KERN_CRIT "protocol %04x is "
					       "buggy, dev %s\n",
					       skb2->protocol, dev->name);
				skb_reset_network_header(skb2);
			}

			skb2->transport_header = skb2->network_header;
			skb2->pkt_type = PACKET_OUTGOING;
			ptype->func(skb2, skb->dev, ptype, skb->dev);
		}
	}
	rcu_read_unlock();
}
static inline void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = &__get_cpu_var(softnet_data);
	q->next_sched = sd->output_queue;
	sd->output_queue = q;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}

void __netif_schedule(struct Qdisc *q)
{
	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
		__netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);

void dev_kfree_skb_irq(struct sk_buff *skb)
{
	if (atomic_dec_and_test(&skb->users)) {
		struct softnet_data *sd;
		unsigned long flags;

		local_irq_save(flags);
		sd = &__get_cpu_var(softnet_data);
		skb->next = sd->completion_queue;
		sd->completion_queue = skb;
		raise_softirq_irqoff(NET_TX_SOFTIRQ);
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL(dev_kfree_skb_irq);

void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled())
		dev_kfree_skb_irq(skb);
	else
		dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
/**
 * netif_device_detach - mark device as removed
 * @dev: network device
 *
 * Mark device as removed from system and therefore no longer available.
 */
void netif_device_detach(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_stop_all_queues(dev);
	}
}
EXPORT_SYMBOL(netif_device_detach);

/**
 * netif_device_attach - mark device as attached
 * @dev: network device
 *
 * Mark device as attached from system and restart if needed.
 */
void netif_device_attach(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_wake_all_queues(dev);
		__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_device_attach);
static bool can_checksum_protocol(unsigned long features, __be16 protocol)
{
	return ((features & NETIF_F_GEN_CSUM) ||
		((features & NETIF_F_IP_CSUM) &&
		 protocol == htons(ETH_P_IP)) ||
		((features & NETIF_F_IPV6_CSUM) &&
		 protocol == htons(ETH_P_IPV6)) ||
		((features & NETIF_F_FCOE_CRC) &&
		 protocol == htons(ETH_P_FCOE)));
}

static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
{
	if (can_checksum_protocol(dev->features, skb->protocol))
		return true;

	if (skb->protocol == htons(ETH_P_8021Q)) {
		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
		if (can_checksum_protocol(dev->features & dev->vlan_features,
					  veh->h_vlan_encapsulated_proto))
			return true;
	}

	return false;
}

/**
 * skb_dev_set -- assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * If an skb is owned by a device already, we have to reset
 * all data private to the namespace a device belongs to
 * before assigning it a new device.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	skb_dst_drop(skb);
	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}
	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		/* Let GSO fix up the checksum. */
		goto out_set_summed;
	}

	offset = skb->csum_start - skb_headroom(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
/**
 *	skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	int err;

	skb_reset_mac_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	__skb_pull(skb, skb->mac_len);

	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
		struct net_device *dev = skb->dev;
		struct ethtool_drvinfo info = {};

		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
			dev->ethtool_ops->get_drvinfo(dev, &info);

		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
			"ip_summed=%d\n",
		     info.driver, dev ? dev->features : 0L,
		     skb->sk ? skb->sk->sk_route_caps : 0L,
		     skb->len, skb->data_len, skb->ip_summed);

		if (skb_header_cloned(skb) &&
		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			return ERR_PTR(err);
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				err = ptype->gso_send_check(skb);
				segs = ERR_PTR(err);
				if (err || skb_gso_ok(skb, features))
					break;
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_gso_segment);

/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (net_ratelimit()) {
		printk(KERN_ERR "%s: hw csum failure.\n",
			dev ? dev->name : "<unknown>");
		dump_stack();
	}
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
				return 1;
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *pdev = dev->dev.parent;

		if (!pdev)
			return 0;
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}

struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)

static void dev_gso_skb_destructor(struct sk_buff *skb)
{
	struct dev_gso_cb *cb;

	do {
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;
		kfree_skb(nskb);
	} while (skb->next);

	cb = DEV_GSO_CB(skb);
	if (cb->destructor)
		cb->destructor(skb);
}

/**
 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
 *	@skb: buffer to segment
 *
 *	This function segments the given skb and stores the list of segments
 *	in skb->next.
 */
static int dev_gso_segment(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct sk_buff *segs;
	int features = dev->features & ~(illegal_highdma(dev, skb) ?
					 NETIF_F_SG : 0);

	segs = skb_gso_segment(skb, features);

	/* Verifying header integrity only. */
	if (!segs)
		return 0;

	if (IS_ERR(segs))
		return PTR_ERR(segs);

	skb->next = segs;
	DEV_GSO_CB(skb)->destructor = skb->destructor;
	skb->destructor = dev_gso_skb_destructor;

	return 0;
}
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int rc = NETDEV_TX_OK;

	if (likely(!skb->next)) {
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		if (netif_needs_gso(dev, skb)) {
			if (unlikely(dev_gso_segment(skb)))
				goto out_kfree_skb;
			if (skb->next)
				goto gso;
		}

		/*
		 * If device doesn't need skb->dst, release it right now while
		 * it's hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(skb);

		rc = ops->ndo_start_xmit(skb, dev);
		if (rc == NETDEV_TX_OK)
			txq_trans_update(txq);
		/*
		 * TODO: if skb_orphan() was called by
		 * dev->hard_start_xmit() (for example, the unmodified
		 * igb driver does that; bnx2 doesn't), then
		 * skb_tx_software_timestamp() will be unable to send
		 * back the time stamp.
		 *
		 * How can this be prevented? Always create another
		 * reference to the socket before calling
		 * dev->hard_start_xmit()? Prevent that skb_orphan()
		 * does anything in dev->hard_start_xmit() by clearing
		 * the skb destructor before the call and restoring it
		 * afterwards, then doing the skb_orphan() ourselves?
		 */
		return rc;
	}

gso:
	do {
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;

		/*
		 * If device doesn't need nskb->dst, release it right now while
		 * it's hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(nskb);

		rc = ops->ndo_start_xmit(nskb, dev);
		if (unlikely(rc != NETDEV_TX_OK)) {
			if (rc & ~NETDEV_TX_MASK)
				goto out_kfree_gso_skb;
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		txq_trans_update(txq);
		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

out_kfree_gso_skb:
	if (likely(skb->next == NULL))
		skb->destructor = DEV_GSO_CB(skb)->destructor;
out_kfree_skb:
	kfree_skb(skb);
	return rc;
}
static u32 hashrnd __read_mostly;

u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
{
	u32 hash;

	if (skb_rx_queue_recorded(skb)) {
		hash = skb_get_rx_queue(skb);
		while (unlikely(hash >= dev->real_num_tx_queues))
			hash -= dev->real_num_tx_queues;
		return hash;
	}

	if (skb->sk && skb->sk->sk_hash)
		hash = skb->sk->sk_hash;
	else
		hash = skb->protocol;

	hash = jhash_1word(hash, hashrnd);

	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
}
EXPORT_SYMBOL(skb_tx_hash);
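
/*
 * Illustrative note (hedged, not from the original source): the final line
 * above avoids a modulo by scaling the 32-bit hash into
 * [0, real_num_tx_queues).  For example, with 4 queues, hash 0x80000000
 * maps to ((u64)0x80000000 * 4) >> 32 == 2.
 */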
static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
{
	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
		if (net_ratelimit()) {
			pr_warning("%s selects TX queue %d, but "
				   "real number of TX queues is %d\n",
				   dev->name, queue_index, dev->real_num_tx_queues);
		}
		return 0;
	}
	return queue_index;
}

static struct netdev_queue *dev_pick_tx(struct net_device *dev,
					struct sk_buff *skb)
{
	u16 queue_index;
	struct sock *sk = skb->sk;

	if (sk_tx_queue_recorded(sk)) {
		queue_index = sk_tx_queue_get(sk);
	} else {
		const struct net_device_ops *ops = dev->netdev_ops;

		if (ops->ndo_select_queue) {
			queue_index = ops->ndo_select_queue(dev, skb);
			queue_index = dev_cap_txqueue(dev, queue_index);
		} else {
			queue_index = 0;
			if (dev->real_num_tx_queues > 1)
				queue_index = skb_tx_hash(dev, skb);

			if (sk && rcu_dereference_check(sk->sk_dst_cache, 1))
				sk_tx_queue_set(sk, queue_index);
		}
	}

	skb_set_queue_mapping(skb, queue_index);
	return netdev_get_tx_queue(dev, queue_index);
}
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	int rc;

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		kfree_skb(skb);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */
		__qdisc_update_bstats(q, skb->len);
		if (sch_direct_xmit(skb, q, dev, txq, root_lock))
			__qdisc_run(q);
		else
			clear_bit(__QDISC_STATE_RUNNING, &q->state);

		rc = NET_XMIT_SUCCESS;
	} else {
		rc = qdisc_enqueue_root(skb, q);
		qdisc_run(q);
	}
	spin_unlock(root_lock);

	return rc;
}

/*
 * Returns true if either:
 *	1. skb has frag_list and the device doesn't support FRAGLIST, or
 *	2. skb is fragmented and the device does not support SG, or if
 *	   at least one of fragments is in highmem and device does not
 *	   support DMA from it.
 */
static inline int skb_needs_linearize(struct sk_buff *skb,
				      struct net_device *dev)
{
	return (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
	       (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
					      illegal_highdma(dev, skb)));
}
/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *	I notice this method can also return errors from the queue disciplines,
 *	including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *	be positive.
 *
 *	Regardless of the return value, the skb is consumed, so it is currently
 *	difficult to retry a send to this method.  (You can bump the ref count
 *	before sending to hold a reference for retry if you are careful.)
 *
 *	When calling this method, interrupts MUST be enabled.  This is because
 *	the BH enable code must have IRQs enabled so that it will not deadlock.
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	/* GSO will handle the following emulations directly. */
	if (netif_needs_gso(dev, skb))
		goto gso;

	/* Convert a paged skb to linear, if required */
	if (skb_needs_linearize(skb, dev) && __skb_linearize(skb))
		goto out_kfree_skb;

	/* If packet is not checksummed and device does not support
	 * checksumming for this protocol, complete checksumming here.
	 */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		skb_set_transport_header(skb, skb->csum_start -
					      skb_headroom(skb));
		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
			goto out_kfree_skb;
	}

gso:
	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	txq = dev_pick_tx(dev, skb);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and shot the lock. It is not prone from deadlocks.
	   Either shot noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		if (txq->xmit_lock_owner != cpu) {

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_tx_queue_stopped(txq)) {
				rc = dev_hard_start_xmit(skb, dev, txq);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			if (net_ratelimit())
				printk(KERN_CRIT "Virtual device %s asks to "
				       "queue packet!\n", dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
			if (net_ratelimit())
				printk(KERN_CRIT "Dead loop on virtual device "
				       "%s, fix it urgently!\n", dev->name);
		}
	}

	rc = -ENETDOWN;
	rcu_read_unlock_bh();

out_kfree_skb:
	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit);
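
/*
 * Illustrative sketch (not part of this file, hedged): the minimal contract
 * for a caller of dev_queue_xmit().  The helper name, sizes and payload are
 * hypothetical, link-layer header construction is omitted, and the skb is
 * consumed regardless of the return value.
 */
#if 0
static int example_xmit(struct net_device *dev, const void *payload, int len)
{
	struct sk_buff *skb = alloc_skb(LL_RESERVED_SPACE(dev) + len,
					GFP_ATOMIC);

	if (!skb)
		return -ENOMEM;
	skb_reserve(skb, LL_RESERVED_SPACE(dev));	/* room for headers */
	memcpy(skb_put(skb, len), payload, len);
	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);		/* hypothetical */
	return dev_queue_xmit(skb);			/* consumes skb */
}
#endif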
/*=======================================================================
			Receiver routines
  =======================================================================*/

int netdev_max_backlog __read_mostly = 1000;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;		/* old backlog weight */

DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };

#ifdef CONFIG_SMP
/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
{
	struct ipv6hdr *ip6;
	struct iphdr *ip;
	struct netdev_rx_queue *rxqueue;
	struct rps_map *map;
	int cpu = -1;
	u8 ip_proto;
	u32 addr1, addr2, ports, ihl;

	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);
		if (unlikely(index >= dev->num_rx_queues)) {
			if (net_ratelimit()) {
				pr_warning("%s received packet on queue "
					   "%u, but number of RX queues is %u\n",
					   dev->name, index, dev->num_rx_queues);
			}
			goto done;
		}
		rxqueue = dev->_rx + index;
	} else
		rxqueue = dev->_rx;

	if (!rxqueue->rps_map)
		goto done;

	if (skb->rxhash)
		goto got_hash; /* Skip hash computation on packet header */

	switch (skb->protocol) {
	case __constant_htons(ETH_P_IP):
		if (!pskb_may_pull(skb, sizeof(*ip)))
			goto done;

		ip = (struct iphdr *) skb->data;
		ip_proto = ip->protocol;
		addr1 = ip->saddr;
		addr2 = ip->daddr;
		ihl = ip->ihl;
		break;
	case __constant_htons(ETH_P_IPV6):
		if (!pskb_may_pull(skb, sizeof(*ip6)))
			goto done;

		ip6 = (struct ipv6hdr *) skb->data;
		ip_proto = ip6->nexthdr;
		addr1 = ip6->saddr.s6_addr32[3];
		addr2 = ip6->daddr.s6_addr32[3];
		ihl = (40 >> 2);
		break;
	default:
		goto done;
	}
	ports = 0;
	switch (ip_proto) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_DCCP:
	case IPPROTO_ESP:
	case IPPROTO_AH:
	case IPPROTO_SCTP:
	case IPPROTO_UDPLITE:
		if (pskb_may_pull(skb, (ihl * 4) + 4))
			ports = *((u32 *) (skb->data + (ihl * 4)));
		break;

	default:
		break;
	}

	skb->rxhash = jhash_3words(addr1, addr2, ports, hashrnd);
	if (!skb->rxhash)
		skb->rxhash = 1;

got_hash:
	map = rcu_dereference(rxqueue->rps_map);
	if (map) {
		u16 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];

		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}

done:
	return cpu;
}
/*
 * This structure holds the per-CPU mask of CPUs for which IPIs are scheduled
 * to be sent to kick remote softirq processing.  There are two masks since
 * the sending of IPIs must be done with interrupts enabled.  The select field
 * indicates the current mask that enqueue_backlog uses to schedule IPIs.
 * select is flipped before net_rps_action is called while still under lock,
 * net_rps_action then uses the non-selected mask to send the IPIs and clears
 * it without conflicting with enqueue_backlog operation.
 */
struct rps_remote_softirq_cpus {
	cpumask_t mask[2];
	int select;
};
static DEFINE_PER_CPU(struct rps_remote_softirq_cpus, rps_remote_softirq_cpus);

/* Called from hardirq (IPI) context */
static void trigger_softirq(void *data)
{
	struct softnet_data *queue = data;

	__napi_schedule(&queue->backlog);
	__get_cpu_var(netdev_rx_stat).received_rps++;
}
#endif /* CONFIG_SMP */
/*
 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 * queue (may be a remote CPU queue).
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
{
	struct softnet_data *queue;
	unsigned long flags;

	queue = &per_cpu(softnet_data, cpu);

	local_irq_save(flags);
	__get_cpu_var(netdev_rx_stat).total++;

	rps_lock(queue);
	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
		if (queue->input_pkt_queue.qlen) {
enqueue:
			__skb_queue_tail(&queue->input_pkt_queue, skb);
			rps_unlock(queue);
			local_irq_restore(flags);
			return NET_RX_SUCCESS;
		}

		/* Schedule NAPI for backlog device */
		if (napi_schedule_prep(&queue->backlog)) {
#ifdef CONFIG_SMP
			if (cpu != smp_processor_id()) {
				struct rps_remote_softirq_cpus *rcpus =
				    &__get_cpu_var(rps_remote_softirq_cpus);

				cpu_set(cpu, rcpus->mask[rcpus->select]);
				__raise_softirq_irqoff(NET_RX_SOFTIRQ);
			} else
				__napi_schedule(&queue->backlog);
#else
			__napi_schedule(&queue->backlog);
#endif
		}
		goto enqueue;
	}

	rps_unlock(queue);

	__get_cpu_var(netdev_rx_stat).dropped++;
	local_irq_restore(flags);

	kfree_skb(skb);
	return NET_RX_DROP;
}
2377 * netif_rx - post buffer to the network code
2378 * @skb: buffer to post
2380 * This function receives a packet from a device driver and queues it for
2381 * the upper (protocol) levels to process. It always succeeds. The buffer
2382 * may be dropped during processing for congestion control or by the
2383 * protocol layers.
2385 * Return values:
2386 * NET_RX_SUCCESS (no congestion)
2387 * NET_RX_DROP (packet was dropped)
2391 int netif_rx(struct sk_buff *skb)
2395 /* if netpoll wants it, pretend we never saw it */
2396 if (netpoll_rx(skb))
2399 if (!skb->tstamp.tv64)
2407 cpu = get_rps_cpu(skb->dev, skb);
2409 cpu = smp_processor_id();
2410 ret = enqueue_to_backlog(skb, cpu);
2414 ret = enqueue_to_backlog(skb, get_cpu());
2419 EXPORT_SYMBOL(netif_rx);
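/*
 * Illustrative sketch (not part of the original file): a non-NAPI driver
 * hands each received frame to netif_rx() from its interrupt handler.
 * my_isr and my_build_rx_skb are hypothetical names.
 */
static struct sk_buff *my_build_rx_skb(struct net_device *dev);	/* hypothetical */

static irqreturn_t my_isr(int irq, void *dev_id)
{
	struct net_device *dev = dev_id;
	struct sk_buff *skb = my_build_rx_skb(dev);

	if (skb) {
		skb->protocol = eth_type_trans(skb, dev);
		netif_rx(skb);	/* queued on a backlog, delivered in softirq */
	}
	return IRQ_HANDLED;
}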
2421 int netif_rx_ni(struct sk_buff *skb)
2426 err = netif_rx(skb);
2427 if (local_softirq_pending())
2433 EXPORT_SYMBOL(netif_rx_ni);
2435 static void net_tx_action(struct softirq_action *h)
2437 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2439 if (sd->completion_queue) {
2440 struct sk_buff *clist;
2442 local_irq_disable();
2443 clist = sd->completion_queue;
2444 sd->completion_queue = NULL;
2448 struct sk_buff *skb = clist;
2449 clist = clist->next;
2451 WARN_ON(atomic_read(&skb->users));
2456 if (sd->output_queue) {
2459 local_irq_disable();
2460 head = sd->output_queue;
2461 sd->output_queue = NULL;
2465 struct Qdisc *q = head;
2466 spinlock_t *root_lock;
2468 head = head->next_sched;
2470 root_lock = qdisc_lock(q);
2471 if (spin_trylock(root_lock)) {
2472 smp_mb__before_clear_bit();
2473 clear_bit(__QDISC_STATE_SCHED,
2476 spin_unlock(root_lock);
2478 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2480 __netif_reschedule(q);
2482 smp_mb__before_clear_bit();
2483 clear_bit(__QDISC_STATE_SCHED,
2491 static inline int deliver_skb(struct sk_buff *skb,
2492 struct packet_type *pt_prev,
2493 struct net_device *orig_dev)
2495 atomic_inc(&skb->users);
2496 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
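/*
 * Illustrative sketch (not part of the original file): a minimal
 * ETH_P_ALL tap of the kind deliver_skb() above feeds.  The handler owns
 * the reference deliver_skb() took with atomic_inc() and must drop it.
 * my_tap_rcv and my_tap are hypothetical names.
 */
static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	/* ... inspect skb here ... */
	kfree_skb(skb);		/* consume the reference deliver_skb() gave us */
	return 0;
}

static struct packet_type my_tap __read_mostly = {
	.type = __constant_htons(ETH_P_ALL),
	.func = my_tap_rcv,
};
/* dev_add_pack(&my_tap) at init time, dev_remove_pack(&my_tap) on exit. */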
2499 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
2501 #if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
2502 /* This hook is defined here for ATM LANE */
2503 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2504 unsigned char *addr) __read_mostly;
2505 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2509 * If bridge module is loaded call bridging hook.
2510 * Returns NULL if the packet was consumed.
2512 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2513 struct sk_buff *skb) __read_mostly;
2514 EXPORT_SYMBOL_GPL(br_handle_frame_hook);
2516 static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2517 struct packet_type **pt_prev, int *ret,
2518 struct net_device *orig_dev)
2520 struct net_bridge_port *port;
2522 if (skb->pkt_type == PACKET_LOOPBACK ||
2523 (port = rcu_dereference(skb->dev->br_port)) == NULL)
2527 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2531 return br_handle_frame_hook(port, skb);
2534 #define handle_bridge(skb, pt_prev, ret, orig_dev) (skb)
2537 #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2538 struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
2539 EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2541 static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2542 struct packet_type **pt_prev,
2544 struct net_device *orig_dev)
2546 if (skb->dev->macvlan_port == NULL)
2550 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2553 return macvlan_handle_frame_hook(skb);
2556 #define handle_macvlan(skb, pt_prev, ret, orig_dev) (skb)
2559 #ifdef CONFIG_NET_CLS_ACT
2560 /* TODO: Maybe we should just force sch_ingress to be compiled in
2561 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
2562 * instructions (a compare and two stores) when we don't have it on
2563 * but do have CONFIG_NET_CLS_ACT.
2564 * NOTE: This doesn't stop any functionality; if you don't have
2565 * the ingress scheduler, you just can't add policies on ingress.
2568 static int ing_filter(struct sk_buff *skb)
2570 struct net_device *dev = skb->dev;
2571 u32 ttl = G_TC_RTTL(skb->tc_verd);
2572 struct netdev_queue *rxq;
2573 int result = TC_ACT_OK;
2576 if (MAX_RED_LOOP < ttl++) {
2578 "Redir loop detected Dropping packet (%d->%d)\n",
2579 skb->skb_iif, dev->ifindex);
2583 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2584 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2586 rxq = &dev->rx_queue;
2589 if (q != &noop_qdisc) {
2590 spin_lock(qdisc_lock(q));
2591 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2592 result = qdisc_enqueue_root(skb, q);
2593 spin_unlock(qdisc_lock(q));
2599 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2600 struct packet_type **pt_prev,
2601 int *ret, struct net_device *orig_dev)
2603 if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2607 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2610 /* Huh? Why does turning on AF_PACKET affect this? */
2611 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2614 switch (ing_filter(skb)) {
2628 * netif_nit_deliver - deliver received packets to network taps
2631 * This function is used to deliver incoming packets to network
2632 * taps. It should be used when the normal netif_receive_skb path
2633 * is bypassed, for example because of VLAN acceleration.
2635 void netif_nit_deliver(struct sk_buff *skb)
2637 struct packet_type *ptype;
2639 if (list_empty(&ptype_all))
2642 skb_reset_network_header(skb);
2643 skb_reset_transport_header(skb);
2644 skb->mac_len = skb->network_header - skb->mac_header;
2647 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2648 if (!ptype->dev || ptype->dev == skb->dev)
2649 deliver_skb(skb, ptype, skb->dev);
2654 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2655 struct net_device *master)
2657 if (skb->pkt_type == PACKET_HOST) {
2658 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2660 memcpy(dest, master->dev_addr, ETH_ALEN);
2664 /* On bonding slaves other than the currently active slave, suppress
2665 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2666 * ARP on active-backup slaves with arp_validate enabled.
2668 int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2670 struct net_device *dev = skb->dev;
2672 if (master->priv_flags & IFF_MASTER_ARPMON)
2673 dev->last_rx = jiffies;
2675 if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
2676 /* Do address unmangle. The local destination address
2677 * will be always the one master has. Provides the right
2678 * functionality in a bridge.
2680 skb_bond_set_mac_by_master(skb, master);
2683 if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2684 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2685 skb->protocol == __cpu_to_be16(ETH_P_ARP))
2688 if (master->priv_flags & IFF_MASTER_ALB) {
2689 if (skb->pkt_type != PACKET_BROADCAST &&
2690 skb->pkt_type != PACKET_MULTICAST)
2693 if (master->priv_flags & IFF_MASTER_8023AD &&
2694 skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2701 EXPORT_SYMBOL(__skb_bond_should_drop);
2703 static int __netif_receive_skb(struct sk_buff *skb)
2705 struct packet_type *ptype, *pt_prev;
2706 struct net_device *orig_dev;
2707 struct net_device *master;
2708 struct net_device *null_or_orig;
2709 struct net_device *null_or_bond;
2710 int ret = NET_RX_DROP;
2713 if (!skb->tstamp.tv64)
2716 if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
2717 return NET_RX_SUCCESS;
2719 /* if we've gotten here through NAPI, check netpoll */
2720 if (netpoll_receive_skb(skb))
2724 skb->skb_iif = skb->dev->ifindex;
2726 null_or_orig = NULL;
2727 orig_dev = skb->dev;
2728 master = ACCESS_ONCE(orig_dev->master);
2730 if (skb_bond_should_drop(skb, master))
2731 null_or_orig = orig_dev; /* deliver only exact match */
2736 __get_cpu_var(netdev_rx_stat).total++;
2738 skb_reset_network_header(skb);
2739 skb_reset_transport_header(skb);
2740 skb->mac_len = skb->network_header - skb->mac_header;
2746 #ifdef CONFIG_NET_CLS_ACT
2747 if (skb->tc_verd & TC_NCLS) {
2748 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2753 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2754 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2755 ptype->dev == orig_dev) {
2757 ret = deliver_skb(skb, pt_prev, orig_dev);
2762 #ifdef CONFIG_NET_CLS_ACT
2763 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2769 skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2772 skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2777 * Make sure frames received on VLAN interfaces stacked on
2778 * bonding interfaces still make their way to any base bonding
2779 * device that may have registered for a specific ptype. The
2780 * handler may have to adjust skb->dev and orig_dev.
2782 null_or_bond = NULL;
2783 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
2784 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
2785 null_or_bond = vlan_dev_real_dev(skb->dev);
2788 type = skb->protocol;
2789 list_for_each_entry_rcu(ptype,
2790 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2791 if (ptype->type == type && (ptype->dev == null_or_orig ||
2792 ptype->dev == skb->dev || ptype->dev == orig_dev ||
2793 ptype->dev == null_or_bond)) {
2795 ret = deliver_skb(skb, pt_prev, orig_dev);
2801 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2804 /* Jamal, now you will not be able to escape explaining
2805 * to me how you were going to use this. :-)
2816 * netif_receive_skb - process receive buffer from network
2817 * @skb: buffer to process
2819 * netif_receive_skb() is the main receive data processing function.
2820 * It always succeeds. The buffer may be dropped during processing
2821 * for congestion control or by the protocol layers.
2823 * This function may only be called from softirq context and interrupts
2824 * should be enabled.
2826 * Return values (usually ignored):
2827 * NET_RX_SUCCESS: no congestion
2828 * NET_RX_DROP: packet was dropped
2830 int netif_receive_skb(struct sk_buff *skb)
2835 cpu = get_rps_cpu(skb->dev, skb);
2838 return __netif_receive_skb(skb);
2840 return enqueue_to_backlog(skb, cpu);
2842 return __netif_receive_skb(skb);
2845 EXPORT_SYMBOL(netif_receive_skb);
2847 /* Network device is going away, flush any packets still pending */
2848 static void flush_backlog(void *arg)
2850 struct net_device *dev = arg;
2851 struct softnet_data *queue = &__get_cpu_var(softnet_data);
2852 struct sk_buff *skb, *tmp;
2855 skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2856 if (skb->dev == dev) {
2857 __skb_unlink(skb, &queue->input_pkt_queue);
2863 static int napi_gro_complete(struct sk_buff *skb)
2865 struct packet_type *ptype;
2866 __be16 type = skb->protocol;
2867 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2870 if (NAPI_GRO_CB(skb)->count == 1) {
2871 skb_shinfo(skb)->gso_size = 0;
2876 list_for_each_entry_rcu(ptype, head, list) {
2877 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2880 err = ptype->gro_complete(skb);
2886 WARN_ON(&ptype->list == head);
2888 return NET_RX_SUCCESS;
2892 return netif_receive_skb(skb);
2895 static void napi_gro_flush(struct napi_struct *napi)
2897 struct sk_buff *skb, *next;
2899 for (skb = napi->gro_list; skb; skb = next) {
2902 napi_gro_complete(skb);
2905 napi->gro_count = 0;
2906 napi->gro_list = NULL;
2909 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2911 struct sk_buff **pp = NULL;
2912 struct packet_type *ptype;
2913 __be16 type = skb->protocol;
2914 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2917 enum gro_result ret;
2919 if (!(skb->dev->features & NETIF_F_GRO))
2922 if (skb_is_gso(skb) || skb_has_frags(skb))
2926 list_for_each_entry_rcu(ptype, head, list) {
2927 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2930 skb_set_network_header(skb, skb_gro_offset(skb));
2931 mac_len = skb->network_header - skb->mac_header;
2932 skb->mac_len = mac_len;
2933 NAPI_GRO_CB(skb)->same_flow = 0;
2934 NAPI_GRO_CB(skb)->flush = 0;
2935 NAPI_GRO_CB(skb)->free = 0;
2937 pp = ptype->gro_receive(&napi->gro_list, skb);
2942 if (&ptype->list == head)
2945 same_flow = NAPI_GRO_CB(skb)->same_flow;
2946 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
2949 struct sk_buff *nskb = *pp;
2953 napi_gro_complete(nskb);
2960 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
2964 NAPI_GRO_CB(skb)->count = 1;
2965 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
2966 skb->next = napi->gro_list;
2967 napi->gro_list = skb;
2971 if (skb_headlen(skb) < skb_gro_offset(skb)) {
2972 int grow = skb_gro_offset(skb) - skb_headlen(skb);
2974 BUG_ON(skb->end - skb->tail < grow);
2976 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
2979 skb->data_len -= grow;
2981 skb_shinfo(skb)->frags[0].page_offset += grow;
2982 skb_shinfo(skb)->frags[0].size -= grow;
2984 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
2985 put_page(skb_shinfo(skb)->frags[0].page);
2986 memmove(skb_shinfo(skb)->frags,
2987 skb_shinfo(skb)->frags + 1,
2988 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); /* byte count, not frag count */
2999 EXPORT_SYMBOL(dev_gro_receive);
3002 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3006 if (netpoll_rx_on(skb))
3009 for (p = napi->gro_list; p; p = p->next) {
3010 NAPI_GRO_CB(p)->same_flow =
3011 (p->dev == skb->dev) &&
3012 !compare_ether_header(skb_mac_header(p),
3013 skb_gro_mac_header(skb));
3014 NAPI_GRO_CB(p)->flush = 0;
3017 return dev_gro_receive(napi, skb);
3020 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3024 if (netif_receive_skb(skb))
3029 case GRO_MERGED_FREE:
3040 EXPORT_SYMBOL(napi_skb_finish);
3042 void skb_gro_reset_offset(struct sk_buff *skb)
3044 NAPI_GRO_CB(skb)->data_offset = 0;
3045 NAPI_GRO_CB(skb)->frag0 = NULL;
3046 NAPI_GRO_CB(skb)->frag0_len = 0;
3048 if (skb->mac_header == skb->tail &&
3049 !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3050 NAPI_GRO_CB(skb)->frag0 =
3051 page_address(skb_shinfo(skb)->frags[0].page) +
3052 skb_shinfo(skb)->frags[0].page_offset;
3053 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3056 EXPORT_SYMBOL(skb_gro_reset_offset);
3058 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3060 skb_gro_reset_offset(skb);
3062 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3064 EXPORT_SYMBOL(napi_gro_receive);
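/*
 * Illustrative sketch (not part of the original file): the usual call
 * site of napi_gro_receive() is a driver's NAPI poll routine.  my_poll
 * and my_fetch_rx_skb are hypothetical names.
 */
static struct sk_buff *my_fetch_rx_skb(struct net_device *dev);	/* hypothetical */

static int my_poll(struct napi_struct *napi, int budget)
{
	int work = 0;
	struct sk_buff *skb;

	while (work < budget && (skb = my_fetch_rx_skb(napi->dev)) != NULL) {
		skb->protocol = eth_type_trans(skb, napi->dev);
		napi_gro_receive(napi, skb);	/* try to merge before delivery */
		work++;
	}
	if (work < budget)
		napi_complete(napi);	/* done; driver re-enables its rx irq here */
	return work;
}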
3066 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3068 __skb_pull(skb, skb_headlen(skb));
3069 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3073 EXPORT_SYMBOL(napi_reuse_skb);
3075 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3077 struct sk_buff *skb = napi->skb;
3080 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3086 EXPORT_SYMBOL(napi_get_frags);
3088 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3094 skb->protocol = eth_type_trans(skb, skb->dev);
3096 if (ret == GRO_HELD)
3097 skb_gro_pull(skb, -ETH_HLEN);
3098 else if (netif_receive_skb(skb))
3103 case GRO_MERGED_FREE:
3104 napi_reuse_skb(napi, skb);
3113 EXPORT_SYMBOL(napi_frags_finish);
3115 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3117 struct sk_buff *skb = napi->skb;
3124 skb_reset_mac_header(skb);
3125 skb_gro_reset_offset(skb);
3127 off = skb_gro_offset(skb);
3128 hlen = off + sizeof(*eth);
3129 eth = skb_gro_header_fast(skb, off);
3130 if (skb_gro_header_hard(skb, hlen)) {
3131 eth = skb_gro_header_slow(skb, hlen, off);
3132 if (unlikely(!eth)) {
3133 napi_reuse_skb(napi, skb);
3139 skb_gro_pull(skb, sizeof(*eth));
3142 * This works because the only protocols we care about don't require
3143 * special handling. We'll fix it up properly at the end.
3145 skb->protocol = eth->h_proto;
3150 EXPORT_SYMBOL(napi_frags_skb);
3152 gro_result_t napi_gro_frags(struct napi_struct *napi)
3154 struct sk_buff *skb = napi_frags_skb(napi);
3159 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3161 EXPORT_SYMBOL(napi_gro_frags);
3163 static int process_backlog(struct napi_struct *napi, int quota)
3166 struct softnet_data *queue = &__get_cpu_var(softnet_data);
3167 unsigned long start_time = jiffies;
3169 napi->weight = weight_p;
3171 struct sk_buff *skb;
3173 local_irq_disable();
3175 skb = __skb_dequeue(&queue->input_pkt_queue);
3177 __napi_complete(napi);
3185 __netif_receive_skb(skb);
3186 } while (++work < quota && jiffies == start_time);
3192 * __napi_schedule - schedule for receive
3193 * @n: entry to schedule
3195 * The entry's receive function will be scheduled to run
3197 void __napi_schedule(struct napi_struct *n)
3199 unsigned long flags;
3201 local_irq_save(flags);
3202 list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
3203 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3204 local_irq_restore(flags);
3206 EXPORT_SYMBOL(__napi_schedule);
3208 void __napi_complete(struct napi_struct *n)
3210 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3211 BUG_ON(n->gro_list);
3213 list_del(&n->poll_list);
3214 smp_mb__before_clear_bit();
3215 clear_bit(NAPI_STATE_SCHED, &n->state);
3217 EXPORT_SYMBOL(__napi_complete);
3219 void napi_complete(struct napi_struct *n)
3221 unsigned long flags;
3224 * don't let napi dequeue from the cpu poll list
3225 * just in case it's running on a different cpu
3227 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3231 local_irq_save(flags);
3233 local_irq_restore(flags);
3235 EXPORT_SYMBOL(napi_complete);
3237 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3238 int (*poll)(struct napi_struct *, int), int weight)
3240 INIT_LIST_HEAD(&napi->poll_list);
3241 napi->gro_count = 0;
3242 napi->gro_list = NULL;
3245 napi->weight = weight;
3246 list_add(&napi->dev_list, &dev->napi_list);
3248 #ifdef CONFIG_NETPOLL
3249 spin_lock_init(&napi->poll_lock);
3250 napi->poll_owner = -1;
3252 set_bit(NAPI_STATE_SCHED, &napi->state);
3254 EXPORT_SYMBOL(netif_napi_add);
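/*
 * Illustrative sketch (not part of the original file): a driver calls
 * netif_napi_add() at probe time and schedules from its rx interrupt.
 * struct my_priv, my_poll (see the sketch above) and my_disable_rx_irq
 * are hypothetical; a weight of 64 matches weight_p.
 */
struct my_priv {			/* hypothetical driver private data */
	struct net_device *dev;
	struct napi_struct napi;
};

static void my_disable_rx_irq(struct net_device *dev);	/* hypothetical */

/* at probe time: netif_napi_add(priv->dev, &priv->napi, my_poll, 64); */

static void my_rx_irq_sketch(struct my_priv *priv)
{
	if (napi_schedule_prep(&priv->napi)) {
		my_disable_rx_irq(priv->dev);	/* mask hw irq before softirq poll */
		__napi_schedule(&priv->napi);
	}
}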
3256 void netif_napi_del(struct napi_struct *napi)
3258 struct sk_buff *skb, *next;
3260 list_del_init(&napi->dev_list);
3261 napi_free_frags(napi);
3263 for (skb = napi->gro_list; skb; skb = next) {
3269 napi->gro_list = NULL;
3270 napi->gro_count = 0;
3272 EXPORT_SYMBOL(netif_napi_del);
3276 * net_rps_action sends any pending IPIs for RPS. This is only called from
3277 * softirq context, and interrupts must be enabled.
3279 static void net_rps_action(cpumask_t *mask)
3283 /* Send pending IPIs to kick RPS processing on remote cpus. */
3284 for_each_cpu_mask_nr(cpu, *mask) {
3285 struct softnet_data *queue = &per_cpu(softnet_data, cpu);
3286 if (cpu_online(cpu))
3287 __smp_call_function_single(cpu, &queue->csd, 0);
3293 static void net_rx_action(struct softirq_action *h)
3295 struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
3296 unsigned long time_limit = jiffies + 2;
3297 int budget = netdev_budget;
3301 struct rps_remote_softirq_cpus *rcpus;
3304 local_irq_disable();
3306 while (!list_empty(list)) {
3307 struct napi_struct *n;
3310 /* If softirq window is exhausted then punt.
3311 * Allow this to run for 2 jiffies, which allows
3312 * an average latency of 1.5/HZ.
3314 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3319 /* Even though interrupts have been re-enabled, this
3320 * access is safe because interrupts can only add new
3321 * entries to the tail of this list, and only ->poll()
3322 * calls can remove this head entry from the list.
3324 n = list_first_entry(list, struct napi_struct, poll_list);
3326 have = netpoll_poll_lock(n);
3330 /* This NAPI_STATE_SCHED test is for avoiding a race
3331 * with netpoll's poll_napi(). Only the entity which
3332 * obtains the lock and sees NAPI_STATE_SCHED set will
3333 * actually make the ->poll() call. Therefore we avoid
3334 * accidentally calling ->poll() when NAPI is not scheduled.
3337 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3338 work = n->poll(n, weight);
3342 WARN_ON_ONCE(work > weight);
3346 local_irq_disable();
3348 /* Drivers must not modify the NAPI state if they
3349 * consume the entire weight. In such cases this code
3350 * still "owns" the NAPI instance and therefore can
3351 * move the instance around on the list at-will.
3353 if (unlikely(work == weight)) {
3354 if (unlikely(napi_disable_pending(n))) {
3357 local_irq_disable();
3359 list_move_tail(&n->poll_list, list);
3362 netpoll_poll_unlock(have);
3366 rcpus = &__get_cpu_var(rps_remote_softirq_cpus);
3367 select = rcpus->select;
3372 net_rps_action(&rcpus->mask[select]);
3377 #ifdef CONFIG_NET_DMA
3379 * There may not be any more sk_buffs coming right now, so push
3380 * any pending DMA copies to hardware
3382 dma_issue_pending_all();
3388 __get_cpu_var(netdev_rx_stat).time_squeeze++;
3389 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3393 static gifconf_func_t *gifconf_list[NPROTO];
3396 * register_gifconf - register a SIOCGIF handler
3397 * @family: Address family
3398 * @gifconf: Function handler
3400 * Register protocol dependent address dumping routines. The handler
3401 * that is passed must not be freed or reused until it has been replaced
3402 * by another handler.
3404 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3406 if (family >= NPROTO)
3408 gifconf_list[family] = gifconf;
3411 EXPORT_SYMBOL(register_gifconf);
3415 * Map an interface index to its name (SIOCGIFNAME)
3419 * We need this ioctl for efficient implementation of the
3420 * if_indextoname() function required by the IPv6 API. Without
3421 * it, we would have to search all the interfaces to find a
3422 * match.
3425 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3427 struct net_device *dev;
3431 * Fetch the caller's info block.
3434 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3438 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3444 strcpy(ifr.ifr_name, dev->name);
3447 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3453 * Perform a SIOCGIFCONF call. This structure will change
3454 * size eventually, and there is nothing I can do about it.
3455 * Thus we will need a 'compatibility mode'.
3458 static int dev_ifconf(struct net *net, char __user *arg)
3461 struct net_device *dev;
3468 * Fetch the caller's info block.
3471 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3478 * Loop over the interfaces, and write an info block for each.
3482 for_each_netdev(net, dev) {
3483 for (i = 0; i < NPROTO; i++) {
3484 if (gifconf_list[i]) {
3487 done = gifconf_list[i](dev, NULL, 0);
3489 done = gifconf_list[i](dev, pos + total,
3499 * All done. Write the updated control block back to the caller.
3501 ifc.ifc_len = total;
3504 * Both BSD and Solaris return 0 here, so we do too.
3506 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3509 #ifdef CONFIG_PROC_FS
3511 * This is invoked by the /proc filesystem handler to display a device
3512 * in detail.
3514 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3517 struct net *net = seq_file_net(seq);
3519 struct net_device *dev;
3523 return SEQ_START_TOKEN;
3526 for_each_netdev_rcu(net, dev)
3533 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3535 struct net_device *dev = (v == SEQ_START_TOKEN) ?
3536 first_net_device(seq_file_net(seq)) :
3537 next_net_device((struct net_device *)v);
3540 return rcu_dereference(dev);
3543 void dev_seq_stop(struct seq_file *seq, void *v)
3549 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3551 const struct net_device_stats *stats = dev_get_stats(dev);
3553 seq_printf(seq, "%6s: %7lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
3554 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3555 dev->name, stats->rx_bytes, stats->rx_packets,
3557 stats->rx_dropped + stats->rx_missed_errors,
3558 stats->rx_fifo_errors,
3559 stats->rx_length_errors + stats->rx_over_errors +
3560 stats->rx_crc_errors + stats->rx_frame_errors,
3561 stats->rx_compressed, stats->multicast,
3562 stats->tx_bytes, stats->tx_packets,
3563 stats->tx_errors, stats->tx_dropped,
3564 stats->tx_fifo_errors, stats->collisions,
3565 stats->tx_carrier_errors +
3566 stats->tx_aborted_errors +
3567 stats->tx_window_errors +
3568 stats->tx_heartbeat_errors,
3569 stats->tx_compressed);
3573 * Called from the PROCfs module. This now uses the new arbitrary sized
3574 * /proc/net interface to create /proc/net/dev
3576 static int dev_seq_show(struct seq_file *seq, void *v)
3578 if (v == SEQ_START_TOKEN)
3579 seq_puts(seq, "Inter-| Receive "
3581 " face |bytes packets errs drop fifo frame "
3582 "compressed multicast|bytes packets errs "
3583 "drop fifo colls carrier compressed\n");
3585 dev_seq_printf_stats(seq, v);
3589 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
3591 struct netif_rx_stats *rc = NULL;
3593 while (*pos < nr_cpu_ids)
3594 if (cpu_online(*pos)) {
3595 rc = &per_cpu(netdev_rx_stat, *pos);
3602 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3604 return softnet_get_online(pos);
3607 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3610 return softnet_get_online(pos);
3613 static void softnet_seq_stop(struct seq_file *seq, void *v)
3617 static int softnet_seq_show(struct seq_file *seq, void *v)
3619 struct netif_rx_stats *s = v;
3621 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3622 s->total, s->dropped, s->time_squeeze, 0,
3623 0, 0, 0, 0, /* was fastroute */
3624 s->cpu_collision, s->received_rps);
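/*
 * For reference (not in the original file): each /proc/net/softnet_stat
 * line is one online CPU, printed as the hex words above -- packets
 * processed (total), drops from a full backlog (dropped), net_rx_action
 * budget/time exhaustions (time_squeeze), five zero columns kept for
 * layout (one spare plus the old fastroute counters), cpu_collision
 * and received_rps.
 */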
3628 static const struct seq_operations dev_seq_ops = {
3629 .start = dev_seq_start,
3630 .next = dev_seq_next,
3631 .stop = dev_seq_stop,
3632 .show = dev_seq_show,
3635 static int dev_seq_open(struct inode *inode, struct file *file)
3637 return seq_open_net(inode, file, &dev_seq_ops,
3638 sizeof(struct seq_net_private));
3641 static const struct file_operations dev_seq_fops = {
3642 .owner = THIS_MODULE,
3643 .open = dev_seq_open,
3645 .llseek = seq_lseek,
3646 .release = seq_release_net,
3649 static const struct seq_operations softnet_seq_ops = {
3650 .start = softnet_seq_start,
3651 .next = softnet_seq_next,
3652 .stop = softnet_seq_stop,
3653 .show = softnet_seq_show,
3656 static int softnet_seq_open(struct inode *inode, struct file *file)
3658 return seq_open(file, &softnet_seq_ops);
3661 static const struct file_operations softnet_seq_fops = {
3662 .owner = THIS_MODULE,
3663 .open = softnet_seq_open,
3665 .llseek = seq_lseek,
3666 .release = seq_release,
3669 static void *ptype_get_idx(loff_t pos)
3671 struct packet_type *pt = NULL;
3675 list_for_each_entry_rcu(pt, &ptype_all, list) {
3681 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3682 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3691 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3695 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3698 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3700 struct packet_type *pt;
3701 struct list_head *nxt;
3705 if (v == SEQ_START_TOKEN)
3706 return ptype_get_idx(0);
3709 nxt = pt->list.next;
3710 if (pt->type == htons(ETH_P_ALL)) {
3711 if (nxt != &ptype_all)
3714 nxt = ptype_base[0].next;
3716 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3718 while (nxt == &ptype_base[hash]) {
3719 if (++hash >= PTYPE_HASH_SIZE)
3721 nxt = ptype_base[hash].next;
3724 return list_entry(nxt, struct packet_type, list);
3727 static void ptype_seq_stop(struct seq_file *seq, void *v)
3733 static int ptype_seq_show(struct seq_file *seq, void *v)
3735 struct packet_type *pt = v;
3737 if (v == SEQ_START_TOKEN)
3738 seq_puts(seq, "Type Device Function\n");
3739 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3740 if (pt->type == htons(ETH_P_ALL))
3741 seq_puts(seq, "ALL ");
3743 seq_printf(seq, "%04x", ntohs(pt->type));
3745 seq_printf(seq, " %-8s %pF\n",
3746 pt->dev ? pt->dev->name : "", pt->func);
3752 static const struct seq_operations ptype_seq_ops = {
3753 .start = ptype_seq_start,
3754 .next = ptype_seq_next,
3755 .stop = ptype_seq_stop,
3756 .show = ptype_seq_show,
3759 static int ptype_seq_open(struct inode *inode, struct file *file)
3761 return seq_open_net(inode, file, &ptype_seq_ops,
3762 sizeof(struct seq_net_private));
3765 static const struct file_operations ptype_seq_fops = {
3766 .owner = THIS_MODULE,
3767 .open = ptype_seq_open,
3769 .llseek = seq_lseek,
3770 .release = seq_release_net,
3774 static int __net_init dev_proc_net_init(struct net *net)
3778 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3780 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3782 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3785 if (wext_proc_init(net))
3791 proc_net_remove(net, "ptype");
3793 proc_net_remove(net, "softnet_stat");
3795 proc_net_remove(net, "dev");
3799 static void __net_exit dev_proc_net_exit(struct net *net)
3801 wext_proc_exit(net);
3803 proc_net_remove(net, "ptype");
3804 proc_net_remove(net, "softnet_stat");
3805 proc_net_remove(net, "dev");
3808 static struct pernet_operations __net_initdata dev_proc_ops = {
3809 .init = dev_proc_net_init,
3810 .exit = dev_proc_net_exit,
3813 static int __init dev_proc_init(void)
3815 return register_pernet_subsys(&dev_proc_ops);
3818 #define dev_proc_init() 0
3819 #endif /* CONFIG_PROC_FS */
3823 * netdev_set_master - set up master/slave pair
3824 * @slave: slave device
3825 * @master: new master device
3827 * Changes the master device of the slave. Pass %NULL to break the
3828 * bonding. The caller must hold the RTNL semaphore. On a failure
3829 * a negative errno code is returned. On success the reference counts
3830 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3831 * function returns zero.
3833 int netdev_set_master(struct net_device *slave, struct net_device *master)
3835 struct net_device *old = slave->master;
3845 slave->master = master;
3852 slave->flags |= IFF_SLAVE;
3854 slave->flags &= ~IFF_SLAVE;
3856 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3859 EXPORT_SYMBOL(netdev_set_master);
3861 static void dev_change_rx_flags(struct net_device *dev, int flags)
3863 const struct net_device_ops *ops = dev->netdev_ops;
3865 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3866 ops->ndo_change_rx_flags(dev, flags);
3869 static int __dev_set_promiscuity(struct net_device *dev, int inc)
3871 unsigned short old_flags = dev->flags;
3877 dev->flags |= IFF_PROMISC;
3878 dev->promiscuity += inc;
3879 if (dev->promiscuity == 0) {
3882 * If inc causes overflow, leave promisc untouched and return error.
3885 dev->flags &= ~IFF_PROMISC;
3887 dev->promiscuity -= inc;
3888 printk(KERN_WARNING "%s: promiscuity counter overflowed; "
3889 "cannot set promiscuous mode, the promiscuity feature "
3890 "of the device may be broken.\n", dev->name);
3894 if (dev->flags != old_flags) {
3895 printk(KERN_INFO "device %s %s promiscuous mode\n",
3896 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3898 if (audit_enabled) {
3899 current_uid_gid(&uid, &gid);
3900 audit_log(current->audit_context, GFP_ATOMIC,
3901 AUDIT_ANOM_PROMISCUOUS,
3902 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3903 dev->name, (dev->flags & IFF_PROMISC),
3904 (old_flags & IFF_PROMISC),
3905 audit_get_loginuid(current),
3907 audit_get_sessionid(current));
3910 dev_change_rx_flags(dev, IFF_PROMISC);
3916 * dev_set_promiscuity - update promiscuity count on a device
3920 * Add or remove promiscuity from a device. While the count in the device
3921 * remains above zero the interface remains promiscuous. Once it hits zero
3922 * the device reverts back to normal filtering operation. A negative inc
3923 * value is used to drop promiscuity on the device.
3924 * Return 0 if successful or a negative errno code on error.
3926 int dev_set_promiscuity(struct net_device *dev, int inc)
3928 unsigned short old_flags = dev->flags;
3931 err = __dev_set_promiscuity(dev, inc);
3934 if (dev->flags != old_flags)
3935 dev_set_rx_mode(dev);
3938 EXPORT_SYMBOL(dev_set_promiscuity);
3941 * dev_set_allmulti - update allmulti count on a device
3945 * Add or remove reception of all multicast frames to a device. While the
3946 * count in the device remains above zero the interface remains listening
3947 * to all multicast frames. Once it hits zero the device reverts back to normal
3948 * filtering operation. A negative @inc value is used to drop the counter
3949 * when releasing a resource needing all multicasts.
3950 * Return 0 if successful or a negative errno code on error.
3953 int dev_set_allmulti(struct net_device *dev, int inc)
3955 unsigned short old_flags = dev->flags;
3959 dev->flags |= IFF_ALLMULTI;
3960 dev->allmulti += inc;
3961 if (dev->allmulti == 0) {
3964 * If inc causes overflow, leave allmulti untouched and return error.
3967 dev->flags &= ~IFF_ALLMULTI;
3969 dev->allmulti -= inc;
3970 printk(KERN_WARNING "%s: allmulti counter overflowed; "
3971 "cannot set allmulti mode, the allmulti feature of the "
3972 "device may be broken.\n", dev->name);
3976 if (dev->flags ^ old_flags) {
3977 dev_change_rx_flags(dev, IFF_ALLMULTI);
3978 dev_set_rx_mode(dev);
3982 EXPORT_SYMBOL(dev_set_allmulti);
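/*
 * Illustrative sketch (not part of the original file): a protocol that
 * needs every multicast frame bumps the counter under the RTNL and drops
 * it symmetrically when done.
 */
static int my_enable_allmulti_sketch(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_allmulti(dev, 1);	/* paired with dev_set_allmulti(dev, -1) */
	rtnl_unlock();
	return err;
}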
3985 * Upload unicast and multicast address lists to device and
3986 * configure RX filtering. When the device doesn't support unicast
3987 * filtering it is put in promiscuous mode while unicast addresses
3988 * are enabled.
3990 void __dev_set_rx_mode(struct net_device *dev)
3992 const struct net_device_ops *ops = dev->netdev_ops;
3994 /* dev_open will call this function so the list will stay sane. */
3995 if (!(dev->flags&IFF_UP))
3998 if (!netif_device_present(dev))
4001 if (ops->ndo_set_rx_mode)
4002 ops->ndo_set_rx_mode(dev);
4004 /* Unicast addresses changes may only happen under the rtnl,
4005 * therefore calling __dev_set_promiscuity here is safe.
4007 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4008 __dev_set_promiscuity(dev, 1);
4009 dev->uc_promisc = 1;
4010 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4011 __dev_set_promiscuity(dev, -1);
4012 dev->uc_promisc = 0;
4015 if (ops->ndo_set_multicast_list)
4016 ops->ndo_set_multicast_list(dev);
4020 void dev_set_rx_mode(struct net_device *dev)
4022 netif_addr_lock_bh(dev);
4023 __dev_set_rx_mode(dev);
4024 netif_addr_unlock_bh(dev);
4028 * dev_get_flags - get flags reported to userspace
4031 * Get the combination of flag bits exported through APIs to userspace.
4033 unsigned dev_get_flags(const struct net_device *dev)
4037 flags = (dev->flags & ~(IFF_PROMISC |
4042 (dev->gflags & (IFF_PROMISC |
4045 if (netif_running(dev)) {
4046 if (netif_oper_up(dev))
4047 flags |= IFF_RUNNING;
4048 if (netif_carrier_ok(dev))
4049 flags |= IFF_LOWER_UP;
4050 if (netif_dormant(dev))
4051 flags |= IFF_DORMANT;
4056 EXPORT_SYMBOL(dev_get_flags);
4058 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4060 int old_flags = dev->flags;
4066 * Set the flags on our device.
4069 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4070 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4072 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4076 * Load in the correct multicast list now that the flags have changed.
4079 if ((old_flags ^ flags) & IFF_MULTICAST)
4080 dev_change_rx_flags(dev, IFF_MULTICAST);
4082 dev_set_rx_mode(dev);
4085 * Have we downed the interface? We handle IFF_UP ourselves
4086 * according to user attempts to set it, rather than blindly
4087 * setting it.
4092 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4095 dev_set_rx_mode(dev);
4098 if ((flags ^ dev->gflags) & IFF_PROMISC) {
4099 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4101 dev->gflags ^= IFF_PROMISC;
4102 dev_set_promiscuity(dev, inc);
4105 /* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4106 is important. Some (broken) drivers set IFF_PROMISC when
4107 IFF_ALLMULTI is requested, without asking us and without reporting it.
4109 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4110 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4112 dev->gflags ^= IFF_ALLMULTI;
4113 dev_set_allmulti(dev, inc);
4119 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4121 unsigned int changes = dev->flags ^ old_flags;
4123 if (changes & IFF_UP) {
4124 if (dev->flags & IFF_UP)
4125 call_netdevice_notifiers(NETDEV_UP, dev);
4127 call_netdevice_notifiers(NETDEV_DOWN, dev);
4130 if (dev->flags & IFF_UP &&
4131 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4132 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4136 * dev_change_flags - change device settings
4138 * @flags: device state flags
4140 * Change settings on a device based on state flags. The flags are
4141 * in the userspace exported format.
4143 int dev_change_flags(struct net_device *dev, unsigned flags)
4146 int old_flags = dev->flags;
4148 ret = __dev_change_flags(dev, flags);
4152 changes = old_flags ^ dev->flags;
4154 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4156 __dev_notify_flags(dev, old_flags);
4159 EXPORT_SYMBOL(dev_change_flags);
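/*
 * Illustrative sketch (not part of the original file): in-kernel callers
 * bring an interface up the same way SIOCSIFFLAGS does, under the RTNL.
 */
static int my_bring_up_sketch(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev->flags | IFF_UP);
	rtnl_unlock();
	return err;
}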
4162 * dev_set_mtu - Change maximum transfer unit
4164 * @new_mtu: new transfer unit
4166 * Change the maximum transfer size of the network device.
4168 int dev_set_mtu(struct net_device *dev, int new_mtu)
4170 const struct net_device_ops *ops = dev->netdev_ops;
4173 if (new_mtu == dev->mtu)
4176 /* MTU must be positive. */
4180 if (!netif_device_present(dev))
4184 if (ops->ndo_change_mtu)
4185 err = ops->ndo_change_mtu(dev, new_mtu);
4189 if (!err && dev->flags & IFF_UP)
4190 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4193 EXPORT_SYMBOL(dev_set_mtu);
4196 * dev_set_mac_address - Change Media Access Control Address
4200 * Change the hardware (MAC) address of the device
4202 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4204 const struct net_device_ops *ops = dev->netdev_ops;
4207 if (!ops->ndo_set_mac_address)
4209 if (sa->sa_family != dev->type)
4211 if (!netif_device_present(dev))
4213 err = ops->ndo_set_mac_address(dev, sa);
4215 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4218 EXPORT_SYMBOL(dev_set_mac_address);
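/*
 * Illustrative sketch (not part of the original file): setting a MAC
 * address from inside the kernel mirrors SIOCSIFHWADDR -- the address
 * family must match dev->type, per the check above.
 */
static int my_set_mac_sketch(struct net_device *dev, const u8 *mac)
{
	struct sockaddr sa;
	int err;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, mac, dev->addr_len);
	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();
	return err;
}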
4221 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4223 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4226 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4232 case SIOCGIFFLAGS: /* Get interface flags */
4233 ifr->ifr_flags = (short) dev_get_flags(dev);
4236 case SIOCGIFMETRIC: /* Get the metric on the interface
4237 (currently unused) */
4238 ifr->ifr_metric = 0;
4241 case SIOCGIFMTU: /* Get the MTU of a device */
4242 ifr->ifr_mtu = dev->mtu;
4247 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4249 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4250 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4251 ifr->ifr_hwaddr.sa_family = dev->type;
4259 ifr->ifr_map.mem_start = dev->mem_start;
4260 ifr->ifr_map.mem_end = dev->mem_end;
4261 ifr->ifr_map.base_addr = dev->base_addr;
4262 ifr->ifr_map.irq = dev->irq;
4263 ifr->ifr_map.dma = dev->dma;
4264 ifr->ifr_map.port = dev->if_port;
4268 ifr->ifr_ifindex = dev->ifindex;
4272 ifr->ifr_qlen = dev->tx_queue_len;
4276 /* dev_ioctl() should ensure this case
4277 * is never reached
4288 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4290 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4293 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4294 const struct net_device_ops *ops;
4299 ops = dev->netdev_ops;
4302 case SIOCSIFFLAGS: /* Set interface flags */
4303 return dev_change_flags(dev, ifr->ifr_flags);
4305 case SIOCSIFMETRIC: /* Set the metric on the interface
4306 (currently unused) */
4309 case SIOCSIFMTU: /* Set the MTU of a device */
4310 return dev_set_mtu(dev, ifr->ifr_mtu);
4313 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4315 case SIOCSIFHWBROADCAST:
4316 if (ifr->ifr_hwaddr.sa_family != dev->type)
4318 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4319 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4320 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4324 if (ops->ndo_set_config) {
4325 if (!netif_device_present(dev))
4327 return ops->ndo_set_config(dev, &ifr->ifr_map);
4332 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4333 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4335 if (!netif_device_present(dev))
4337 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4340 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4341 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4343 if (!netif_device_present(dev))
4345 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4348 if (ifr->ifr_qlen < 0)
4350 dev->tx_queue_len = ifr->ifr_qlen;
4354 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4355 return dev_change_name(dev, ifr->ifr_newname);
4358 * Unknown or private ioctl
4361 if ((cmd >= SIOCDEVPRIVATE &&
4362 cmd <= SIOCDEVPRIVATE + 15) ||
4363 cmd == SIOCBONDENSLAVE ||
4364 cmd == SIOCBONDRELEASE ||
4365 cmd == SIOCBONDSETHWADDR ||
4366 cmd == SIOCBONDSLAVEINFOQUERY ||
4367 cmd == SIOCBONDINFOQUERY ||
4368 cmd == SIOCBONDCHANGEACTIVE ||
4369 cmd == SIOCGMIIPHY ||
4370 cmd == SIOCGMIIREG ||
4371 cmd == SIOCSMIIREG ||
4372 cmd == SIOCBRADDIF ||
4373 cmd == SIOCBRDELIF ||
4374 cmd == SIOCSHWTSTAMP ||
4375 cmd == SIOCWANDEV) {
4377 if (ops->ndo_do_ioctl) {
4378 if (netif_device_present(dev))
4379 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4391 * This function handles all "interface"-type I/O control requests. The actual
4392 * 'doing' part of this is dev_ifsioc above.
4396 * dev_ioctl - network device ioctl
4397 * @net: the applicable net namespace
4398 * @cmd: command to issue
4399 * @arg: pointer to a struct ifreq in user space
4401 * Issue ioctl functions to devices. This is normally called by the
4402 * user space syscall interfaces but can sometimes be useful for
4403 * other purposes. The return value is the return from the syscall if
4404 * positive or a negative errno code on error.
4407 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4413 /* One special case: SIOCGIFCONF takes ifconf argument
4414 and requires shared lock, because it sleeps writing
4415 to user space.
4418 if (cmd == SIOCGIFCONF) {
4420 ret = dev_ifconf(net, (char __user *) arg);
4424 if (cmd == SIOCGIFNAME)
4425 return dev_ifname(net, (struct ifreq __user *)arg);
4427 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4430 ifr.ifr_name[IFNAMSIZ-1] = 0;
4432 colon = strchr(ifr.ifr_name, ':');
4437 * See which interface the caller is talking about.
4442 * These ioctl calls:
4443 * - can be done by all.
4444 * - are atomic and do not require locking.
4455 dev_load(net, ifr.ifr_name);
4457 ret = dev_ifsioc_locked(net, &ifr, cmd);
4462 if (copy_to_user(arg, &ifr,
4463 sizeof(struct ifreq)))
4469 dev_load(net, ifr.ifr_name);
4471 ret = dev_ethtool(net, &ifr);
4476 if (copy_to_user(arg, &ifr,
4477 sizeof(struct ifreq)))
4483 * These ioctl calls:
4484 * - require superuser power.
4485 * - require strict serialization.
4491 if (!capable(CAP_NET_ADMIN))
4493 dev_load(net, ifr.ifr_name);
4495 ret = dev_ifsioc(net, &ifr, cmd);
4500 if (copy_to_user(arg, &ifr,
4501 sizeof(struct ifreq)))
4507 * These ioctl calls:
4508 * - require superuser power.
4509 * - require strict serialization.
4510 * - do not return a value
4520 case SIOCSIFHWBROADCAST:
4523 case SIOCBONDENSLAVE:
4524 case SIOCBONDRELEASE:
4525 case SIOCBONDSETHWADDR:
4526 case SIOCBONDCHANGEACTIVE:
4530 if (!capable(CAP_NET_ADMIN))
4533 case SIOCBONDSLAVEINFOQUERY:
4534 case SIOCBONDINFOQUERY:
4535 dev_load(net, ifr.ifr_name);
4537 ret = dev_ifsioc(net, &ifr, cmd);
4542 /* Get the per device memory space. We can add this but
4543 * currently do not support it */
4545 /* Set the per device memory buffer space.
4546 * Not applicable in our case */
4551 * Unknown or private ioctl.
4554 if (cmd == SIOCWANDEV ||
4555 (cmd >= SIOCDEVPRIVATE &&
4556 cmd <= SIOCDEVPRIVATE + 15)) {
4557 dev_load(net, ifr.ifr_name);
4559 ret = dev_ifsioc(net, &ifr, cmd);
4561 if (!ret && copy_to_user(arg, &ifr,
4562 sizeof(struct ifreq)))
4566 /* Take care of Wireless Extensions */
4567 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4568 return wext_handle_ioctl(net, &ifr, cmd, arg);
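/*
 * Illustrative userspace sketch (not part of the original file): the
 * usual way into dev_ioctl() is an ioctl(2) on any socket, e.g.
 * fetching an MTU with SIOCGIFMTU:
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ);
 *	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *		printf("mtu %d\n", ifr.ifr_mtu);
 */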
4575 * dev_new_index - allocate an ifindex
4576 * @net: the applicable net namespace
4578 * Returns a suitable unique value for a new device interface
4579 * number. The caller must hold the rtnl semaphore or the
4580 * dev_base_lock to be sure it remains unique.
4582 static int dev_new_index(struct net *net)
4588 if (!__dev_get_by_index(net, ifindex))
4593 /* Delayed registration/unregistration */
4594 static LIST_HEAD(net_todo_list);
4596 static void net_set_todo(struct net_device *dev)
4598 list_add_tail(&dev->todo_list, &net_todo_list);
4601 static void rollback_registered_many(struct list_head *head)
4603 struct net_device *dev, *tmp;
4605 BUG_ON(dev_boot_phase);
4608 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4609 /* Some devices call without registering
4610 * for initialization unwind. Remove those
4611 * devices and proceed with the remaining.
4613 if (dev->reg_state == NETREG_UNINITIALIZED) {
4614 pr_debug("unregister_netdevice: device %s/%p never "
4615 "was registered\n", dev->name, dev);
4618 list_del(&dev->unreg_list);
4622 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4624 /* If device is running, close it first. */
4627 /* And unlink it from device chain. */
4628 unlist_netdevice(dev);
4630 dev->reg_state = NETREG_UNREGISTERING;
4635 list_for_each_entry(dev, head, unreg_list) {
4636 /* Shutdown queueing discipline. */
4640 /* Notify protocols that we are about to destroy
4641 this device. They should clean up all their state.
4643 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4645 if (!dev->rtnl_link_ops ||
4646 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4647 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4650 * Flush the unicast and multicast chains
4655 if (dev->netdev_ops->ndo_uninit)
4656 dev->netdev_ops->ndo_uninit(dev);
4658 /* Notifier chain MUST detach us from master device. */
4659 WARN_ON(dev->master);
4661 /* Remove entries from kobject tree */
4662 netdev_unregister_kobject(dev);
4665 /* Process any work delayed until the end of the batch */
4666 dev = list_first_entry(head, struct net_device, unreg_list);
4667 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
4671 list_for_each_entry(dev, head, unreg_list)
4675 static void rollback_registered(struct net_device *dev)
4679 list_add(&dev->unreg_list, &single);
4680 rollback_registered_many(&single);
4683 static void __netdev_init_queue_locks_one(struct net_device *dev,
4684 struct netdev_queue *dev_queue,
4687 spin_lock_init(&dev_queue->_xmit_lock);
4688 netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4689 dev_queue->xmit_lock_owner = -1;
4692 static void netdev_init_queue_locks(struct net_device *dev)
4694 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4695 __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4698 unsigned long netdev_fix_features(unsigned long features, const char *name)
4700 /* Fix illegal SG+CSUM combinations. */
4701 if ((features & NETIF_F_SG) &&
4702 !(features & NETIF_F_ALL_CSUM)) {
4704 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4705 "checksum feature.\n", name);
4706 features &= ~NETIF_F_SG;
4709 /* TSO requires that SG is present as well. */
4710 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4712 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4713 "SG feature.\n", name);
4714 features &= ~NETIF_F_TSO;
4717 if (features & NETIF_F_UFO) {
4718 if (!(features & NETIF_F_GEN_CSUM)) {
4720 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4721 "since no NETIF_F_GEN_CSUM feature.\n",
4723 features &= ~NETIF_F_UFO;
4726 if (!(features & NETIF_F_SG)) {
4728 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4729 "since no NETIF_F_SG feature.\n", name);
4730 features &= ~NETIF_F_UFO;
4736 EXPORT_SYMBOL(netdev_fix_features);
4739 * netif_stacked_transfer_operstate - transfer operstate
4740 * @rootdev: the root or lower level device to transfer state from
4741 * @dev: the device to transfer operstate to
4743 * Transfer operational state from root to device. This is normally
4744 * called when a stacking relationship exists between the root
4745 * device and the device (a leaf device).
4747 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
4748 struct net_device *dev)
4750 if (rootdev->operstate == IF_OPER_DORMANT)
4751 netif_dormant_on(dev);
4753 netif_dormant_off(dev);
4755 if (netif_carrier_ok(rootdev)) {
4756 if (!netif_carrier_ok(dev))
4757 netif_carrier_on(dev);
4759 if (netif_carrier_ok(dev))
4760 netif_carrier_off(dev);
4763 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
4766 * register_netdevice - register a network device
4767 * @dev: device to register
4769 * Take a completed network device structure and add it to the kernel
4770 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4771 * chain. 0 is returned on success. A negative errno code is returned
4772 * on a failure to set up the device, or if the name is a duplicate.
4774 * Callers must hold the rtnl semaphore. You may want
4775 * register_netdev() instead of this.
4778 * The locking appears insufficient to guarantee two parallel registers
4779 * will not get the same name.
4782 int register_netdevice(struct net_device *dev)
4785 struct net *net = dev_net(dev);
4787 BUG_ON(dev_boot_phase);
4792 /* When net_devices are persistent, this will be fatal. */
4793 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
4796 spin_lock_init(&dev->addr_list_lock);
4797 netdev_set_addr_lockdep_class(dev);
4798 netdev_init_queue_locks(dev);
4803 if (!dev->num_rx_queues) {
4805 * Allocate a single RX queue if the driver never called
4806 * alloc_netdev_mq().
4809 dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
4815 dev->_rx->first = dev->_rx;
4816 atomic_set(&dev->_rx->count, 1);
4817 dev->num_rx_queues = 1;
4820 /* Init, if this function is available */
4821 if (dev->netdev_ops->ndo_init) {
4822 ret = dev->netdev_ops->ndo_init(dev);
4830 ret = dev_get_valid_name(net, dev->name, dev->name, 0);
4834 dev->ifindex = dev_new_index(net);
4835 if (dev->iflink == -1)
4836 dev->iflink = dev->ifindex;
4838 /* Fix illegal checksum combinations */
4839 if ((dev->features & NETIF_F_HW_CSUM) &&
4840 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4841 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4843 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4846 if ((dev->features & NETIF_F_NO_CSUM) &&
4847 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4848 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4850 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4853 dev->features = netdev_fix_features(dev->features, dev->name);
4855 /* Enable software GSO if SG is supported. */
4856 if (dev->features & NETIF_F_SG)
4857 dev->features |= NETIF_F_GSO;
4859 netdev_initialize_kobject(dev);
4861 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
4862 ret = notifier_to_errno(ret);
4866 ret = netdev_register_kobject(dev);
4869 dev->reg_state = NETREG_REGISTERED;
4872 * Default initial state at registration is that the
4873 * device is present.
4876 set_bit(__LINK_STATE_PRESENT, &dev->state);
4878 dev_init_scheduler(dev);
4880 list_netdevice(dev);
4882 /* Notify protocols that a new device appeared. */
4883 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
4884 ret = notifier_to_errno(ret);
4886 rollback_registered(dev);
4887 dev->reg_state = NETREG_UNREGISTERED;
4890 * Prevent userspace races by waiting until the network
4891 * device is fully setup before sending notifications.
4893 if (!dev->rtnl_link_ops ||
4894 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4895 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
4901 if (dev->netdev_ops->ndo_uninit)
4902 dev->netdev_ops->ndo_uninit(dev);
4905 EXPORT_SYMBOL(register_netdevice);
4908 * init_dummy_netdev - init a dummy network device for NAPI
4909 * @dev: device to init
4911 * This takes a network device structure and initializes the minimum
4912 * amount of fields so it can be used to schedule NAPI polls without
4913 * registering a full blown interface. This is to be used by drivers
4914 * that need to tie several hardware interfaces to a single NAPI
4915 * poll scheduler due to HW limitations.
4917 int init_dummy_netdev(struct net_device *dev)
4919 /* Clear everything. Note we don't initialize spinlocks
4920 * as they aren't supposed to be taken by any of the
4921 * NAPI code and this dummy netdev is supposed to be
4922 * used only for NAPI polls
4924 memset(dev, 0, sizeof(struct net_device));
4926 /* make sure we BUG if trying to hit standard
4927 * register/unregister code path
4929 dev->reg_state = NETREG_DUMMY;
4931 /* initialize the ref count */
4932 atomic_set(&dev->refcnt, 1);
4934 /* NAPI wants this */
4935 INIT_LIST_HEAD(&dev->napi_list);
4937 /* a dummy interface is started by default */
4938 set_bit(__LINK_STATE_PRESENT, &dev->state);
4939 set_bit(__LINK_STATE_START, &dev->state);
4943 EXPORT_SYMBOL_GPL(init_dummy_netdev);
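/*
 * Illustrative sketch (not part of the original file): a driver with
 * more NAPI contexts than netdevs hangs the extra ones off a dummy
 * device. my_priv and my_poll are the hypothetical names from the
 * earlier sketches.
 */
static struct net_device my_dummy_dev;	/* never registered */

static void my_extra_napi_init_sketch(struct my_priv *priv)
{
	init_dummy_netdev(&my_dummy_dev);
	netif_napi_add(&my_dummy_dev, &priv->napi, my_poll, 64);
}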
4947 * register_netdev - register a network device
4948 * @dev: device to register
4950 * Take a completed network device structure and add it to the kernel
4951 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4952 * chain. 0 is returned on success. A negative errno code is returned
4953 * on a failure to set up the device, or if the name is a duplicate.
4955 * This is a wrapper around register_netdevice that takes the rtnl semaphore
4956 * and expands the device name if you passed a format string to
4957 * alloc_netdev.
4959 int register_netdev(struct net_device *dev)
4966 * If the name is a format string the caller wants us to do a
4967 * name allocation.
4969 if (strchr(dev->name, '%')) {
4970 err = dev_alloc_name(dev, dev->name);
4975 err = register_netdevice(dev);
4980 EXPORT_SYMBOL(register_netdev);
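/*
 * Illustrative sketch (not part of the original file): the usual
 * probe-time pairing of alloc_etherdev() and register_netdev(), with
 * free_netdev() on failure. struct my_priv is the hypothetical type
 * from the earlier sketch.
 */
static int my_probe_sketch(void)
{
	struct net_device *dev = alloc_etherdev(sizeof(struct my_priv));
	int err;

	if (!dev)
		return -ENOMEM;

	err = register_netdev(dev);	/* takes the RTNL internally */
	if (err)
		free_netdev(dev);
	return err;
}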
4983 * netdev_wait_allrefs - wait until all references are gone.
4985 * This is called when unregistering network devices.
4987 * Any protocol or device that holds a reference should register
4988 * for netdevice notification, and cleanup and put back the
4989 * reference if they receive an UNREGISTER event.
4990 * We can get stuck here if buggy protocols don't correctly
4991 * call dev_put.
4993 static void netdev_wait_allrefs(struct net_device *dev)
4995 unsigned long rebroadcast_time, warning_time;
4997 linkwatch_forget_dev(dev);
4999 rebroadcast_time = warning_time = jiffies;
5000 while (atomic_read(&dev->refcnt) != 0) {
5001 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5004 /* Rebroadcast unregister notification */
5005 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5006 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5007 * should have already handled it the first time */
5009 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5011 /* We must not have linkwatch events
5012 * pending on unregister. If this
5013 * happens, we simply run the queue
5014 * unscheduled, resulting in a noop
5017 linkwatch_run_queue();
5022 rebroadcast_time = jiffies;
5027 if (time_after(jiffies, warning_time + 10 * HZ)) {
5028 printk(KERN_EMERG "unregister_netdevice: "
5029 "waiting for %s to become free. Usage "
5031 dev->name, atomic_read(&dev->refcnt));
5032 warning_time = jiffies;
5041 * register_netdevice(x1);
5042 * register_netdevice(x2);
5044 * unregister_netdevice(y1);
5045 * unregister_netdevice(y2);
5051 * We are invoked by rtnl_unlock().
5052 * This allows us to deal with problems:
5053 * 1) We can delete sysfs objects which invoke hotplug
5054 * without deadlocking with linkwatch via keventd.
5055 * 2) Since we run with the RTNL semaphore not held, we can sleep
5056 * safely in order to wait for the netdev refcnt to drop to zero.
5058 * We must not return until all unregister events added during
5059 * the interval the lock was held have been completed.
5061 void netdev_run_todo(void)
5063 struct list_head list;
5065 /* Snapshot list, allow later requests */
5066 list_replace_init(&net_todo_list, &list);
5070 while (!list_empty(&list)) {
5071 struct net_device *dev
5072 = list_first_entry(&list, struct net_device, todo_list);
5073 list_del(&dev->todo_list);
5075 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5076 printk(KERN_ERR "network todo '%s' but state %d\n",
5077 dev->name, dev->reg_state);
5082 dev->reg_state = NETREG_UNREGISTERED;
5084 on_each_cpu(flush_backlog, dev, 1);
5086 netdev_wait_allrefs(dev);
5089 BUG_ON(atomic_read(&dev->refcnt));
5090 WARN_ON(dev->ip_ptr);
5091 WARN_ON(dev->ip6_ptr);
5092 WARN_ON(dev->dn_ptr);
5094 if (dev->destructor)
5095 dev->destructor(dev);
5097 /* Free network device */
5098 kobject_put(&dev->dev.kobj);
/**
 * dev_txq_stats_fold - fold tx_queues stats
 * @dev: device to get statistics from
 * @stats: struct net_device_stats to hold results
 */
void dev_txq_stats_fold(const struct net_device *dev,
			struct net_device_stats *stats)
{
	unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
	unsigned int i;
	struct netdev_queue *txq;

	for (i = 0; i < dev->num_tx_queues; i++) {
		txq = netdev_get_tx_queue(dev, i);
		tx_bytes   += txq->tx_bytes;
		tx_packets += txq->tx_packets;
		tx_dropped += txq->tx_dropped;
	}
	if (tx_bytes || tx_packets || tx_dropped) {
		stats->tx_bytes   = tx_bytes;
		stats->tx_packets = tx_packets;
		stats->tx_dropped = tx_dropped;
	}
}
EXPORT_SYMBOL(dev_txq_stats_fold);
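
/*
 * Sketch of a driver-side user (example_get_stats is hypothetical):
 * an ndo_get_stats hook that folds the per-queue counters into
 * dev->stats before handing the structure back to the core.
 */
static struct net_device_stats * __maybe_unused
example_get_stats(struct net_device *dev)
{
	/* sum txq->tx_bytes/tx_packets/tx_dropped across all queues */
	dev_txq_stats_fold(dev, &dev->stats);
	return &dev->stats;
}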
/**
 * dev_get_stats - get network device statistics
 * @dev: device to get statistics from
 *
 * Get network statistics from device. The device driver may provide
 * its own method by setting dev->netdev_ops->get_stats; otherwise
 * the internal statistics structure is used.
 */
const struct net_device_stats *dev_get_stats(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_get_stats)
		return ops->ndo_get_stats(dev);

	dev_txq_stats_fold(dev, &dev->stats);
	return &dev->stats;
}
EXPORT_SYMBOL(dev_get_stats);
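
/*
 * Caller-side sketch (purely illustrative; example_dump_stats is not a
 * kernel symbol): read a device's counters through the one entry point
 * above instead of poking at dev->stats directly.
 */
static void __maybe_unused example_dump_stats(struct net_device *dev)
{
	const struct net_device_stats *stats = dev_get_stats(dev);

	printk(KERN_DEBUG "%s: rx %lu tx %lu\n",
	       dev->name, stats->rx_packets, stats->tx_packets);
}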
static void netdev_init_one_queue(struct net_device *dev,
				  struct netdev_queue *queue,
				  void *_unused)
{
	queue->dev = dev;
}

static void netdev_init_queues(struct net_device *dev)
{
	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
	spin_lock_init(&dev->tx_global_lock);
}
/**
 * alloc_netdev_mq - allocate network device
 * @sizeof_priv: size of private data to allocate space for
 * @name: device name format string
 * @setup: callback to initialize device
 * @queue_count: the number of subqueues to allocate
 *
 * Allocates a struct net_device with private data area for driver use
 * and performs basic initialization. Also allocates subqueue structs
 * for each queue on the device at the end of the netdevice.
 */
struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
		void (*setup)(struct net_device *), unsigned int queue_count)
{
	struct netdev_queue *tx;
	struct net_device *dev;
	size_t alloc_size;
	struct net_device *p;
	struct netdev_rx_queue *rx;
	int i;

	BUG_ON(strlen(name) >= sizeof(dev->name));

	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN - 1;

	p = kzalloc(alloc_size, GFP_KERNEL);
	if (!p) {
		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
		return NULL;
	}

	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
	if (!tx) {
		printk(KERN_ERR "alloc_netdev: Unable to allocate "
		       "tx qdiscs.\n");
		goto free_p;
	}

	rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
	if (!rx) {
		printk(KERN_ERR "alloc_netdev: Unable to allocate "
		       "rx queues.\n");
		goto free_tx;
	}

	atomic_set(&rx->count, queue_count);

	/*
	 * Set a pointer to first element in the array which holds the
	 * reference count.
	 */
	for (i = 0; i < queue_count; i++)
		rx[i].first = rx;

	dev = PTR_ALIGN(p, NETDEV_ALIGN);
	dev->padded = (char *)dev - (char *)p;

	if (dev_addr_init(dev))
		goto free_rx;

	dev_net_set(dev, &init_net);

	dev->_tx = tx;
	dev->num_tx_queues = queue_count;
	dev->real_num_tx_queues = queue_count;

	dev->_rx = rx;
	dev->num_rx_queues = queue_count;

	dev->gso_max_size = GSO_MAX_SIZE;

	netdev_init_queues(dev);

	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
	dev->ethtool_ntuple_list.count = 0;
	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	dev->priv_flags = IFF_XMIT_DST_RELEASE;
	setup(dev);
	strcpy(dev->name, name);
	return dev;

free_rx:
	kfree(rx);
free_tx:
	kfree(tx);
free_p:
	kfree(p);
	return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mq);
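
/*
 * Minimal allocation sketch (hypothetical driver): four subqueues,
 * Ethernet defaults via ether_setup(), name expanded later at register
 * time. A real driver would pass its private struct size instead of 0.
 */
static struct net_device * __maybe_unused example_alloc_mq(void)
{
	return alloc_netdev_mq(0, "example%d", ether_setup, 4);
}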
/**
 * free_netdev - free network device
 * @dev: device
 *
 * This function does the last stage of destroying an allocated device
 * interface. The reference to the device object is released.
 * If this is the last reference then it will be freed.
 */
void free_netdev(struct net_device *dev)
{
	struct napi_struct *p, *n;

	release_net(dev_net(dev));

	kfree(dev->_tx);

	/* Flush device addresses */
	dev_addr_flush(dev);

	/* Clear ethtool n-tuple list */
	ethtool_ntuple_flush(dev);

	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
		netif_napi_del(p);

	/* Compatibility with error handling in drivers */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		kfree((char *)dev - dev->padded);
		return;
	}

	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
	dev->reg_state = NETREG_RELEASED;

	/* will free via device release */
	put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);
/**
 * synchronize_net - Synchronize with packet receive processing
 *
 * Wait for packets currently being received to be done.
 * Does not block later packets from starting.
 */
void synchronize_net(void)
{
	might_sleep();
	synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
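
/*
 * Typical-use sketch: after unhooking a packet handler, wait out
 * in-flight receive processing before freeing the handler's state.
 * "example_pt" is a hypothetical packet_type, not defined here.
 */
static void __maybe_unused example_remove_handler(struct packet_type *example_pt)
{
	dev_remove_pack(example_pt);	/* unhook from the ptype lists */
	synchronize_net();		/* let running receivers drain */
	/* now safe to free anything example_pt->func dereferenced */
}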
/**
 * unregister_netdevice_queue - remove device from the kernel
 * @dev: device
 * @head: list
 *
 * This function shuts down a device interface and removes it
 * from the kernel tables.
 * If head is not NULL, the device is queued to be unregistered later.
 *
 * Callers must hold the rtnl semaphore. You may want
 * unregister_netdev() instead of this.
 */
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
{
	ASSERT_RTNL();

	if (head) {
		list_move_tail(&dev->unreg_list, head);
	} else {
		rollback_registered(dev);
		/* Finish processing unregister after unlock */
		net_set_todo(dev);
	}
}
EXPORT_SYMBOL(unregister_netdevice_queue);
/**
 * unregister_netdevice_many - unregister many devices
 * @head: list of devices
 */
void unregister_netdevice_many(struct list_head *head)
{
	struct net_device *dev;

	if (!list_empty(head)) {
		rollback_registered_many(head);
		list_for_each_entry(dev, head, unreg_list)
			net_set_todo(dev);
	}
}
EXPORT_SYMBOL(unregister_netdevice_many);
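
/*
 * Batch-unregister sketch under rtnl (hypothetical caller): queue
 * several devices on one list, then tear them all down with a single
 * pass over the notifier chain.
 */
static void __maybe_unused example_unregister_group(struct net_device *devs[],
						    int n)
{
	LIST_HEAD(kill_list);
	int i;

	ASSERT_RTNL();
	for (i = 0; i < n; i++)
		unregister_netdevice_queue(devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);
}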
/**
 * unregister_netdev - remove device from the kernel
 * @dev: device
 *
 * This function shuts down a device interface and removes it
 * from the kernel tables.
 *
 * This is just a wrapper for unregister_netdevice that takes
 * the rtnl semaphore. In general you want to use this and not
 * unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
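
/*
 * Teardown-order sketch (example_teardown is hypothetical): unregister
 * first so netdev_run_todo() can settle outstanding references, then
 * free the device object.
 */
static void __maybe_unused example_teardown(struct net_device *dev)
{
	unregister_netdev(dev);	/* takes rtnl, queues the todo work */
	free_netdev(dev);	/* final put of the device object */
}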
/**
 * dev_change_net_namespace - move device to different network namespace
 * @dev: device
 * @net: network namespace
 * @pat: If not NULL name pattern to try if the current device name
 *       is already taken in the destination network namespace.
 *
 * This function shuts down a device interface and moves it
 * to a new network namespace. On success 0 is returned, on
 * a failure a negative errno code is returned.
 *
 * Callers must hold the rtnl semaphore.
 */
int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	int err;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Don't allow real devices to be moved when sysfs
	 * is enabled.
	 */
	err = -EINVAL;
	if (dev->dev.parent)
		goto out;

	/* Ensure the device has been registered */
	err = -EINVAL;
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing to do */
	err = 0;
	if (net_eq(dev_net(dev), net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	if (__dev_get_by_name(net, dev->name)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		if (dev_get_valid_name(net, pat, dev->name, 1))
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice unregister_netdevice.
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	err = -ENODEV;
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols that we are about to destroy
	   this device. They should clean all the things.
	*/
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);

	/*
	 * Flush the unicast and multicast chains
	 */
	dev_unicast_flush(dev);
	dev_addr_discard(dev);

	netdev_unregister_kobject(dev);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);

	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex)) {
		int iflink = (dev->iflink == dev->ifindex);
		dev->ifindex = dev_new_index(net);
		if (iflink)
			dev->iflink = dev->ifindex;
	}

	/* Fixup kobjects */
	err = netdev_register_kobject(dev);
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	/*
	 * Prevent userspace races by waiting until the network
	 * device is fully setup before sending notifications.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);

	synchronize_net();
	err = 0;
out:
	return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
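
/*
 * Namespace-move sketch (example_move is hypothetical): under rtnl,
 * move a device into a target namespace, falling back to a "dev%d"
 * pattern if the current name is already taken there.
 */
static int __maybe_unused example_move(struct net_device *dev, struct net *net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "dev%d");
	rtnl_unlock();
	return err;
}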
static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct Qdisc **list_net;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Find end of our output_queue. */
	list_net = &sd->output_queue;
	while (*list_net)
		list_net = &(*list_net)->next_sched;
	/* Append output queue from offline CPU. */
	*list_net = oldsd->output_queue;
	oldsd->output_queue = NULL;

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
		netif_rx(skb);

	return NOTIFY_OK;
}
/**
 * netdev_increment_features - increment feature set by one
 * @all: current feature set
 * @one: new feature set
 * @mask: mask feature set
 *
 * Computes a new feature set after adding a device with feature set
 * @one to the master device with current feature set @all. Will not
 * enable anything that is off in @mask. Returns the new feature set.
 */
unsigned long netdev_increment_features(unsigned long all, unsigned long one,
					unsigned long mask)
{
	/* If device needs checksumming, downgrade to it. */
	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
	else if (mask & NETIF_F_ALL_CSUM) {
		/* If one device supports v4/v6 checksumming, set for all. */
		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
		    !(all & NETIF_F_GEN_CSUM)) {
			all &= ~NETIF_F_ALL_CSUM;
			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
		}

		/* If one device supports hw checksumming, set for all. */
		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
			all &= ~NETIF_F_ALL_CSUM;
			all |= NETIF_F_HW_CSUM;
		}
	}

	one |= NETIF_F_ALL_CSUM;

	one |= all & NETIF_F_ONE_FOR_ALL;
	all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
	all |= one & mask & NETIF_F_ONE_FOR_ALL;

	return all;
}
EXPORT_SYMBOL(netdev_increment_features);
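
/*
 * Sketch of the intended caller pattern (e.g. a bonding-style master
 * recomputing its feature set): start from a seed and fold each slave
 * in, letting the helper narrow the set. The seed choice and the
 * example_compute_features name are illustrative assumptions.
 */
static unsigned long __maybe_unused
example_compute_features(struct net_device *slaves[], int n)
{
	/* seed with everything the master could offer, then narrow */
	unsigned long features = NETIF_F_ALL_CSUM | NETIF_F_SG | NETIF_F_GSO;
	int i;

	for (i = 0; i < n; i++)
		features = netdev_increment_features(features,
						     slaves[i]->features,
						     NETIF_F_ONE_FOR_ALL);
	return features;
}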
static struct hlist_head *netdev_create_hash(void)
{
	int i;
	struct hlist_head *hash;

	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
	if (hash != NULL)
		for (i = 0; i < NETDEV_HASHENTRIES; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}
/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	INIT_LIST_HEAD(&net->dev_base_head);

	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	return 0;

err_idx:
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}
/**
 * netdev_drivername - network driver for the device
 * @dev: network device
 * @buffer: buffer for resulting name
 * @len: size of buffer
 *
 * Determine network driver for device.
 */
char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
{
	const struct device_driver *driver;
	const struct device *parent;

	if (len <= 0 || !buffer)
		return buffer;
	buffer[0] = 0;

	parent = dev->dev.parent;
	if (!parent)
		return buffer;

	driver = parent->driver;
	if (driver && driver->name)
		strlcpy(buffer, driver->name, len);
	return buffer;
}
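
/*
 * Caller sketch mirroring the watchdog's use of this helper: include
 * the driver name in a diagnostic. Buffer size and message text are
 * illustrative; example_report is not a kernel symbol.
 */
static void __maybe_unused example_report(struct net_device *dev)
{
	char drivername[64];

	printk(KERN_WARNING "%s (%s): transmit queue timed out\n",
	       dev->name,
	       netdev_drivername(dev, drivername, sizeof(drivername)));
}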
static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmoveable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
			       __func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}
static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace. Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};
/*
 * Initialize the DEV module. At boot time this walks the device list and
 * unhooks any devices that fail to initialise (normally hardware not
 * present) and leaves us with a valid list of present and active devices.
 */

/*
 * This is called single threaded during boot, so no need
 * to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 * Initialise the packet receive queues.
	 */
	for_each_possible_cpu(i) {
		struct softnet_data *queue;

		queue = &per_cpu(softnet_data, i);
		skb_queue_head_init(&queue->input_pkt_queue);
		queue->completion_queue = NULL;
		INIT_LIST_HEAD(&queue->poll_list);

		queue->csd.func = trigger_softirq;
		queue->csd.info = queue;
		queue->csd.flags = 0;

		queue->backlog.poll = process_backlog;
		queue->backlog.weight = weight_p;
		queue->backlog.gro_list = NULL;
		queue->backlog.gro_count = 0;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present too. Since we now dynamically allocate and free the
	 * loopback device, ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices. This ensures the loopback device
	 * is the first device that appears and the last network device
	 * that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_init();
	dev_mcast_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);
static int __init initialize_hashrnd(void)
{
	get_random_bytes(&hashrnd, sizeof(hashrnd));
	return 0;
}

late_initcall_sync(initialize_hashrnd);