[net-next-2.6.git] / net / core / dev.c
1/*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75#include <asm/uaccess.h>
76#include <asm/system.h>
77#include <linux/bitops.h>
78#include <linux/capability.h>
79#include <linux/cpu.h>
80#include <linux/types.h>
81#include <linux/kernel.h>
82#include <linux/hash.h>
83#include <linux/slab.h>
84#include <linux/sched.h>
85#include <linux/mutex.h>
86#include <linux/string.h>
87#include <linux/mm.h>
88#include <linux/socket.h>
89#include <linux/sockios.h>
90#include <linux/errno.h>
91#include <linux/interrupt.h>
92#include <linux/if_ether.h>
93#include <linux/netdevice.h>
94#include <linux/etherdevice.h>
95#include <linux/ethtool.h>
96#include <linux/notifier.h>
97#include <linux/skbuff.h>
98#include <net/net_namespace.h>
99#include <net/sock.h>
100#include <linux/rtnetlink.h>
101#include <linux/proc_fs.h>
102#include <linux/seq_file.h>
103#include <linux/stat.h>
104#include <net/dst.h>
105#include <net/pkt_sched.h>
106#include <net/checksum.h>
107#include <net/xfrm.h>
108#include <linux/highmem.h>
109#include <linux/init.h>
110#include <linux/kmod.h>
111#include <linux/module.h>
112#include <linux/netpoll.h>
113#include <linux/rcupdate.h>
114#include <linux/delay.h>
115#include <net/wext.h>
116#include <net/iw_handler.h>
117#include <asm/current.h>
118#include <linux/audit.h>
119#include <linux/dmaengine.h>
120#include <linux/err.h>
121#include <linux/ctype.h>
122#include <linux/if_arp.h>
123#include <linux/if_vlan.h>
124#include <linux/ip.h>
125#include <net/ip.h>
126#include <linux/ipv6.h>
127#include <linux/in.h>
128#include <linux/jhash.h>
129#include <linux/random.h>
130#include <trace/events/napi.h>
131#include <trace/events/net.h>
132#include <trace/events/skb.h>
133#include <linux/pci.h>
134#include <linux/inetdevice.h>
135
136#include "net-sysfs.h"
137
138/* Instead of increasing this, you should create a hash table. */
139#define MAX_GRO_SKBS 8
140
141/* This should be increased if a protocol with a bigger head is added. */
142#define GRO_MAX_HEAD (MAX_HEADER + 128)
143
144/*
145 * The list of packet types we will receive (as opposed to discard)
146 * and the routines to invoke.
147 *
148 * Why 16? Because with 16 the only overlap we get on a hash of the
149 * low nibble of the protocol value is RARP/SNAP/X.25.
150 *
151 * NOTE: That is no longer true with the addition of VLAN tags. Not
152 * sure which should go first, but I bet it won't make much
153 * difference if we are running VLANs. The good news is that
154 * this protocol won't be in the list unless compiled in, so
155 * the average user (w/out VLANs) will not be adversely affected.
156 * --BLG
157 *
158 * 0800 IP
159 * 8100 802.1Q VLAN
160 * 0001 802.3
161 * 0002 AX.25
162 * 0004 802.2
163 * 8035 RARP
164 * 0005 SNAP
165 * 0805 X.25
166 * 0806 ARP
167 * 8137 IPX
168 * 0009 Localtalk
169 * 86DD IPv6
170 */
171
172#define PTYPE_HASH_SIZE (16)
173#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
174
175static DEFINE_SPINLOCK(ptype_lock);
176static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
177static struct list_head ptype_all __read_mostly; /* Taps */
178
179/*
180 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
181 * semaphore.
182 *
183 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
184 *
185 * Writers must hold the rtnl semaphore while they loop through the
186 * dev_base_head list, and hold dev_base_lock for writing when they do the
187 * actual updates. This allows pure readers to access the list even
188 * while a writer is preparing to update it.
189 *
190 * To put it another way, dev_base_lock is held for writing only to
191 * protect against pure readers; the rtnl semaphore provides the
192 * protection against other writers.
193 *
194 * See, for example usages, register_netdevice() and
195 * unregister_netdevice(), which must be called with the rtnl
196 * semaphore held.
197 */
198DEFINE_RWLOCK(dev_base_lock);
199EXPORT_SYMBOL(dev_base_lock);
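/*
 * Illustrative sketch (not part of the original file): a pure reader can
 * walk the device list under rcu_read_lock() alone, per the locking rules
 * described above. The function name and printk are hypothetical.
 */
static void example_dump_device_names(struct net *net)
{
	struct net_device *dev;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		printk(KERN_DEBUG "dev %s (ifindex %d)\n",
		       dev->name, dev->ifindex);
	rcu_read_unlock();
}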
200
201static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
202{
203 unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
204 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
205}
206
207static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
208{
209 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
210}
211
212static inline void rps_lock(struct softnet_data *sd)
213{
214#ifdef CONFIG_RPS
215 spin_lock(&sd->input_pkt_queue.lock);
216#endif
217}
218
219static inline void rps_unlock(struct softnet_data *sd)
220{
221#ifdef CONFIG_RPS
222 spin_unlock(&sd->input_pkt_queue.lock);
223#endif
224}
225
226/* Device list insertion */
227static int list_netdevice(struct net_device *dev)
228{
229 struct net *net = dev_net(dev);
230
231 ASSERT_RTNL();
232
233 write_lock_bh(&dev_base_lock);
234 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
235 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
236 hlist_add_head_rcu(&dev->index_hlist,
237 dev_index_hash(net, dev->ifindex));
238 write_unlock_bh(&dev_base_lock);
239 return 0;
240}
241
242/* Device list removal
243 * caller must respect a RCU grace period before freeing/reusing dev
244 */
245static void unlist_netdevice(struct net_device *dev)
246{
247 ASSERT_RTNL();
248
249 /* Unlink dev from the device chain */
250 write_lock_bh(&dev_base_lock);
251 list_del_rcu(&dev->dev_list);
252 hlist_del_rcu(&dev->name_hlist);
253 hlist_del_rcu(&dev->index_hlist);
254 write_unlock_bh(&dev_base_lock);
255}
256
257/*
258 * Our notifier list
259 */
260
261static RAW_NOTIFIER_HEAD(netdev_chain);
262
263/*
264 * Device drivers call our routines to queue packets here. We empty the
265 * queue in the local softnet handler.
266 */
267
268DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
269EXPORT_PER_CPU_SYMBOL(softnet_data);
270
271#ifdef CONFIG_LOCKDEP
272/*
273 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
274 * according to dev->type
275 */
276static const unsigned short netdev_lock_type[] =
277 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
278 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
279 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
280 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
281 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
282 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
283 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
284 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
285 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
286 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
287 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
288 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
289 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
290 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
291 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
292 ARPHRD_VOID, ARPHRD_NONE};
293
294static const char *const netdev_lock_name[] =
295 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
296 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
297 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
298 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
299 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
300 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
301 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
302 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
303 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
304 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
305 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
306 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
307 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
308 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
309 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
310 "_xmit_VOID", "_xmit_NONE"};
311
312static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
313static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
314
315static inline unsigned short netdev_lock_pos(unsigned short dev_type)
316{
317 int i;
318
319 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
320 if (netdev_lock_type[i] == dev_type)
321 return i;
322 /* the last key is used by default */
323 return ARRAY_SIZE(netdev_lock_type) - 1;
324}
325
326static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
327 unsigned short dev_type)
328{
329 int i;
330
331 i = netdev_lock_pos(dev_type);
332 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
333 netdev_lock_name[i]);
334}
335
336static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
337{
338 int i;
339
340 i = netdev_lock_pos(dev->type);
341 lockdep_set_class_and_name(&dev->addr_list_lock,
342 &netdev_addr_lock_key[i],
343 netdev_lock_name[i]);
344}
345#else
346static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
347 unsigned short dev_type)
348{
349}
350static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
351{
352}
353#endif
354
355/*******************************************************************************
356
357 Protocol management and registration routines
358
359*******************************************************************************/
360
361/*
362 * Add a protocol ID to the list. Now that the input handler is
363 * smarter we can dispense with all the messy stuff that used to be
364 * here.
365 *
366 * BEWARE!!! Protocol handlers, mangling input packets,
367 * MUST BE last in hash buckets and checking protocol handlers
368 * MUST start from promiscuous ptype_all chain in net_bh.
369 * It is true now, do not change it.
370 * Explanation: if a protocol handler that mangles packets were
371 * first in the list, it could not tell that the packet is cloned
372 * and must be copied-on-write, so it would modify the clone and
373 * subsequent readers would get a broken packet.
374 * --ANK (980803)
375 */
376
377static inline struct list_head *ptype_head(const struct packet_type *pt)
378{
379 if (pt->type == htons(ETH_P_ALL))
380 return &ptype_all;
381 else
382 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
383}
384
385/**
386 * dev_add_pack - add packet handler
387 * @pt: packet type declaration
388 *
389 * Add a protocol handler to the networking stack. The passed &packet_type
390 * is linked into kernel lists and may not be freed until it has been
391 * removed from the kernel lists.
392 *
393 * This call does not sleep, therefore it cannot
394 * guarantee that all CPUs in the middle of receiving packets
395 * will see the new packet type (until the next received packet).
396 */
397
398void dev_add_pack(struct packet_type *pt)
399{
400 struct list_head *head = ptype_head(pt);
401
402 spin_lock(&ptype_lock);
403 list_add_rcu(&pt->list, head);
404 spin_unlock(&ptype_lock);
405}
406EXPORT_SYMBOL(dev_add_pack);
407
408/**
409 * __dev_remove_pack - remove packet handler
410 * @pt: packet type declaration
411 *
412 * Remove a protocol handler that was previously added to the kernel
413 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
414 * from the kernel lists and can be freed or reused once this function
415 * returns.
416 *
417 * The packet type might still be in use by receivers
418 * and must not be freed until after all the CPUs have gone
419 * through a quiescent state.
420 */
421void __dev_remove_pack(struct packet_type *pt)
422{
423 struct list_head *head = ptype_head(pt);
424 struct packet_type *pt1;
425
426 spin_lock(&ptype_lock);
427
428 list_for_each_entry(pt1, head, list) {
429 if (pt == pt1) {
430 list_del_rcu(&pt->list);
431 goto out;
432 }
433 }
434
435 printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
436out:
437 spin_unlock(&ptype_lock);
438}
439EXPORT_SYMBOL(__dev_remove_pack);
440
441/**
442 * dev_remove_pack - remove packet handler
443 * @pt: packet type declaration
444 *
445 * Remove a protocol handler that was previously added to the kernel
446 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
447 * from the kernel lists and can be freed or reused once this function
448 * returns.
449 *
450 * This call sleeps to guarantee that no CPU is looking at the packet
451 * type after return.
452 */
453void dev_remove_pack(struct packet_type *pt)
454{
455 __dev_remove_pack(pt);
456
457 synchronize_net();
458}
459EXPORT_SYMBOL(dev_remove_pack);
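/*
 * Illustrative sketch (not part of the original file): how a protocol
 * module typically hooks into the receive path with dev_add_pack() and
 * unhooks with dev_remove_pack(). The handler body and the ETH_P_IP
 * choice are hypothetical; the .func signature matches struct packet_type.
 */
static int example_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	/* ... examine skb ... */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_packet_type __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),
	.func = example_rcv,
};

/* dev_add_pack(&example_packet_type) at module init,
 * dev_remove_pack(&example_packet_type) at module exit. */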
460
461/******************************************************************************
462
463 Device Boot-time Settings Routines
464
465*******************************************************************************/
466
467/* Boot time configuration table */
468static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
469
470/**
471 * netdev_boot_setup_add - add new setup entry
472 * @name: name of the device
473 * @map: configured settings for the device
474 *
475 * Adds a new setup entry to the dev_boot_setup list. The function
476 * returns 0 on error and 1 on success. This is a generic routine for
477 * all netdevices.
478 */
479static int netdev_boot_setup_add(char *name, struct ifmap *map)
480{
481 struct netdev_boot_setup *s;
482 int i;
483
484 s = dev_boot_setup;
485 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
486 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
487 memset(s[i].name, 0, sizeof(s[i].name));
488 strlcpy(s[i].name, name, IFNAMSIZ);
489 memcpy(&s[i].map, map, sizeof(s[i].map));
490 break;
491 }
492 }
493
494 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
495}
496
497/**
498 * netdev_boot_setup_check - check boot time settings
499 * @dev: the netdevice
500 *
501 * Check boot-time settings for the device.
502 * Any settings found are applied to the device for use
503 * later during device probing.
504 * Returns 0 if no settings were found, 1 if they were.
505 */
506int netdev_boot_setup_check(struct net_device *dev)
507{
508 struct netdev_boot_setup *s = dev_boot_setup;
509 int i;
510
511 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
512 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
513 !strcmp(dev->name, s[i].name)) {
514 dev->irq = s[i].map.irq;
515 dev->base_addr = s[i].map.base_addr;
516 dev->mem_start = s[i].map.mem_start;
517 dev->mem_end = s[i].map.mem_end;
518 return 1;
519 }
520 }
521 return 0;
522}
523EXPORT_SYMBOL(netdev_boot_setup_check);
524
525
526/**
527 * netdev_boot_base - get address from boot time settings
528 * @prefix: prefix for network device
529 * @unit: id for network device
530 *
531 * Check boot time settings for the base address of device.
532 * The found settings are set for the device to be used
533 * later in the device probing.
534 * Returns 0 if no settings found.
535 */
536unsigned long netdev_boot_base(const char *prefix, int unit)
537{
538 const struct netdev_boot_setup *s = dev_boot_setup;
539 char name[IFNAMSIZ];
540 int i;
541
542 sprintf(name, "%s%d", prefix, unit);
543
544 /*
545 * If device already registered then return base of 1
546 * to indicate not to probe for this interface
547 */
548 if (__dev_get_by_name(&init_net, name))
549 return 1;
550
551 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
552 if (!strcmp(name, s[i].name))
553 return s[i].map.base_addr;
554 return 0;
555}
556
557/*
558 * Saves at boot time configured settings for any netdevice.
559 */
560int __init netdev_boot_setup(char *str)
561{
562 int ints[5];
563 struct ifmap map;
564
565 str = get_options(str, ARRAY_SIZE(ints), ints);
566 if (!str || !*str)
567 return 0;
568
569 /* Save settings */
570 memset(&map, 0, sizeof(map));
571 if (ints[0] > 0)
572 map.irq = ints[1];
573 if (ints[0] > 1)
574 map.base_addr = ints[2];
575 if (ints[0] > 2)
576 map.mem_start = ints[3];
577 if (ints[0] > 3)
578 map.mem_end = ints[4];
579
580 /* Add new entry to the list */
581 return netdev_boot_setup_add(str, &map);
582}
583
584__setup("netdev=", netdev_boot_setup);
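/*
 * Illustrative note (not part of the original file): given the parsing
 * above, the boot parameter takes up to four integers (irq, base_addr,
 * mem_start, mem_end) followed by the interface name, e.g.
 *
 *	netdev=5,0x340,0,0,eth0
 *
 * The specific values shown are hypothetical.
 */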
585
586/*******************************************************************************
587
588 Device Interface Subroutines
589
590*******************************************************************************/
591
592/**
593 * __dev_get_by_name - find a device by its name
594 * @net: the applicable net namespace
595 * @name: name to find
596 *
597 * Find an interface by name. Must be called under RTNL semaphore
598 * or @dev_base_lock. If the name is found a pointer to the device
599 * is returned. If the name is not found then %NULL is returned. The
600 * reference counters are not incremented so the caller must be
601 * careful with locks.
602 */
603
604struct net_device *__dev_get_by_name(struct net *net, const char *name)
605{
606 struct hlist_node *p;
607 struct net_device *dev;
608 struct hlist_head *head = dev_name_hash(net, name);
609
610 hlist_for_each_entry(dev, p, head, name_hlist)
611 if (!strncmp(dev->name, name, IFNAMSIZ))
612 return dev;
613
614 return NULL;
615}
616EXPORT_SYMBOL(__dev_get_by_name);
617
618/**
619 * dev_get_by_name_rcu - find a device by its name
620 * @net: the applicable net namespace
621 * @name: name to find
622 *
623 * Find an interface by name.
624 * If the name is found a pointer to the device is returned.
625 * If the name is not found then %NULL is returned.
626 * The reference counters are not incremented so the caller must be
627 * careful with locks. The caller must hold RCU lock.
628 */
629
630struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
631{
632 struct hlist_node *p;
633 struct net_device *dev;
634 struct hlist_head *head = dev_name_hash(net, name);
635
636 hlist_for_each_entry_rcu(dev, p, head, name_hlist)
637 if (!strncmp(dev->name, name, IFNAMSIZ))
638 return dev;
639
640 return NULL;
641}
642EXPORT_SYMBOL(dev_get_by_name_rcu);
643
644/**
645 * dev_get_by_name - find a device by its name
646 * @net: the applicable net namespace
647 * @name: name to find
648 *
649 * Find an interface by name. This can be called from any
650 * context and does its own locking. The returned handle has
651 * the usage count incremented and the caller must use dev_put() to
652 * release it when it is no longer needed. %NULL is returned if no
653 * matching device is found.
654 */
655
656struct net_device *dev_get_by_name(struct net *net, const char *name)
657{
658 struct net_device *dev;
659
660 rcu_read_lock();
661 dev = dev_get_by_name_rcu(net, name);
662 if (dev)
663 dev_hold(dev);
664 rcu_read_unlock();
665 return dev;
666}
667EXPORT_SYMBOL(dev_get_by_name);
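/*
 * Illustrative sketch (not part of the original file): a reference taken
 * with dev_get_by_name() must be released with dev_put(). The function
 * name is hypothetical.
 */
static int example_get_mtu_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	int mtu;

	dev = dev_get_by_name(net, name);
	if (!dev)
		return -ENODEV;
	mtu = dev->mtu;
	dev_put(dev);
	return mtu;
}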
668
669/**
670 * __dev_get_by_index - find a device by its ifindex
671 * @net: the applicable net namespace
672 * @ifindex: index of device
673 *
674 * Search for an interface by index. Returns %NULL if the device
675 * is not found or a pointer to the device. The device has not
676 * had its reference counter increased so the caller must be careful
677 * about locking. The caller must hold either the RTNL semaphore
678 * or @dev_base_lock.
679 */
680
681struct net_device *__dev_get_by_index(struct net *net, int ifindex)
682{
683 struct hlist_node *p;
684 struct net_device *dev;
685 struct hlist_head *head = dev_index_hash(net, ifindex);
686
687 hlist_for_each_entry(dev, p, head, index_hlist)
688 if (dev->ifindex == ifindex)
689 return dev;
690
691 return NULL;
692}
693EXPORT_SYMBOL(__dev_get_by_index);
694
695/**
696 * dev_get_by_index_rcu - find a device by its ifindex
697 * @net: the applicable net namespace
698 * @ifindex: index of device
699 *
700 * Search for an interface by index. Returns %NULL if the device
701 * is not found or a pointer to the device. The device has not
702 * had its reference counter increased so the caller must be careful
703 * about locking. The caller must hold RCU lock.
704 */
705
706struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
707{
708 struct hlist_node *p;
709 struct net_device *dev;
710 struct hlist_head *head = dev_index_hash(net, ifindex);
711
712 hlist_for_each_entry_rcu(dev, p, head, index_hlist)
713 if (dev->ifindex == ifindex)
714 return dev;
715
716 return NULL;
717}
718EXPORT_SYMBOL(dev_get_by_index_rcu);
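/*
 * Illustrative sketch (not part of the original file): a caller that only
 * needs the device inside an RCU read-side section can use the _rcu lookup
 * and skip the reference count entirely. The function name is hypothetical.
 */
static int example_get_mtu_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	int mtu = -ENODEV;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		mtu = dev->mtu;
	rcu_read_unlock();
	return mtu;
}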
719
720
721/**
722 * dev_get_by_index - find a device by its ifindex
723 * @net: the applicable net namespace
724 * @ifindex: index of device
725 *
726 * Search for an interface by index. Returns NULL if the device
727 * is not found or a pointer to the device. The device returned has
728 * had a reference added and the pointer is safe until the user calls
729 * dev_put to indicate they have finished with it.
730 */
731
732struct net_device *dev_get_by_index(struct net *net, int ifindex)
733{
734 struct net_device *dev;
735
736 rcu_read_lock();
737 dev = dev_get_by_index_rcu(net, ifindex);
738 if (dev)
739 dev_hold(dev);
740 rcu_read_unlock();
741 return dev;
742}
743EXPORT_SYMBOL(dev_get_by_index);
744
745/**
746 * dev_getbyhwaddr - find a device by its hardware address
747 * @net: the applicable net namespace
748 * @type: media type of device
749 * @ha: hardware address
750 *
751 * Search for an interface by MAC address. Returns NULL if the device
752 * is not found or a pointer to the device. The caller must hold the
753 * rtnl semaphore. The returned device has not had its ref count increased
754 * and the caller must therefore be careful about locking
755 *
756 * BUGS:
757 * If the API was consistent this would be __dev_get_by_hwaddr
758 */
759
760struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
761{
762 struct net_device *dev;
763
764 ASSERT_RTNL();
765
766 for_each_netdev(net, dev)
767 if (dev->type == type &&
768 !memcmp(dev->dev_addr, ha, dev->addr_len))
769 return dev;
770
771 return NULL;
772}
773EXPORT_SYMBOL(dev_getbyhwaddr);
774
775struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
776{
777 struct net_device *dev;
778
779 ASSERT_RTNL();
780 for_each_netdev(net, dev)
781 if (dev->type == type)
782 return dev;
783
784 return NULL;
785}
786EXPORT_SYMBOL(__dev_getfirstbyhwtype);
787
788struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
789{
790 struct net_device *dev, *ret = NULL;
791
792 rcu_read_lock();
793 for_each_netdev_rcu(net, dev)
794 if (dev->type == type) {
795 dev_hold(dev);
796 ret = dev;
797 break;
798 }
799 rcu_read_unlock();
800 return ret;
801}
802EXPORT_SYMBOL(dev_getfirstbyhwtype);
803
804/**
805 * dev_get_by_flags_rcu - find any device with given flags
806 * @net: the applicable net namespace
807 * @if_flags: IFF_* values
808 * @mask: bitmask of bits in if_flags to check
809 *
810 * Search for any interface with the given flags. Returns NULL if a device
811 * is not found or a pointer to the device. Must be called inside
812 * rcu_read_lock(), and result refcount is unchanged.
813 */
814
815struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
816 unsigned short mask)
817{
818 struct net_device *dev, *ret;
819
820 ret = NULL;
821 for_each_netdev_rcu(net, dev) {
822 if (((dev->flags ^ if_flags) & mask) == 0) {
823 ret = dev;
824 break;
825 }
826 }
827 return ret;
828}
829EXPORT_SYMBOL(dev_get_by_flags_rcu);
830
831/**
832 * dev_valid_name - check if name is okay for network device
833 * @name: name string
834 *
835 * Network device names need to be valid file names
836 * to allow sysfs to work. We also disallow any kind of
837 * whitespace.
838 */
839int dev_valid_name(const char *name)
840{
841 if (*name == '\0')
842 return 0;
843 if (strlen(name) >= IFNAMSIZ)
844 return 0;
845 if (!strcmp(name, ".") || !strcmp(name, ".."))
846 return 0;
847
848 while (*name) {
849 if (*name == '/' || isspace(*name))
850 return 0;
851 name++;
852 }
853 return 1;
854}
855EXPORT_SYMBOL(dev_valid_name);
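/*
 * Illustrative examples (not part of the original file) of what
 * dev_valid_name() above accepts and rejects:
 *
 *	dev_valid_name("eth0")   -> 1
 *	dev_valid_name("vlan.5") -> 1
 *	dev_valid_name("")       -> 0	(empty)
 *	dev_valid_name("a b")    -> 0	(whitespace)
 *	dev_valid_name("..")     -> 0	(reserved)
 *	dev_valid_name("a/b")    -> 0	('/' not allowed)
 */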
856
857/**
858 * __dev_alloc_name - allocate a name for a device
859 * @net: network namespace to allocate the device name in
860 * @name: name format string
861 * @buf: scratch buffer and result name string
862 *
863 * Passed a format string - e.g. "lt%d" - it will try to find a suitable
864 * id. It scans the list of devices to build up a free map, then chooses
865 * the first empty slot. The caller must hold the dev_base or rtnl lock
866 * while allocating the name and adding the device in order to avoid
867 * duplicates.
868 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
869 * Returns the number of the unit assigned or a negative errno code.
870 */
871
872static int __dev_alloc_name(struct net *net, const char *name, char *buf)
873{
874 int i = 0;
875 const char *p;
876 const int max_netdevices = 8*PAGE_SIZE;
877 unsigned long *inuse;
878 struct net_device *d;
879
880 p = strnchr(name, IFNAMSIZ-1, '%');
881 if (p) {
882 /*
883 * Verify the string as this thing may have come from
884 * the user. There must be either one "%d" and no other "%"
885 * characters.
886 */
887 if (p[1] != 'd' || strchr(p + 2, '%'))
888 return -EINVAL;
889
890 /* Use one page as a bit array of possible slots */
891 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
892 if (!inuse)
893 return -ENOMEM;
894
895 for_each_netdev(net, d) {
896 if (!sscanf(d->name, name, &i))
897 continue;
898 if (i < 0 || i >= max_netdevices)
899 continue;
900
901 /* avoid cases where sscanf is not exact inverse of printf */
902 snprintf(buf, IFNAMSIZ, name, i);
903 if (!strncmp(buf, d->name, IFNAMSIZ))
904 set_bit(i, inuse);
905 }
906
907 i = find_first_zero_bit(inuse, max_netdevices);
908 free_page((unsigned long) inuse);
909 }
910
911 if (buf != name)
912 snprintf(buf, IFNAMSIZ, name, i);
913 if (!__dev_get_by_name(net, buf))
914 return i;
915
916 /* It is possible to run out of possible slots
917 * when the name is long and there isn't enough space left
918 * for the digits, or if all bits are used.
919 */
920 return -ENFILE;
921}
922
923/**
924 * dev_alloc_name - allocate a name for a device
925 * @dev: device
926 * @name: name format string
927 *
928 * Passed a format string - e.g. "lt%d" - it will try to find a suitable
929 * id. It scans the list of devices to build up a free map, then chooses
930 * the first empty slot. The caller must hold the dev_base or rtnl lock
931 * while allocating the name and adding the device in order to avoid
932 * duplicates.
933 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
934 * Returns the number of the unit assigned or a negative errno code.
935 */
936
937int dev_alloc_name(struct net_device *dev, const char *name)
938{
939 char buf[IFNAMSIZ];
940 struct net *net;
941 int ret;
942
943 BUG_ON(!dev_net(dev));
944 net = dev_net(dev);
945 ret = __dev_alloc_name(net, name, buf);
946 if (ret >= 0)
947 strlcpy(dev->name, buf, IFNAMSIZ);
948 return ret;
949}
950EXPORT_SYMBOL(dev_alloc_name);
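/*
 * Illustrative sketch (not part of the original file): a driver wanting
 * auto-numbered interface names calls dev_alloc_name() (directly or via
 * register_netdevice()) with a "%d" format before registration. The
 * "foo%d" prefix and function name are hypothetical.
 */
static int example_assign_name(struct net_device *dev)
{
	int unit;

	unit = dev_alloc_name(dev, "foo%d");	/* e.g. dev->name becomes "foo0" */
	if (unit < 0)
		return unit;
	return 0;
}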
951
952static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
953{
954 struct net *net;
955
956 BUG_ON(!dev_net(dev));
957 net = dev_net(dev);
958
959 if (!dev_valid_name(name))
960 return -EINVAL;
961
962 if (fmt && strchr(name, '%'))
963 return dev_alloc_name(dev, name);
964 else if (__dev_get_by_name(net, name))
965 return -EEXIST;
966 else if (dev->name != name)
967 strlcpy(dev->name, name, IFNAMSIZ);
968
969 return 0;
970}
971
972/**
973 * dev_change_name - change name of a device
974 * @dev: device
975 * @newname: name (or format string) must be at least IFNAMSIZ
976 *
977 * Change the name of a device. A format string such as "eth%d"
978 * can be passed for wildcarding.
979 */
980int dev_change_name(struct net_device *dev, const char *newname)
981{
982 char oldname[IFNAMSIZ];
983 int err = 0;
984 int ret;
985 struct net *net;
986
987 ASSERT_RTNL();
988 BUG_ON(!dev_net(dev));
989
990 net = dev_net(dev);
991 if (dev->flags & IFF_UP)
992 return -EBUSY;
993
994 if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
995 return 0;
996
997 memcpy(oldname, dev->name, IFNAMSIZ);
998
999 err = dev_get_valid_name(dev, newname, 1);
1000 if (err < 0)
1001 return err;
1002
1003rollback:
1004 ret = device_rename(&dev->dev, dev->name);
1005 if (ret) {
1006 memcpy(dev->name, oldname, IFNAMSIZ);
1007 return ret;
1008 }
1009
1010 write_lock_bh(&dev_base_lock);
1011 hlist_del(&dev->name_hlist);
1012 write_unlock_bh(&dev_base_lock);
1013
1014 synchronize_rcu();
1015
1016 write_lock_bh(&dev_base_lock);
1017 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1018 write_unlock_bh(&dev_base_lock);
1019
1020 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1021 ret = notifier_to_errno(ret);
1022
1023 if (ret) {
1024 /* err >= 0 after dev_alloc_name() or stores the first errno */
1025 if (err >= 0) {
1026 err = ret;
1027 memcpy(dev->name, oldname, IFNAMSIZ);
1028 goto rollback;
1029 } else {
1030 printk(KERN_ERR
1031 "%s: name change rollback failed: %d.\n",
1032 dev->name, ret);
1033 }
1034 }
1035
1036 return err;
1037}
1038
1039/**
1040 * dev_set_alias - change ifalias of a device
1041 * @dev: device
1042 * @alias: name up to IFALIASZ
1043 * @len: limit of bytes to copy from info
1044 *
1045 * Set the ifalias for a device.
1046 */
1047int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1048{
1049 ASSERT_RTNL();
1050
1051 if (len >= IFALIASZ)
1052 return -EINVAL;
1053
1054 if (!len) {
1055 if (dev->ifalias) {
1056 kfree(dev->ifalias);
1057 dev->ifalias = NULL;
1058 }
1059 return 0;
1060 }
1061
1062 dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1063 if (!dev->ifalias)
1064 return -ENOMEM;
1065
1066 strlcpy(dev->ifalias, alias, len+1);
1067 return len;
1068}
1069
1070
1071/**
1072 * netdev_features_change - device changes features
1073 * @dev: device to cause notification
1074 *
1075 * Called to indicate a device has changed features.
1076 */
1077void netdev_features_change(struct net_device *dev)
1078{
1079 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1080}
1081EXPORT_SYMBOL(netdev_features_change);
1082
1083/**
1084 * netdev_state_change - device changes state
1085 * @dev: device to cause notification
1086 *
1087 * Called to indicate a device has changed state. This function calls
1088 * the notifier chains for netdev_chain and sends a NEWLINK message
1089 * to the routing socket.
1090 */
1091void netdev_state_change(struct net_device *dev)
1092{
1093 if (dev->flags & IFF_UP) {
1094 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1095 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1096 }
1097}
1098EXPORT_SYMBOL(netdev_state_change);
1099
1100int netdev_bonding_change(struct net_device *dev, unsigned long event)
1101{
1102 return call_netdevice_notifiers(event, dev);
1103}
1104EXPORT_SYMBOL(netdev_bonding_change);
1105
1106/**
1107 * dev_load - load a network module
1108 * @net: the applicable net namespace
1109 * @name: name of interface
1110 *
1111 * If a network interface is not present and the process has suitable
1112 * privileges this function loads the module. If module loading is not
1113 * available in this kernel then it becomes a nop.
1114 */
1115
1116void dev_load(struct net *net, const char *name)
1117{
1118 struct net_device *dev;
1119
1120 rcu_read_lock();
1121 dev = dev_get_by_name_rcu(net, name);
1122 rcu_read_unlock();
1123
1124 if (!dev && capable(CAP_NET_ADMIN))
1125 request_module("%s", name);
1126}
1127EXPORT_SYMBOL(dev_load);
1128
1129static int __dev_open(struct net_device *dev)
1130{
1131 const struct net_device_ops *ops = dev->netdev_ops;
1132 int ret;
1133
1134 ASSERT_RTNL();
1135
1136 /*
1137 * Is it even present?
1138 */
1139 if (!netif_device_present(dev))
1140 return -ENODEV;
1141
1142 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1143 ret = notifier_to_errno(ret);
1144 if (ret)
1145 return ret;
1146
1147 /*
1148 * Call device private open method
1149 */
1150 set_bit(__LINK_STATE_START, &dev->state);
1151
1152 if (ops->ndo_validate_addr)
1153 ret = ops->ndo_validate_addr(dev);
1154
1155 if (!ret && ops->ndo_open)
1156 ret = ops->ndo_open(dev);
1157
1158 /*
1159 * If it went open OK then:
1160 */
1161
1162 if (ret)
1163 clear_bit(__LINK_STATE_START, &dev->state);
1164 else {
1165 /*
1166 * Set the flags.
1167 */
1168 dev->flags |= IFF_UP;
1169
1170 /*
1171 * Enable NET_DMA
1172 */
1173 net_dmaengine_get();
1174
1175 /*
1176 * Initialize multicasting status
1177 */
1178 dev_set_rx_mode(dev);
1179
1180 /*
1181 * Wakeup transmit queue engine
1182 */
1183 dev_activate(dev);
1184 }
1185
1186 return ret;
1187}
1188
1189/**
1190 * dev_open - prepare an interface for use.
1191 * @dev: device to open
1192 *
1193 * Takes a device from down to up state. The device's private open
1194 * function is invoked and then the multicast lists are loaded. Finally
1195 * the device is moved into the up state and a %NETDEV_UP message is
1196 * sent to the netdev notifier chain.
1197 *
1198 * Calling this function on an active interface is a nop. On a failure
1199 * a negative errno code is returned.
1200 */
1201int dev_open(struct net_device *dev)
1202{
1203 int ret;
1204
1205 /*
1206 * Is it already up?
1207 */
1208 if (dev->flags & IFF_UP)
1209 return 0;
1210
1211 /*
1212 * Open device
1213 */
1214 ret = __dev_open(dev);
1215 if (ret < 0)
1216 return ret;
1217
1218 /*
1219 * ... and announce new interface.
1220 */
1221 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1222 call_netdevice_notifiers(NETDEV_UP, dev);
1223
1224 return ret;
1225}
1226EXPORT_SYMBOL(dev_open);
1227
1228static int __dev_close(struct net_device *dev)
1229{
1230 const struct net_device_ops *ops = dev->netdev_ops;
1231
1232 ASSERT_RTNL();
1233 might_sleep();
1234
1235 /*
1236 * Tell people we are going down, so that they can
1237 * prepare for it while the device is still operating.
1238 */
1239 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1240
1241 clear_bit(__LINK_STATE_START, &dev->state);
1242
1243 /* Synchronize to the scheduled poll. We cannot touch the poll list;
1244 * it may even be on a different CPU. So just clear netif_running().
1245 *
1246 * dev->stop() will invoke napi_disable() on all of its
1247 * napi_struct instances on this device.
1248 */
1249 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1250
1251 dev_deactivate(dev);
1252
1253 /*
1254 * Call the device specific close. This cannot fail.
1255 * Only if device is UP
1256 *
1257 * We allow it to be called even after a DETACH hot-plug
1258 * event.
1259 */
1260 if (ops->ndo_stop)
1261 ops->ndo_stop(dev);
1262
1263 /*
1264 * Device is now down.
1265 */
1266
1267 dev->flags &= ~IFF_UP;
1268
1269 /*
1270 * Shutdown NET_DMA
1271 */
1272 net_dmaengine_put();
1273
1274 return 0;
1275}
1276
1277/**
1278 * dev_close - shutdown an interface.
1279 * @dev: device to shutdown
1280 *
1281 * This function moves an active device into down state. A
1282 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1283 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1284 * chain.
1285 */
1286int dev_close(struct net_device *dev)
1287{
1288 if (!(dev->flags & IFF_UP))
1289 return 0;
1290
1291 __dev_close(dev);
1292
1293 /*
1294 * Tell people we are down
1295 */
1296 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1297 call_netdevice_notifiers(NETDEV_DOWN, dev);
1298
1299 return 0;
1300}
1301EXPORT_SYMBOL(dev_close);
1302
1303
1304/**
1305 * dev_disable_lro - disable Large Receive Offload on a device
1306 * @dev: device
1307 *
1308 * Disable Large Receive Offload (LRO) on a net device. Must be
1309 * called under RTNL. This is needed if received packets may be
1310 * forwarded to another interface.
1311 */
1312void dev_disable_lro(struct net_device *dev)
1313{
1314 if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1315 dev->ethtool_ops->set_flags) {
1316 u32 flags = dev->ethtool_ops->get_flags(dev);
1317 if (flags & ETH_FLAG_LRO) {
1318 flags &= ~ETH_FLAG_LRO;
1319 dev->ethtool_ops->set_flags(dev, flags);
1320 }
1321 }
1322 WARN_ON(dev->features & NETIF_F_LRO);
1323}
1324EXPORT_SYMBOL(dev_disable_lro);
1325
1326
1327static int dev_boot_phase = 1;
1328
1329/*
1330 * Device change register/unregister. These are not inline or static
1331 * as we export them to the world.
1332 */
1333
1334/**
1335 * register_netdevice_notifier - register a network notifier block
1336 * @nb: notifier
1337 *
1338 * Register a notifier to be called when network device events occur.
1339 * The notifier passed is linked into the kernel structures and must
1340 * not be reused until it has been unregistered. A negative errno code
1341 * is returned on a failure.
1342 *
1343 * When registered, all registration and up events are replayed
1344 * to the new notifier to allow the device to have a race-free
1345 * view of the network device list.
1346 */
1347
1348int register_netdevice_notifier(struct notifier_block *nb)
1349{
1350 struct net_device *dev;
1351 struct net_device *last;
1352 struct net *net;
1353 int err;
1354
1355 rtnl_lock();
1356 err = raw_notifier_chain_register(&netdev_chain, nb);
1357 if (err)
1358 goto unlock;
1359 if (dev_boot_phase)
1360 goto unlock;
1361 for_each_net(net) {
1362 for_each_netdev(net, dev) {
1363 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1364 err = notifier_to_errno(err);
1365 if (err)
1366 goto rollback;
1367
1368 if (!(dev->flags & IFF_UP))
1369 continue;
1370
1371 nb->notifier_call(nb, NETDEV_UP, dev);
1372 }
1373 }
1374
1375unlock:
1376 rtnl_unlock();
1377 return err;
1378
1379rollback:
1380 last = dev;
1381 for_each_net(net) {
1382 for_each_netdev(net, dev) {
1383 if (dev == last)
1384 break;
1385
1386 if (dev->flags & IFF_UP) {
1387 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1388 nb->notifier_call(nb, NETDEV_DOWN, dev);
1389 }
1390 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1391 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1392 }
1393 }
1394
1395 raw_notifier_chain_unregister(&netdev_chain, nb);
1396 goto unlock;
1397}
1398EXPORT_SYMBOL(register_netdevice_notifier);
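/*
 * Illustrative sketch (not part of the original file): a minimal netdevice
 * notifier. The events are the NETDEV_* values delivered through
 * call_netdevice_notifiers() below; the handler name is hypothetical.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	switch (event) {
	case NETDEV_UP:
		printk(KERN_DEBUG "%s is up\n", dev->name);
		break;
	case NETDEV_GOING_DOWN:
		printk(KERN_DEBUG "%s is going down\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier = {
	.notifier_call = example_netdev_event,
};

/* register_netdevice_notifier(&example_netdev_notifier) at module init,
 * unregister_netdevice_notifier(&example_netdev_notifier) at module exit. */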
1399
1400/**
1401 * unregister_netdevice_notifier - unregister a network notifier block
1402 * @nb: notifier
1403 *
1404 * Unregister a notifier previously registered by
1405 * register_netdevice_notifier(). The notifier is unlinked from the
1406 * kernel structures and may then be reused. A negative errno code
1407 * is returned on a failure.
1408 */
1409
1410int unregister_netdevice_notifier(struct notifier_block *nb)
1411{
1412 int err;
1413
1414 rtnl_lock();
1415 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1416 rtnl_unlock();
1417 return err;
1418}
1419EXPORT_SYMBOL(unregister_netdevice_notifier);
1420
1421/**
1422 * call_netdevice_notifiers - call all network notifier blocks
1423 * @val: value passed unmodified to notifier function
1424 * @dev: net_device pointer passed unmodified to notifier function
1425 *
1426 * Call all network notifier blocks. Parameters and return value
1427 * are as for raw_notifier_call_chain().
1428 */
1429
1430int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1431{
1432 ASSERT_RTNL();
1433 return raw_notifier_call_chain(&netdev_chain, val, dev);
1434}
1435
1436/* When > 0 there are consumers of rx skb time stamps */
1437static atomic_t netstamp_needed = ATOMIC_INIT(0);
1438
1439void net_enable_timestamp(void)
1440{
1441 atomic_inc(&netstamp_needed);
1442}
1443EXPORT_SYMBOL(net_enable_timestamp);
1444
1445void net_disable_timestamp(void)
1446{
1447 atomic_dec(&netstamp_needed);
1448}
1449EXPORT_SYMBOL(net_disable_timestamp);
1450
1451static inline void net_timestamp_set(struct sk_buff *skb)
1452{
1453 if (atomic_read(&netstamp_needed))
1454 __net_timestamp(skb);
1455 else
1456 skb->tstamp.tv64 = 0;
1457}
1458
1459static inline void net_timestamp_check(struct sk_buff *skb)
1460{
1461 if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1462 __net_timestamp(skb);
1463}
1464
1465/**
1466 * dev_forward_skb - loopback an skb to another netif
1467 *
1468 * @dev: destination network device
1469 * @skb: buffer to forward
1470 *
1471 * return values:
1472 * NET_RX_SUCCESS (no congestion)
1473 * NET_RX_DROP (packet was dropped, but freed)
1474 *
1475 * dev_forward_skb can be used for injecting an skb from the
1476 * start_xmit function of one device into the receive queue
1477 * of another device.
1478 *
1479 * The receiving device may be in another namespace, so
1480 * we have to clear all information in the skb that could
1481 * impact namespace isolation.
1482 */
1483int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1484{
1485 skb_orphan(skb);
1486 nf_reset(skb);
1487
1488 if (unlikely(!(dev->flags & IFF_UP) ||
1489 (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
1490 atomic_long_inc(&dev->rx_dropped);
1491 kfree_skb(skb);
1492 return NET_RX_DROP;
1493 }
1494 skb_set_dev(skb, dev);
1495 skb->tstamp.tv64 = 0;
1496 skb->pkt_type = PACKET_HOST;
1497 skb->protocol = eth_type_trans(skb, dev);
1498 return netif_rx(skb);
1499}
1500EXPORT_SYMBOL_GPL(dev_forward_skb);
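/*
 * Illustrative sketch (not part of the original file): a veth-style virtual
 * device can hand frames to its peer's receive path from ndo_start_xmit()
 * via dev_forward_skb(). The use of ml_priv as the peer pointer and the
 * function name are hypothetical.
 */
static netdev_tx_t example_pair_xmit(struct sk_buff *skb,
				     struct net_device *dev)
{
	struct net_device *peer = dev->ml_priv;	/* hypothetical peer pointer */
	unsigned int len = skb->len;

	if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS) {
		dev->stats.tx_packets++;
		dev->stats.tx_bytes += len;
	}
	return NETDEV_TX_OK;
}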
1501
1502/*
1503 * Support routine. Sends outgoing frames to any network
1504 * taps currently in use.
1505 */
1506
1507static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1508{
1509 struct packet_type *ptype;
1510
1511#ifdef CONFIG_NET_CLS_ACT
1512 if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1513 net_timestamp_set(skb);
1514#else
1515 net_timestamp_set(skb);
1516#endif
1517
1518 rcu_read_lock();
1519 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1520 /* Never send packets back to the socket
1521 * they originated from - MvS (miquels@drinkel.ow.org)
1522 */
1523 if ((ptype->dev == dev || !ptype->dev) &&
1524 (ptype->af_packet_priv == NULL ||
1525 (struct sock *)ptype->af_packet_priv != skb->sk)) {
1526 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1527 if (!skb2)
1528 break;
1529
1530 /* skb->nh should be correctly
1531 set by sender, so that the second statement is
1532 just protection against buggy protocols.
1533 */
1534 skb_reset_mac_header(skb2);
1535
1536 if (skb_network_header(skb2) < skb2->data ||
1537 skb2->network_header > skb2->tail) {
1538 if (net_ratelimit())
1539 printk(KERN_CRIT "protocol %04x is "
1540 "buggy, dev %s\n",
1541 ntohs(skb2->protocol),
1542 dev->name);
1543 skb_reset_network_header(skb2);
1544 }
1545
1546 skb2->transport_header = skb2->network_header;
1547 skb2->pkt_type = PACKET_OUTGOING;
1548 ptype->func(skb2, skb->dev, ptype, skb->dev);
1549 }
1550 }
1551 rcu_read_unlock();
1552}
1553
1554/*
1555 * Routine to help set real_num_tx_queues. To avoid skbs being mapped to queues
1556 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1557 */
1558int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1559{
1560 if (txq < 1 || txq > dev->num_tx_queues)
1561 return -EINVAL;
1562
1563 if (dev->reg_state == NETREG_REGISTERED) {
1564 ASSERT_RTNL();
1565
1566 if (txq < dev->real_num_tx_queues)
1567 qdisc_reset_all_tx_gt(dev, txq);
1568 }
1569
1570 dev->real_num_tx_queues = txq;
1571 return 0;
1572}
1573EXPORT_SYMBOL(netif_set_real_num_tx_queues);
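/*
 * Illustrative sketch (not part of the original file): a multiqueue driver
 * that allocated more TX queues than it currently needs can shrink the
 * number in use at runtime, under RTNL. The queue count of 2 is arbitrary
 * and the function name is hypothetical.
 */
static int example_shrink_tx_queues(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = netif_set_real_num_tx_queues(dev, 2);
	rtnl_unlock();
	return err;
}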
1574
1575#ifdef CONFIG_RPS
1576/**
1577 * netif_set_real_num_rx_queues - set actual number of RX queues used
1578 * @dev: Network device
1579 * @rxq: Actual number of RX queues
1580 *
1581 * This must be called either with the rtnl_lock held or before
1582 * registration of the net device. Returns 0 on success, or a
1583 * negative error code. If called before registration, it always
1584 * succeeds.
1585 */
1586int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1587{
1588 int rc;
1589
1590 if (rxq < 1 || rxq > dev->num_rx_queues)
1591 return -EINVAL;
1592
1593 if (dev->reg_state == NETREG_REGISTERED) {
1594 ASSERT_RTNL();
1595
1596 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1597 rxq);
1598 if (rc)
1599 return rc;
1600 }
1601
1602 dev->real_num_rx_queues = rxq;
1603 return 0;
1604}
1605EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1606#endif
1607
1608static inline void __netif_reschedule(struct Qdisc *q)
1609{
1610 struct softnet_data *sd;
1611 unsigned long flags;
1612
1613 local_irq_save(flags);
1614 sd = &__get_cpu_var(softnet_data);
1615 q->next_sched = NULL;
1616 *sd->output_queue_tailp = q;
1617 sd->output_queue_tailp = &q->next_sched;
1618 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1619 local_irq_restore(flags);
1620}
1621
1622void __netif_schedule(struct Qdisc *q)
1623{
1624 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1625 __netif_reschedule(q);
1626}
1627EXPORT_SYMBOL(__netif_schedule);
1628
1629void dev_kfree_skb_irq(struct sk_buff *skb)
1630{
1631 if (atomic_dec_and_test(&skb->users)) {
1632 struct softnet_data *sd;
1633 unsigned long flags;
1634
1635 local_irq_save(flags);
1636 sd = &__get_cpu_var(softnet_data);
1637 skb->next = sd->completion_queue;
1638 sd->completion_queue = skb;
1639 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1640 local_irq_restore(flags);
1641 }
1642}
1643EXPORT_SYMBOL(dev_kfree_skb_irq);
1644
1645void dev_kfree_skb_any(struct sk_buff *skb)
1646{
1647 if (in_irq() || irqs_disabled())
1648 dev_kfree_skb_irq(skb);
1649 else
1650 dev_kfree_skb(skb);
1651}
1652EXPORT_SYMBOL(dev_kfree_skb_any);
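/*
 * Illustrative sketch (not part of the original file): TX completion paths
 * may run in hard-IRQ context, so drivers free transmitted skbs with
 * dev_kfree_skb_any() (or dev_kfree_skb_irq() when the context is known).
 * The array-based cleanup below is a hypothetical simplification.
 */
static void example_free_tx_skbs(struct sk_buff **skbs, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		dev_kfree_skb_any(skbs[i]);	/* safe in IRQ and process context */
		skbs[i] = NULL;
	}
}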
1653
1654
1655/**
1656 * netif_device_detach - mark device as removed
1657 * @dev: network device
1658 *
1659 * Mark device as removed from system and therefore no longer available.
1660 */
1661void netif_device_detach(struct net_device *dev)
1662{
1663 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1664 netif_running(dev)) {
1665 netif_tx_stop_all_queues(dev);
1666 }
1667}
1668EXPORT_SYMBOL(netif_device_detach);
1669
1670/**
1671 * netif_device_attach - mark device as attached
1672 * @dev: network device
1673 *
1674 * Mark device as attached from system and restart if needed.
1675 */
1676void netif_device_attach(struct net_device *dev)
1677{
1678 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1679 netif_running(dev)) {
1680 netif_tx_wake_all_queues(dev);
1681 __netdev_watchdog_up(dev);
1682 }
1683}
1684EXPORT_SYMBOL(netif_device_attach);
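/*
 * Illustrative sketch (not part of the original file): the usual
 * suspend/resume pairing of netif_device_detach()/netif_device_attach().
 * The hardware-specific steps are omitted and the function names are
 * hypothetical.
 */
static int example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* mark !present, stop all TX queues */
	/* ... put the hardware to sleep ... */
	return 0;
}

static int example_resume(struct net_device *dev)
{
	/* ... wake the hardware up ... */
	netif_device_attach(dev);	/* mark present, restart queues and watchdog */
	return 0;
}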
1685
1686static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1687{
1688 return ((features & NETIF_F_NO_CSUM) ||
1689 ((features & NETIF_F_V4_CSUM) &&
1690 protocol == htons(ETH_P_IP)) ||
1691 ((features & NETIF_F_V6_CSUM) &&
1692 protocol == htons(ETH_P_IPV6)) ||
1693 ((features & NETIF_F_FCOE_CRC) &&
1694 protocol == htons(ETH_P_FCOE)));
1695}
1696
1697static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1698{
1699 __be16 protocol = skb->protocol;
1700 int features = dev->features;
1701
1702 if (vlan_tx_tag_present(skb)) {
1703 features &= dev->vlan_features;
1704 } else if (protocol == htons(ETH_P_8021Q)) {
1705 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1706 protocol = veh->h_vlan_encapsulated_proto;
1707 features &= dev->vlan_features;
1708 }
1709
1710 return can_checksum_protocol(features, protocol);
1711}
1712
1713/**
1714 * skb_dev_set -- assign a new device to a buffer
1715 * @skb: buffer for the new device
1716 * @dev: network device
1717 *
1718 * If an skb is owned by a device already, we have to reset
1719 * all data private to the namespace a device belongs to
1720 * before assigning it a new device.
1721 */
1722#ifdef CONFIG_NET_NS
1723void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1724{
1725 skb_dst_drop(skb);
1726 if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1727 secpath_reset(skb);
1728 nf_reset(skb);
1729 skb_init_secmark(skb);
1730 skb->mark = 0;
1731 skb->priority = 0;
1732 skb->nf_trace = 0;
1733 skb->ipvs_property = 0;
1734#ifdef CONFIG_NET_SCHED
1735 skb->tc_index = 0;
1736#endif
1737 }
1738 skb->dev = dev;
1739}
1740EXPORT_SYMBOL(skb_set_dev);
1741#endif /* CONFIG_NET_NS */
1742
1743/*
1744 * Invalidate hardware checksum when packet is to be mangled, and
1745 * complete checksum manually on outgoing path.
1746 */
1747int skb_checksum_help(struct sk_buff *skb)
1748{
1749 __wsum csum;
1750 int ret = 0, offset;
1751
1752 if (skb->ip_summed == CHECKSUM_COMPLETE)
1753 goto out_set_summed;
1754
1755 if (unlikely(skb_shinfo(skb)->gso_size)) {
1756 /* Let GSO fix up the checksum. */
1757 goto out_set_summed;
1758 }
1759
1760 offset = skb->csum_start - skb_headroom(skb);
1761 BUG_ON(offset >= skb_headlen(skb));
1762 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1763
1764 offset += skb->csum_offset;
1765 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1766
1767 if (skb_cloned(skb) &&
1768 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1769 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1770 if (ret)
1771 goto out;
1772 }
1773
1774 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1775out_set_summed:
1776 skb->ip_summed = CHECKSUM_NONE;
1777out:
1778 return ret;
1779}
1780EXPORT_SYMBOL(skb_checksum_help);
1781
1782/**
1783 * skb_gso_segment - Perform segmentation on skb.
1784 * @skb: buffer to segment
1785 * @features: features for the output path (see dev->features)
1786 *
1787 * This function segments the given skb and returns a list of segments.
1788 *
1789 * It may return NULL if the skb requires no segmentation. This is
1790 * only possible when GSO is used for verifying header integrity.
1791 */
1792struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1793{
1794 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1795 struct packet_type *ptype;
1796 __be16 type = skb->protocol;
1797 int err;
1798
1799 if (type == htons(ETH_P_8021Q)) {
1800 struct vlan_ethhdr *veh;
1801
1802 if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
1803 return ERR_PTR(-EINVAL);
1804
1805 veh = (struct vlan_ethhdr *)skb->data;
1806 type = veh->h_vlan_encapsulated_proto;
1807 }
1808
1809 skb_reset_mac_header(skb);
1810 skb->mac_len = skb->network_header - skb->mac_header;
1811 __skb_pull(skb, skb->mac_len);
1812
1813 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1814 struct net_device *dev = skb->dev;
1815 struct ethtool_drvinfo info = {};
1816
1817 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1818 dev->ethtool_ops->get_drvinfo(dev, &info);
1819
1820 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1821 info.driver, dev ? dev->features : 0L,
1822 skb->sk ? skb->sk->sk_route_caps : 0L,
1823 skb->len, skb->data_len, skb->ip_summed);
1824
1825 if (skb_header_cloned(skb) &&
1826 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1827 return ERR_PTR(err);
1828 }
1829
1830 rcu_read_lock();
1831 list_for_each_entry_rcu(ptype,
1832 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1833 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1834 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1835 err = ptype->gso_send_check(skb);
1836 segs = ERR_PTR(err);
1837 if (err || skb_gso_ok(skb, features))
1838 break;
1839 __skb_push(skb, (skb->data -
1840 skb_network_header(skb)));
1841 }
1842 segs = ptype->gso_segment(skb, features);
1843 break;
1844 }
1845 }
1846 rcu_read_unlock();
1847
1848 __skb_push(skb, skb->data - skb_mac_header(skb));
1849
1850 return segs;
1851}
1852EXPORT_SYMBOL(skb_gso_segment);
1853
1854/* Take action when hardware reception checksum errors are detected. */
1855#ifdef CONFIG_BUG
1856void netdev_rx_csum_fault(struct net_device *dev)
1857{
1858 if (net_ratelimit()) {
1859 printk(KERN_ERR "%s: hw csum failure.\n",
1860 dev ? dev->name : "<unknown>");
1861 dump_stack();
1862 }
1863}
1864EXPORT_SYMBOL(netdev_rx_csum_fault);
1865#endif
1866
1867/* Actually, we should eliminate this check as soon as we know that:
1868 * 1. An IOMMU is present and can map all of the memory.
1869 * 2. No high memory really exists on this machine.
1870 */
1871
1872static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1873{
1874#ifdef CONFIG_HIGHMEM
1875 int i;
1876 if (!(dev->features & NETIF_F_HIGHDMA)) {
1877 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1878 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1879 return 1;
1880 }
1881
1882 if (PCI_DMA_BUS_IS_PHYS) {
1883 struct device *pdev = dev->dev.parent;
1884
1885 if (!pdev)
1886 return 0;
1887 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1888 dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1889 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1890 return 1;
1891 }
1892 }
1893#endif
1894 return 0;
1895}
1896
1897struct dev_gso_cb {
1898 void (*destructor)(struct sk_buff *skb);
1899};
1900
1901#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1902
1903static void dev_gso_skb_destructor(struct sk_buff *skb)
1904{
1905 struct dev_gso_cb *cb;
1906
1907 do {
1908 struct sk_buff *nskb = skb->next;
1909
1910 skb->next = nskb->next;
1911 nskb->next = NULL;
1912 kfree_skb(nskb);
1913 } while (skb->next);
1914
1915 cb = DEV_GSO_CB(skb);
1916 if (cb->destructor)
1917 cb->destructor(skb);
1918}
1919
1920/**
1921 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1922 * @skb: buffer to segment
1923 *
1924 * This function segments the given skb and stores the list of segments
1925 * in skb->next.
1926 */
1927static int dev_gso_segment(struct sk_buff *skb)
1928{
1929 struct net_device *dev = skb->dev;
1930 struct sk_buff *segs;
1931 int features = dev->features & ~(illegal_highdma(dev, skb) ?
1932 NETIF_F_SG : 0);
1933
1934 segs = skb_gso_segment(skb, features);
1935
1936 /* Verifying header integrity only. */
1937 if (!segs)
1938 return 0;
1939
1940 if (IS_ERR(segs))
1941 return PTR_ERR(segs);
1942
1943 skb->next = segs;
1944 DEV_GSO_CB(skb)->destructor = skb->destructor;
1945 skb->destructor = dev_gso_skb_destructor;
1946
1947 return 0;
1948}
1949
1950/*
1951 * Try to orphan skb early, right before transmission by the device.
1952 * We cannot orphan skb if tx timestamp is requested or the sk-reference
1953 * is needed on driver level for other reasons, e.g. see net/can/raw.c
1954 */
1955static inline void skb_orphan_try(struct sk_buff *skb)
1956{
1957 struct sock *sk = skb->sk;
1958
1959 if (sk && !skb_shinfo(skb)->tx_flags) {
1960 /* skb_tx_hash() won't be able to get sk.
1961 * We copy sk_hash into skb->rxhash.
1962 */
1963 if (!skb->rxhash)
1964 skb->rxhash = sk->sk_hash;
1965 skb_orphan(skb);
1966 }
1967}
1968
1969/*
1970 * Returns true if either:
1971 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
1972 * 2. skb is fragmented and the device does not support SG, or if
1973 * at least one of the fragments is in highmem and the device does not
1974 * support DMA from it.
1975 */
1976static inline int skb_needs_linearize(struct sk_buff *skb,
1977 struct net_device *dev)
1978{
1979 int features = dev->features;
1980
1981 if (skb->protocol == htons(ETH_P_8021Q) || vlan_tx_tag_present(skb))
1982 features &= dev->vlan_features;
1983
1984 return skb_is_nonlinear(skb) &&
1985 ((skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST)) ||
1986 (skb_shinfo(skb)->nr_frags && (!(features & NETIF_F_SG) ||
1987 illegal_highdma(dev, skb))));
1988}
1989
1990int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1991 struct netdev_queue *txq)
1992{
1993 const struct net_device_ops *ops = dev->netdev_ops;
1994 int rc = NETDEV_TX_OK;
1995
1996 if (likely(!skb->next)) {
1997 if (!list_empty(&ptype_all))
1998 dev_queue_xmit_nit(skb, dev);
1999
2000 /*
2001 * If the device doesn't need skb->dst, release it right now while
2002 * it's hot in this CPU's cache.
2003 */
2004 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2005 skb_dst_drop(skb);
2006
2007 skb_orphan_try(skb);
2008
2009 if (vlan_tx_tag_present(skb) &&
2010 !(dev->features & NETIF_F_HW_VLAN_TX)) {
2011 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2012 if (unlikely(!skb))
2013 goto out;
2014
2015 skb->vlan_tci = 0;
2016 }
2017
2018 if (netif_needs_gso(dev, skb)) {
2019 if (unlikely(dev_gso_segment(skb)))
2020 goto out_kfree_skb;
2021 if (skb->next)
2022 goto gso;
2023 } else {
2024 if (skb_needs_linearize(skb, dev) &&
2025 __skb_linearize(skb))
2026 goto out_kfree_skb;
2027
2028 /* If packet is not checksummed and device does not
2029 * support checksumming for this protocol, complete
2030 * checksumming here.
2031 */
2032 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2033 skb_set_transport_header(skb, skb->csum_start -
2034 skb_headroom(skb));
2035 if (!dev_can_checksum(dev, skb) &&
2036 skb_checksum_help(skb))
2037 goto out_kfree_skb;
2038 }
2039 }
2040
2041 rc = ops->ndo_start_xmit(skb, dev);
2042 trace_net_dev_xmit(skb, rc);
2043 if (rc == NETDEV_TX_OK)
2044 txq_trans_update(txq);
2045 return rc;
2046 }
2047
2048gso:
2049 do {
2050 struct sk_buff *nskb = skb->next;
2051
2052 skb->next = nskb->next;
2053 nskb->next = NULL;
2054
2055 /*
2056 * If the device doesn't need nskb->dst, release it right now while
2057 * it's hot in this CPU's cache.
2058 */
2059 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2060 skb_dst_drop(nskb);
2061
2062 rc = ops->ndo_start_xmit(nskb, dev);
2063 trace_net_dev_xmit(nskb, rc);
2064 if (unlikely(rc != NETDEV_TX_OK)) {
2065 if (rc & ~NETDEV_TX_MASK)
2066 goto out_kfree_gso_skb;
2067 nskb->next = skb->next;
2068 skb->next = nskb;
2069 return rc;
2070 }
2071 txq_trans_update(txq);
2072 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2073 return NETDEV_TX_BUSY;
2074 } while (skb->next);
2075
2076out_kfree_gso_skb:
2077 if (likely(skb->next == NULL))
2078 skb->destructor = DEV_GSO_CB(skb)->destructor;
2079out_kfree_skb:
2080 kfree_skb(skb);
2081out:
2082 return rc;
2083}
2084
2085static u32 hashrnd __read_mostly;
2086
2087u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
2088{
2089 u32 hash;
2090
2091 if (skb_rx_queue_recorded(skb)) {
2092 hash = skb_get_rx_queue(skb);
2093 while (unlikely(hash >= dev->real_num_tx_queues))
2094 hash -= dev->real_num_tx_queues;
2095 return hash;
2096 }
2097
2098 if (skb->sk && skb->sk->sk_hash)
2099 hash = skb->sk->sk_hash;
2100 else
2101 hash = (__force u16) skb->protocol ^ skb->rxhash;
2102 hash = jhash_1word(hash, hashrnd);
2103
2104 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
2105}
2106EXPORT_SYMBOL(skb_tx_hash);
2107
2108static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2109{
2110 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2111 if (net_ratelimit()) {
2112 pr_warning("%s selects TX queue %d, but "
2113 "real number of TX queues is %d\n",
2114 dev->name, queue_index, dev->real_num_tx_queues);
2115 }
2116 return 0;
2117 }
2118 return queue_index;
2119}
2120
2121static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2122 struct sk_buff *skb)
2123{
2124 int queue_index;
2125 const struct net_device_ops *ops = dev->netdev_ops;
2126
2127 if (ops->ndo_select_queue) {
2128 queue_index = ops->ndo_select_queue(dev, skb);
2129 queue_index = dev_cap_txqueue(dev, queue_index);
2130 } else {
2131 struct sock *sk = skb->sk;
2132 queue_index = sk_tx_queue_get(sk);
2133 if (queue_index < 0 || queue_index >= dev->real_num_tx_queues) {
2134
2135 queue_index = 0;
2136 if (dev->real_num_tx_queues > 1)
2137 queue_index = skb_tx_hash(dev, skb);
2138
2139 if (sk) {
2140 struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);
2141
2142 if (dst && skb_dst(skb) == dst)
2143 sk_tx_queue_set(sk, queue_index);
2144 }
2145 }
2146 }
2147
2148 skb_set_queue_mapping(skb, queue_index);
2149 return netdev_get_tx_queue(dev, queue_index);
2150}
2151
2152static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2153 struct net_device *dev,
2154 struct netdev_queue *txq)
2155{
2156 spinlock_t *root_lock = qdisc_lock(q);
2157 bool contended = qdisc_is_running(q);
2158 int rc;
2159
2160 /*
2161 * Heuristic to force contended enqueues to serialize on a
2162 * separate lock before trying to get the qdisc main lock.
2163 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2164 * and dequeue packets faster.
2165 */
2166 if (unlikely(contended))
2167 spin_lock(&q->busylock);
2168
2169 spin_lock(root_lock);
2170 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2171 kfree_skb(skb);
2172 rc = NET_XMIT_DROP;
2173 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2174 qdisc_run_begin(q)) {
2175 /*
2176 * This is a work-conserving queue; there are no old skbs
2177 * waiting to be sent out; and the qdisc is not running -
2178 * xmit the skb directly.
2179 */
2180 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2181 skb_dst_force(skb);
2182 __qdisc_update_bstats(q, skb->len);
2183 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2184 if (unlikely(contended)) {
2185 spin_unlock(&q->busylock);
2186 contended = false;
2187 }
2188 __qdisc_run(q);
2189 } else
2190 qdisc_run_end(q);
2191
2192 rc = NET_XMIT_SUCCESS;
2193 } else {
2194 skb_dst_force(skb);
2195 rc = qdisc_enqueue_root(skb, q);
2196 if (qdisc_run_begin(q)) {
2197 if (unlikely(contended)) {
2198 spin_unlock(&q->busylock);
2199 contended = false;
2200 }
2201 __qdisc_run(q);
2202 }
2203 }
2204 spin_unlock(root_lock);
2205 if (unlikely(contended))
2206 spin_unlock(&q->busylock);
2207 return rc;
2208}
2209
2210static DEFINE_PER_CPU(int, xmit_recursion);
2211#define RECURSION_LIMIT 10
2212
2213/**
2214 * dev_queue_xmit - transmit a buffer
2215 * @skb: buffer to transmit
2216 *
2217 * Queue a buffer for transmission to a network device. The caller must
2218 * have set the device and priority and built the buffer before calling
2219 * this function. The function can be called from an interrupt.
2220 *
2221 * A negative errno code is returned on a failure. A success does not
2222 * guarantee the frame will be transmitted as it may be dropped due
2223 * to congestion or traffic shaping.
2224 *
2225 * -----------------------------------------------------------------------------------
2226 * I notice this method can also return errors from the queue disciplines,
2227 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2228 * be positive.
2229 *
2230 * Regardless of the return value, the skb is consumed, so it is currently
2231 * difficult to retry a send to this method. (You can bump the ref count
2232 * before sending to hold a reference for retry if you are careful.)
2233 *
2234 * When calling this method, interrupts MUST be enabled. This is because
2235 * the BH enable code must have IRQs enabled so that it will not deadlock.
2236 * --BLG
2237 */
2238int dev_queue_xmit(struct sk_buff *skb)
2239{
2240 struct net_device *dev = skb->dev;
2241 struct netdev_queue *txq;
2242 struct Qdisc *q;
2243 int rc = -ENOMEM;
2244
2245 /* Disable soft irqs for various locks below. Also
2246 * stops preemption for RCU.
2247 */
2248 rcu_read_lock_bh();
2249
2250 txq = dev_pick_tx(dev, skb);
2251 q = rcu_dereference_bh(txq->qdisc);
2252
2253#ifdef CONFIG_NET_CLS_ACT
2254 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2255#endif
2256 trace_net_dev_queue(skb);
2257 if (q->enqueue) {
2258 rc = __dev_xmit_skb(skb, q, dev, txq);
2259 goto out;
2260 }
2261
2262 /* The device has no queue. Common case for software devices:
2263 loopback and all sorts of tunnels...
2264
2265 Really, it is unlikely that netif_tx_lock protection is necessary
2266 here. (f.e. loopback and IP tunnels are clean, ignoring statistics
2267 counters.)
2268 However, it is possible that they rely on the protection
2269 we provide here.
2270
2271 Check this and take the lock. It is not prone to deadlocks.
2272 Or use the noqueue qdisc, it is even simpler 8)
2273 */
2274 if (dev->flags & IFF_UP) {
2275 int cpu = smp_processor_id(); /* ok because BHs are off */
2276
2277 if (txq->xmit_lock_owner != cpu) {
2278
2279 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2280 goto recursion_alert;
2281
2282 HARD_TX_LOCK(dev, txq, cpu);
2283
2284 if (!netif_tx_queue_stopped(txq)) {
2285 __this_cpu_inc(xmit_recursion);
2286 rc = dev_hard_start_xmit(skb, dev, txq);
2287 __this_cpu_dec(xmit_recursion);
2288 if (dev_xmit_complete(rc)) {
2289 HARD_TX_UNLOCK(dev, txq);
2290 goto out;
2291 }
2292 }
2293 HARD_TX_UNLOCK(dev, txq);
2294 if (net_ratelimit())
2295 printk(KERN_CRIT "Virtual device %s asks to "
2296 "queue packet!\n", dev->name);
2297 } else {
2298 /* Recursion is detected! It is possible,
2299 * unfortunately
2300 */
2301recursion_alert:
2302 if (net_ratelimit())
2303 printk(KERN_CRIT "Dead loop on virtual device "
2304 "%s, fix it urgently!\n", dev->name);
2305 }
2306 }
2307
2308 rc = -ENETDOWN;
2309 rcu_read_unlock_bh();
2310
2311 kfree_skb(skb);
2312 return rc;
2313out:
2314 rcu_read_unlock_bh();
2315 return rc;
2316}
2317EXPORT_SYMBOL(dev_queue_xmit);
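
/*
 * A minimal usage sketch (not from the original file): how a caller might
 * hand a locally built frame to dev_queue_xmit().  The device name "eth0",
 * the helper name and the assumption that "payload" already contains a
 * complete link-layer frame are hypothetical, for illustration only.
 */
static int example_xmit_raw_frame(struct net *net, const void *payload,
				  unsigned int len)
{
	struct net_device *dev;
	struct sk_buff *skb;

	dev = dev_get_by_name(net, "eth0");		/* hypothetical device */
	if (!dev)
		return -ENODEV;

	skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
	if (!skb) {
		dev_put(dev);
		return -ENOMEM;
	}
	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	memcpy(skb_put(skb, len), payload, len);

	/* The caller must set the device (and optionally the priority)
	 * before calling dev_queue_xmit(); skb->protocol would normally
	 * have been filled in by the protocol layer that built the frame.
	 */
	skb->dev = dev;
	skb->priority = 0;

	/* The skb is consumed regardless of the return value. */
	dev_queue_xmit(skb);

	dev_put(dev);
	return 0;
}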
2318
2319
2320/*=======================================================================
2321 Receiver routines
2322 =======================================================================*/
2323
2324int netdev_max_backlog __read_mostly = 1000;
2325int netdev_tstamp_prequeue __read_mostly = 1;
2326int netdev_budget __read_mostly = 300;
2327int weight_p __read_mostly = 64; /* old backlog weight */
2328
2329/* Called with irq disabled */
2330static inline void ____napi_schedule(struct softnet_data *sd,
2331 struct napi_struct *napi)
2332{
2333 list_add_tail(&napi->poll_list, &sd->poll_list);
2334 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2335}
2336
2337/*
2338 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2339 * and src/dst port numbers. Returns a non-zero hash number on success
2340 * and 0 on failure.
2341 */
2342__u32 __skb_get_rxhash(struct sk_buff *skb)
2343{
2344 int nhoff, hash = 0, poff;
2345 struct ipv6hdr *ip6;
2346 struct iphdr *ip;
2347 u8 ip_proto;
2348 u32 addr1, addr2, ihl;
2349 union {
2350 u32 v32;
2351 u16 v16[2];
2352 } ports;
2353
2354 nhoff = skb_network_offset(skb);
2355
2356 switch (skb->protocol) {
2357 case __constant_htons(ETH_P_IP):
2358 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2359 goto done;
2360
2361 ip = (struct iphdr *) (skb->data + nhoff);
2362 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2363 ip_proto = 0;
2364 else
2365 ip_proto = ip->protocol;
2366 addr1 = (__force u32) ip->saddr;
2367 addr2 = (__force u32) ip->daddr;
2368 ihl = ip->ihl;
2369 break;
2370 case __constant_htons(ETH_P_IPV6):
2371 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2372 goto done;
2373
2374 ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2375 ip_proto = ip6->nexthdr;
2376 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2377 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2378 ihl = (40 >> 2);
2379 break;
2380 default:
2381 goto done;
2382 }
2383
2384 ports.v32 = 0;
2385 poff = proto_ports_offset(ip_proto);
2386 if (poff >= 0) {
2387 nhoff += ihl * 4 + poff;
2388 if (pskb_may_pull(skb, nhoff + 4)) {
2389 ports.v32 = * (__force u32 *) (skb->data + nhoff);
2390 if (ports.v16[1] < ports.v16[0])
2391 swap(ports.v16[0], ports.v16[1]);
2392 }
2393 }
2394
2395 /* get a consistent hash (same value on both flow directions) */
2396 if (addr2 < addr1)
2397 swap(addr1, addr2);
2398
2399 hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2400 if (!hash)
2401 hash = 1;
2402
2403done:
2404 return hash;
2405}
2406EXPORT_SYMBOL(__skb_get_rxhash);
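
/*
 * A small sketch (not from the original file) of how receive-path code can
 * consume the flow hash computed above, e.g. to spread flows over a set of
 * buckets.  skb_get_rxhash() is the caching wrapper from <linux/skbuff.h>;
 * "nr_buckets" is a hypothetical parameter.
 */
static inline unsigned int example_pick_bucket(struct sk_buff *skb,
					       unsigned int nr_buckets)
{
	u32 hash = skb_get_rxhash(skb);	/* 0 if the headers were unparseable */

	/* Scale the 32-bit hash into [0, nr_buckets) without a modulus,
	 * the same trick skb_tx_hash() and get_rps_cpu() use in this file.
	 */
	return (unsigned int)(((u64)hash * nr_buckets) >> 32);
}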
2407
2408#ifdef CONFIG_RPS
2409
2410/* One global table that all flow-based protocols share. */
2411struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2412EXPORT_SYMBOL(rps_sock_flow_table);
2413
2414/*
2415 * get_rps_cpu is called from netif_receive_skb and returns the target
2416 * CPU from the RPS map of the receiving queue for a given skb.
2417 * rcu_read_lock must be held on entry.
2418 */
2419static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2420 struct rps_dev_flow **rflowp)
2421{
2422 struct netdev_rx_queue *rxqueue;
2423 struct rps_map *map;
2424 struct rps_dev_flow_table *flow_table;
2425 struct rps_sock_flow_table *sock_flow_table;
2426 int cpu = -1;
2427 u16 tcpu;
2428
2429 if (skb_rx_queue_recorded(skb)) {
2430 u16 index = skb_get_rx_queue(skb);
2431 if (unlikely(index >= dev->real_num_rx_queues)) {
2432 WARN_ONCE(dev->real_num_rx_queues > 1,
2433 "%s received packet on queue %u, but number "
2434 "of RX queues is %u\n",
2435 dev->name, index, dev->real_num_rx_queues);
2436 goto done;
2437 }
2438 rxqueue = dev->_rx + index;
2439 } else
2440 rxqueue = dev->_rx;
2441
2442 map = rcu_dereference(rxqueue->rps_map);
2443 if (map) {
2444 if (map->len == 1) {
2445 tcpu = map->cpus[0];
2446 if (cpu_online(tcpu))
2447 cpu = tcpu;
2448 goto done;
2449 }
2450 } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2451 goto done;
2452 }
2453
2454 skb_reset_network_header(skb);
2455 if (!skb_get_rxhash(skb))
2456 goto done;
2457
2458 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2459 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2460 if (flow_table && sock_flow_table) {
2461 u16 next_cpu;
2462 struct rps_dev_flow *rflow;
2463
2464 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2465 tcpu = rflow->cpu;
2466
2467 next_cpu = sock_flow_table->ents[skb->rxhash &
2468 sock_flow_table->mask];
2469
2470 /*
2471 * If the desired CPU (where last recvmsg was done) is
2472 * different from current CPU (one in the rx-queue flow
2473 * table entry), switch if one of the following holds:
2474 * - Current CPU is unset (equal to RPS_NO_CPU).
2475 * - Current CPU is offline.
2476 * - The current CPU's queue tail has advanced beyond the
2477 * last packet that was enqueued using this table entry.
2478 * This guarantees that all previous packets for the flow
2479 * have been dequeued, thus preserving in order delivery.
2480 */
2481 if (unlikely(tcpu != next_cpu) &&
2482 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2483 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2484 rflow->last_qtail)) >= 0)) {
2485 tcpu = rflow->cpu = next_cpu;
2486 if (tcpu != RPS_NO_CPU)
2487 rflow->last_qtail = per_cpu(softnet_data,
2488 tcpu).input_queue_head;
2489 }
2490 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2491 *rflowp = rflow;
2492 cpu = tcpu;
2493 goto done;
2494 }
2495 }
2496
2497 if (map) {
2498 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2499
2500 if (cpu_online(tcpu)) {
2501 cpu = tcpu;
2502 goto done;
2503 }
2504 }
2505
2506done:
2507 return cpu;
2508}
2509
2510/* Called from hardirq (IPI) context */
2511static void rps_trigger_softirq(void *data)
2512{
2513 struct softnet_data *sd = data;
2514
2515 ____napi_schedule(sd, &sd->backlog);
2516 sd->received_rps++;
2517}
2518
2519#endif /* CONFIG_RPS */
2520
2521/*
2522 * Check if this softnet_data structure belongs to another CPU.
2523 * If yes, queue it to our IPI list and return 1;
2524 * if no, return 0.
2525 */
2526static int rps_ipi_queued(struct softnet_data *sd)
2527{
2528#ifdef CONFIG_RPS
2529 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2530
2531 if (sd != mysd) {
2532 sd->rps_ipi_next = mysd->rps_ipi_list;
2533 mysd->rps_ipi_list = sd;
2534
2535 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2536 return 1;
2537 }
2538#endif /* CONFIG_RPS */
2539 return 0;
2540}
2541
2542/*
2543 * enqueue_to_backlog is called to queue an skb to a per-CPU backlog
2544 * queue (which may be a remote CPU's queue).
2545 */
2546static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2547 unsigned int *qtail)
2548{
2549 struct softnet_data *sd;
2550 unsigned long flags;
2551
2552 sd = &per_cpu(softnet_data, cpu);
2553
2554 local_irq_save(flags);
2555
2556 rps_lock(sd);
2557 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2558 if (skb_queue_len(&sd->input_pkt_queue)) {
2559enqueue:
2560 __skb_queue_tail(&sd->input_pkt_queue, skb);
2561 input_queue_tail_incr_save(sd, qtail);
2562 rps_unlock(sd);
2563 local_irq_restore(flags);
2564 return NET_RX_SUCCESS;
2565 }
2566
2567 /* Schedule NAPI for the backlog device.
2568 * We can use a non-atomic operation since we own the queue lock.
2569 */
2570 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2571 if (!rps_ipi_queued(sd))
2572 ____napi_schedule(sd, &sd->backlog);
2573 }
2574 goto enqueue;
2575 }
2576
2577 sd->dropped++;
2578 rps_unlock(sd);
2579
2580 local_irq_restore(flags);
2581
2582 atomic_long_inc(&skb->dev->rx_dropped);
2583 kfree_skb(skb);
2584 return NET_RX_DROP;
2585}
2586
2587/**
2588 * netif_rx - post buffer to the network code
2589 * @skb: buffer to post
2590 *
2591 * This function receives a packet from a device driver and queues it for
2592 * the upper (protocol) levels to process. It always succeeds. The buffer
2593 * may be dropped during processing for congestion control or by the
2594 * protocol layers.
2595 *
2596 * return values:
2597 * NET_RX_SUCCESS (no congestion)
2598 * NET_RX_DROP (packet was dropped)
2599 *
2600 */
2601
2602int netif_rx(struct sk_buff *skb)
2603{
2604 int ret;
2605
2606 /* if netpoll wants it, pretend we never saw it */
2607 if (netpoll_rx(skb))
2608 return NET_RX_DROP;
2609
2610 if (netdev_tstamp_prequeue)
2611 net_timestamp_check(skb);
2612
2613 trace_netif_rx(skb);
2614#ifdef CONFIG_RPS
2615 {
2616 struct rps_dev_flow voidflow, *rflow = &voidflow;
2617 int cpu;
2618
2619 preempt_disable();
2620 rcu_read_lock();
2621
2622 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2623 if (cpu < 0)
2624 cpu = smp_processor_id();
2625
2626 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2627
2628 rcu_read_unlock();
2629 preempt_enable();
2630 }
2631#else
2632 {
2633 unsigned int qtail;
2634 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2635 put_cpu();
2636 }
2637#endif
2638 return ret;
2639}
2640EXPORT_SYMBOL(netif_rx);
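
/*
 * A minimal sketch (not from the original file) of the classic non-NAPI
 * receive path: a hypothetical driver copies a received frame out of its
 * hardware and queues it with netif_rx() from its interrupt handler.
 * "example_rx_priv" and its fields are assumptions for illustration only.
 */
struct example_rx_priv {
	struct net_device *dev;
};

static void example_isr_rx(struct example_rx_priv *priv,
			   const void *data, unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(priv->dev, len);
	if (!skb) {
		priv->dev->stats.rx_dropped++;
		return;
	}
	memcpy(skb_put(skb, len), data, len);
	skb->protocol = eth_type_trans(skb, priv->dev);

	/* Queue to the per-CPU backlog; safe from hard interrupt context.
	 * From process context, netif_rx_ni() should be used instead.
	 */
	netif_rx(skb);

	priv->dev->stats.rx_packets++;
	priv->dev->stats.rx_bytes += len;
}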
2641
2642int netif_rx_ni(struct sk_buff *skb)
2643{
2644 int err;
2645
2646 preempt_disable();
2647 err = netif_rx(skb);
2648 if (local_softirq_pending())
2649 do_softirq();
2650 preempt_enable();
2651
2652 return err;
2653}
2654EXPORT_SYMBOL(netif_rx_ni);
2655
2656static void net_tx_action(struct softirq_action *h)
2657{
2658 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2659
2660 if (sd->completion_queue) {
2661 struct sk_buff *clist;
2662
2663 local_irq_disable();
2664 clist = sd->completion_queue;
2665 sd->completion_queue = NULL;
2666 local_irq_enable();
2667
2668 while (clist) {
2669 struct sk_buff *skb = clist;
2670 clist = clist->next;
2671
2672 WARN_ON(atomic_read(&skb->users));
2673 trace_kfree_skb(skb, net_tx_action);
2674 __kfree_skb(skb);
2675 }
2676 }
2677
2678 if (sd->output_queue) {
2679 struct Qdisc *head;
2680
2681 local_irq_disable();
2682 head = sd->output_queue;
2683 sd->output_queue = NULL;
2684 sd->output_queue_tailp = &sd->output_queue;
2685 local_irq_enable();
2686
2687 while (head) {
2688 struct Qdisc *q = head;
2689 spinlock_t *root_lock;
2690
2691 head = head->next_sched;
2692
2693 root_lock = qdisc_lock(q);
2694 if (spin_trylock(root_lock)) {
2695 smp_mb__before_clear_bit();
2696 clear_bit(__QDISC_STATE_SCHED,
2697 &q->state);
2698 qdisc_run(q);
2699 spin_unlock(root_lock);
2700 } else {
2701 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2702 &q->state)) {
2703 __netif_reschedule(q);
2704 } else {
2705 smp_mb__before_clear_bit();
2706 clear_bit(__QDISC_STATE_SCHED,
2707 &q->state);
2708 }
2709 }
2710 }
2711 }
2712}
2713
2714static inline int deliver_skb(struct sk_buff *skb,
2715 struct packet_type *pt_prev,
2716 struct net_device *orig_dev)
2717{
2718 atomic_inc(&skb->users);
2719 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2720}
2721
2722#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2723 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2724/* This hook is defined here for ATM LANE */
2725int (*br_fdb_test_addr_hook)(struct net_device *dev,
2726 unsigned char *addr) __read_mostly;
2727EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2728#endif
2729
2730#ifdef CONFIG_NET_CLS_ACT
2731 /* TODO: Maybe we should just force sch_ingress to be compiled in
2732 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for a few useless
2733 * instructions (a compare and two extra stores) when it is not
2734 * enabled but CONFIG_NET_CLS_ACT is.
2735 * NOTE: This doesn't stop any functionality; if you don't have
2736 * the ingress scheduler, you just can't add policies on ingress.
2737 *
2738 */
2739static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2740{
2741 struct net_device *dev = skb->dev;
2742 u32 ttl = G_TC_RTTL(skb->tc_verd);
2743 int result = TC_ACT_OK;
2744 struct Qdisc *q;
2745
2746 if (unlikely(MAX_RED_LOOP < ttl++)) {
2747 if (net_ratelimit())
2748 pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
2749 skb->skb_iif, dev->ifindex);
2750 return TC_ACT_SHOT;
2751 }
2752
2753 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2754 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2755
2756 q = rxq->qdisc;
2757 if (q != &noop_qdisc) {
2758 spin_lock(qdisc_lock(q));
2759 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2760 result = qdisc_enqueue_root(skb, q);
2761 spin_unlock(qdisc_lock(q));
2762 }
2763
2764 return result;
2765}
2766
2767static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2768 struct packet_type **pt_prev,
2769 int *ret, struct net_device *orig_dev)
2770{
2771 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
2772
2773 if (!rxq || rxq->qdisc == &noop_qdisc)
2774 goto out;
2775
2776 if (*pt_prev) {
2777 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2778 *pt_prev = NULL;
2779 }
2780
2781 switch (ing_filter(skb, rxq)) {
2782 case TC_ACT_SHOT:
2783 case TC_ACT_STOLEN:
2784 kfree_skb(skb);
2785 return NULL;
2786 }
2787
2788out:
2789 skb->tc_verd = 0;
2790 return skb;
2791}
2792#endif
2793
2794/**
2795 * netdev_rx_handler_register - register receive handler
2796 * @dev: device to register a handler for
2797 * @rx_handler: receive handler to register
2798 * @rx_handler_data: data pointer that is used by rx handler
2799 *
2800 * Register a receive handler for a device. This handler will then be
2801 * called from __netif_receive_skb. A negative errno code is returned
2802 * on a failure.
2803 *
2804 * The caller must hold the rtnl_mutex.
2805 */
2806int netdev_rx_handler_register(struct net_device *dev,
2807 rx_handler_func_t *rx_handler,
2808 void *rx_handler_data)
2809{
2810 ASSERT_RTNL();
2811
2812 if (dev->rx_handler)
2813 return -EBUSY;
2814
2815 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
2816 rcu_assign_pointer(dev->rx_handler, rx_handler);
2817
2818 return 0;
2819}
2820EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
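
/*
 * A sketch (not from the original file) of the shape of an rx_handler, as
 * used by bridge and macvlan.  This hypothetical handler only counts frames
 * through a counter passed as rx_handler_data and lets every packet continue
 * up the stack.
 */
static struct sk_buff *example_rx_handler(struct sk_buff *skb)
{
	atomic_long_t *counter = rcu_dereference(skb->dev->rx_handler_data);

	if (counter)
		atomic_long_inc(counter);

	/* Returning the skb lets __netif_receive_skb() keep processing it;
	 * return NULL instead if the handler consumed the packet.
	 */
	return skb;
}

/* Registration and removal are done under RTNL, e.g.:
 *
 *	rtnl_lock();
 *	err = netdev_rx_handler_register(dev, example_rx_handler, &counter);
 *	...
 *	netdev_rx_handler_unregister(dev);
 *	rtnl_unlock();
 */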
2821
2822/**
2823 * netdev_rx_handler_unregister - unregister receive handler
2824 * @dev: device to unregister a handler from
2825 *
2826 * Unregister a receive handler from a device.
2827 *
2828 * The caller must hold the rtnl_mutex.
2829 */
2830void netdev_rx_handler_unregister(struct net_device *dev)
2831{
2832
2833 ASSERT_RTNL();
2834 rcu_assign_pointer(dev->rx_handler, NULL);
2835 rcu_assign_pointer(dev->rx_handler_data, NULL);
2836}
2837EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2838
2839static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2840 struct net_device *master)
2841{
2842 if (skb->pkt_type == PACKET_HOST) {
2843 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2844
2845 memcpy(dest, master->dev_addr, ETH_ALEN);
2846 }
2847}
2848
2849/* On bonding slaves other than the currently active slave, suppress
2850 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2851 * ARP on active-backup slaves with arp_validate enabled.
2852 */
2853int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2854{
2855 struct net_device *dev = skb->dev;
2856
2857 if (master->priv_flags & IFF_MASTER_ARPMON)
2858 dev->last_rx = jiffies;
2859
2860 if ((master->priv_flags & IFF_MASTER_ALB) &&
2861 (master->priv_flags & IFF_BRIDGE_PORT)) {
2862 /* Do address unmangling. The local destination address
2863 * will always be the one the master has. This provides the right
2864 * functionality in a bridge.
2865 */
2866 skb_bond_set_mac_by_master(skb, master);
2867 }
2868
2869 if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2870 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2871 skb->protocol == __cpu_to_be16(ETH_P_ARP))
2872 return 0;
2873
2874 if (master->priv_flags & IFF_MASTER_ALB) {
2875 if (skb->pkt_type != PACKET_BROADCAST &&
2876 skb->pkt_type != PACKET_MULTICAST)
2877 return 0;
2878 }
2879 if (master->priv_flags & IFF_MASTER_8023AD &&
2880 skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2881 return 0;
2882
2883 return 1;
2884 }
2885 return 0;
2886}
2887EXPORT_SYMBOL(__skb_bond_should_drop);
2888
2889static int __netif_receive_skb(struct sk_buff *skb)
2890{
2891 struct packet_type *ptype, *pt_prev;
2892 rx_handler_func_t *rx_handler;
2893 struct net_device *orig_dev;
2894 struct net_device *master;
2895 struct net_device *null_or_orig;
2896 struct net_device *orig_or_bond;
2897 int ret = NET_RX_DROP;
2898 __be16 type;
2899
2900 if (!netdev_tstamp_prequeue)
2901 net_timestamp_check(skb);
2902
2903 trace_netif_receive_skb(skb);
2904
2905 /* if we've gotten here through NAPI, check netpoll */
2906 if (netpoll_receive_skb(skb))
2907 return NET_RX_DROP;
2908
2909 if (!skb->skb_iif)
2910 skb->skb_iif = skb->dev->ifindex;
2911
2912 /*
2913 * bonding note: skbs received on inactive slaves should only
2914 * be delivered to pkt handlers that are exact matches. Also
2915 * the deliver_no_wcard flag will be set. If packet handlers
2916 * are sensitive to duplicate packets these skbs will need to
2917 * be dropped at the handler.
2918 */
2919 null_or_orig = NULL;
2920 orig_dev = skb->dev;
2921 master = ACCESS_ONCE(orig_dev->master);
2922 if (skb->deliver_no_wcard)
2923 null_or_orig = orig_dev;
2924 else if (master) {
2925 if (skb_bond_should_drop(skb, master)) {
2926 skb->deliver_no_wcard = 1;
2927 null_or_orig = orig_dev; /* deliver only exact match */
2928 } else
2929 skb->dev = master;
2930 }
2931
2932 __this_cpu_inc(softnet_data.processed);
2933 skb_reset_network_header(skb);
2934 skb_reset_transport_header(skb);
2935 skb->mac_len = skb->network_header - skb->mac_header;
2936
2937 pt_prev = NULL;
2938
2939 rcu_read_lock();
2940
2941#ifdef CONFIG_NET_CLS_ACT
2942 if (skb->tc_verd & TC_NCLS) {
2943 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2944 goto ncls;
2945 }
2946#endif
2947
2948 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2949 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2950 ptype->dev == orig_dev) {
2951 if (pt_prev)
2952 ret = deliver_skb(skb, pt_prev, orig_dev);
2953 pt_prev = ptype;
2954 }
2955 }
2956
2957#ifdef CONFIG_NET_CLS_ACT
2958 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2959 if (!skb)
2960 goto out;
2961ncls:
2962#endif
2963
2964 /* Handle special case of bridge or macvlan */
2965 rx_handler = rcu_dereference(skb->dev->rx_handler);
2966 if (rx_handler) {
2967 if (pt_prev) {
2968 ret = deliver_skb(skb, pt_prev, orig_dev);
2969 pt_prev = NULL;
2970 }
2971 skb = rx_handler(skb);
2972 if (!skb)
2973 goto out;
2974 }
2975
2976 if (vlan_tx_tag_present(skb)) {
2977 if (pt_prev) {
2978 ret = deliver_skb(skb, pt_prev, orig_dev);
2979 pt_prev = NULL;
2980 }
2981 if (vlan_hwaccel_do_receive(&skb)) {
2982 ret = __netif_receive_skb(skb);
2983 goto out;
2984 } else if (unlikely(!skb))
2985 goto out;
2986 }
2987
2988 /*
2989 * Make sure frames received on VLAN interfaces stacked on
2990 * bonding interfaces still make their way to any base bonding
2991 * device that may have registered for a specific ptype. The
2992 * handler may have to adjust skb->dev and orig_dev.
2993 */
2994 orig_or_bond = orig_dev;
2995 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
2996 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
2997 orig_or_bond = vlan_dev_real_dev(skb->dev);
2998 }
2999
3000 type = skb->protocol;
3001 list_for_each_entry_rcu(ptype,
3002 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3003 if (ptype->type == type && (ptype->dev == null_or_orig ||
3004 ptype->dev == skb->dev || ptype->dev == orig_dev ||
3005 ptype->dev == orig_or_bond)) {
3006 if (pt_prev)
3007 ret = deliver_skb(skb, pt_prev, orig_dev);
3008 pt_prev = ptype;
3009 }
3010 }
3011
3012 if (pt_prev) {
3013 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3014 } else {
3015 atomic_long_inc(&skb->dev->rx_dropped);
3016 kfree_skb(skb);
3017 /* Jamal, now you will not be able to escape explaining
3018 * to me how you were going to use this. :-)
3019 */
3020 ret = NET_RX_DROP;
3021 }
3022
3023out:
3024 rcu_read_unlock();
3025 return ret;
3026}
3027
3028/**
3029 * netif_receive_skb - process receive buffer from network
3030 * @skb: buffer to process
3031 *
3032 * netif_receive_skb() is the main receive data processing function.
3033 * It always succeeds. The buffer may be dropped during processing
3034 * for congestion control or by the protocol layers.
3035 *
3036 * This function may only be called from softirq context and interrupts
3037 * should be enabled.
3038 *
3039 * Return values (usually ignored):
3040 * NET_RX_SUCCESS: no congestion
3041 * NET_RX_DROP: packet was dropped
3042 */
3043int netif_receive_skb(struct sk_buff *skb)
3044{
3045 if (netdev_tstamp_prequeue)
3046 net_timestamp_check(skb);
3047
3048 if (skb_defer_rx_timestamp(skb))
3049 return NET_RX_SUCCESS;
3050
3051#ifdef CONFIG_RPS
3052 {
3053 struct rps_dev_flow voidflow, *rflow = &voidflow;
3054 int cpu, ret;
3055
3056 rcu_read_lock();
3057
3058 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3059
3060 if (cpu >= 0) {
3061 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3062 rcu_read_unlock();
3063 } else {
3064 rcu_read_unlock();
3065 ret = __netif_receive_skb(skb);
3066 }
3067
3068 return ret;
3069 }
3070#else
3071 return __netif_receive_skb(skb);
3072#endif
3073}
3074EXPORT_SYMBOL(netif_receive_skb);
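
/*
 * A short sketch (not from the original file): delivering one frame from a
 * hypothetical NAPI poll routine.  Unlike netif_rx(), netif_receive_skb()
 * runs the protocol handlers directly from softirq context.
 */
static void example_deliver_frame(struct net_device *dev,
				  const void *data, unsigned int len)
{
	struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);

	if (!skb)
		return;

	memcpy(skb_put(skb, len), data, len);
	skb->protocol = eth_type_trans(skb, dev);

	/* Must be called from softirq context with interrupts enabled;
	 * GRO-capable drivers would call napi_gro_receive() instead.
	 */
	netif_receive_skb(skb);
}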
3075
3076/* Network device is going away, flush any packets still pending
3077 * Called with irqs disabled.
3078 */
3079static void flush_backlog(void *arg)
3080{
3081 struct net_device *dev = arg;
3082 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3083 struct sk_buff *skb, *tmp;
3084
3085 rps_lock(sd);
3086 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3087 if (skb->dev == dev) {
3088 __skb_unlink(skb, &sd->input_pkt_queue);
3089 kfree_skb(skb);
3090 input_queue_head_incr(sd);
3091 }
3092 }
3093 rps_unlock(sd);
3094
3095 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3096 if (skb->dev == dev) {
3097 __skb_unlink(skb, &sd->process_queue);
3098 kfree_skb(skb);
3099 input_queue_head_incr(sd);
3100 }
3101 }
3102}
3103
3104static int napi_gro_complete(struct sk_buff *skb)
3105{
3106 struct packet_type *ptype;
3107 __be16 type = skb->protocol;
3108 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3109 int err = -ENOENT;
3110
3111 if (NAPI_GRO_CB(skb)->count == 1) {
3112 skb_shinfo(skb)->gso_size = 0;
3113 goto out;
3114 }
3115
3116 rcu_read_lock();
3117 list_for_each_entry_rcu(ptype, head, list) {
3118 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3119 continue;
3120
3121 err = ptype->gro_complete(skb);
3122 break;
3123 }
3124 rcu_read_unlock();
3125
3126 if (err) {
3127 WARN_ON(&ptype->list == head);
3128 kfree_skb(skb);
3129 return NET_RX_SUCCESS;
3130 }
3131
3132out:
3133 return netif_receive_skb(skb);
3134}
3135
3136inline void napi_gro_flush(struct napi_struct *napi)
3137{
3138 struct sk_buff *skb, *next;
3139
3140 for (skb = napi->gro_list; skb; skb = next) {
3141 next = skb->next;
3142 skb->next = NULL;
3143 napi_gro_complete(skb);
3144 }
3145
3146 napi->gro_count = 0;
3147 napi->gro_list = NULL;
3148}
3149EXPORT_SYMBOL(napi_gro_flush);
3150
3151enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3152{
3153 struct sk_buff **pp = NULL;
3154 struct packet_type *ptype;
3155 __be16 type = skb->protocol;
3156 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3157 int same_flow;
3158 int mac_len;
3159 enum gro_result ret;
3160
3161 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3162 goto normal;
3163
3164 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3165 goto normal;
3166
3167 rcu_read_lock();
3168 list_for_each_entry_rcu(ptype, head, list) {
3169 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3170 continue;
3171
3172 skb_set_network_header(skb, skb_gro_offset(skb));
3173 mac_len = skb->network_header - skb->mac_header;
3174 skb->mac_len = mac_len;
3175 NAPI_GRO_CB(skb)->same_flow = 0;
3176 NAPI_GRO_CB(skb)->flush = 0;
3177 NAPI_GRO_CB(skb)->free = 0;
3178
3179 pp = ptype->gro_receive(&napi->gro_list, skb);
3180 break;
3181 }
3182 rcu_read_unlock();
3183
3184 if (&ptype->list == head)
3185 goto normal;
3186
3187 same_flow = NAPI_GRO_CB(skb)->same_flow;
3188 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3189
3190 if (pp) {
3191 struct sk_buff *nskb = *pp;
3192
3193 *pp = nskb->next;
3194 nskb->next = NULL;
3195 napi_gro_complete(nskb);
3196 napi->gro_count--;
3197 }
3198
3199 if (same_flow)
3200 goto ok;
3201
3202 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3203 goto normal;
3204
3205 napi->gro_count++;
3206 NAPI_GRO_CB(skb)->count = 1;
3207 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3208 skb->next = napi->gro_list;
3209 napi->gro_list = skb;
3210 ret = GRO_HELD;
3211
3212pull:
3213 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3214 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3215
3216 BUG_ON(skb->end - skb->tail < grow);
3217
3218 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3219
3220 skb->tail += grow;
3221 skb->data_len -= grow;
3222
3223 skb_shinfo(skb)->frags[0].page_offset += grow;
3224 skb_shinfo(skb)->frags[0].size -= grow;
3225
3226 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3227 put_page(skb_shinfo(skb)->frags[0].page);
3228 memmove(skb_shinfo(skb)->frags,
3229 skb_shinfo(skb)->frags + 1,
3230 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3231 }
3232 }
3233
3234ok:
3235 return ret;
3236
3237normal:
3238 ret = GRO_NORMAL;
3239 goto pull;
3240}
3241EXPORT_SYMBOL(dev_gro_receive);
3242
3243static inline gro_result_t
3244__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3245{
3246 struct sk_buff *p;
3247
3248 for (p = napi->gro_list; p; p = p->next) {
3249 unsigned long diffs;
3250
3251 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3252 diffs |= p->vlan_tci ^ skb->vlan_tci;
3253 diffs |= compare_ether_header(skb_mac_header(p),
3254 skb_gro_mac_header(skb));
3255 NAPI_GRO_CB(p)->same_flow = !diffs;
3256 NAPI_GRO_CB(p)->flush = 0;
3257 }
3258
3259 return dev_gro_receive(napi, skb);
3260}
3261
3262gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3263{
3264 switch (ret) {
3265 case GRO_NORMAL:
3266 if (netif_receive_skb(skb))
3267 ret = GRO_DROP;
3268 break;
3269
3270 case GRO_DROP:
3271 case GRO_MERGED_FREE:
3272 kfree_skb(skb);
3273 break;
3274
3275 case GRO_HELD:
3276 case GRO_MERGED:
3277 break;
3278 }
3279
3280 return ret;
3281}
3282EXPORT_SYMBOL(napi_skb_finish);
3283
3284void skb_gro_reset_offset(struct sk_buff *skb)
3285{
3286 NAPI_GRO_CB(skb)->data_offset = 0;
3287 NAPI_GRO_CB(skb)->frag0 = NULL;
3288 NAPI_GRO_CB(skb)->frag0_len = 0;
3289
3290 if (skb->mac_header == skb->tail &&
3291 !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3292 NAPI_GRO_CB(skb)->frag0 =
3293 page_address(skb_shinfo(skb)->frags[0].page) +
3294 skb_shinfo(skb)->frags[0].page_offset;
3295 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3296 }
3297}
3298EXPORT_SYMBOL(skb_gro_reset_offset);
3299
3300gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3301{
3302 skb_gro_reset_offset(skb);
3303
3304 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3305}
3306EXPORT_SYMBOL(napi_gro_receive);
3307
3308static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3309{
3310 __skb_pull(skb, skb_headlen(skb));
3311 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3312 skb->vlan_tci = 0;
3313
3314 napi->skb = skb;
3315}
3316
3317struct sk_buff *napi_get_frags(struct napi_struct *napi)
3318{
3319 struct sk_buff *skb = napi->skb;
3320
3321 if (!skb) {
3322 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3323 if (skb)
3324 napi->skb = skb;
3325 }
3326 return skb;
3327}
3328EXPORT_SYMBOL(napi_get_frags);
3329
3330gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3331 gro_result_t ret)
3332{
3333 switch (ret) {
3334 case GRO_NORMAL:
3335 case GRO_HELD:
3336 skb->protocol = eth_type_trans(skb, skb->dev);
3337
3338 if (ret == GRO_HELD)
3339 skb_gro_pull(skb, -ETH_HLEN);
3340 else if (netif_receive_skb(skb))
3341 ret = GRO_DROP;
3342 break;
3343
3344 case GRO_DROP:
3345 case GRO_MERGED_FREE:
3346 napi_reuse_skb(napi, skb);
3347 break;
3348
3349 case GRO_MERGED:
3350 break;
3351 }
3352
3353 return ret;
3354}
3355EXPORT_SYMBOL(napi_frags_finish);
3356
3357struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3358{
3359 struct sk_buff *skb = napi->skb;
3360 struct ethhdr *eth;
3361 unsigned int hlen;
3362 unsigned int off;
3363
3364 napi->skb = NULL;
3365
3366 skb_reset_mac_header(skb);
3367 skb_gro_reset_offset(skb);
3368
3369 off = skb_gro_offset(skb);
3370 hlen = off + sizeof(*eth);
3371 eth = skb_gro_header_fast(skb, off);
3372 if (skb_gro_header_hard(skb, hlen)) {
3373 eth = skb_gro_header_slow(skb, hlen, off);
3374 if (unlikely(!eth)) {
3375 napi_reuse_skb(napi, skb);
3376 skb = NULL;
3377 goto out;
3378 }
3379 }
3380
3381 skb_gro_pull(skb, sizeof(*eth));
3382
3383 /*
3384 * This works because the only protocols we care about don't require
3385 * special handling. We'll fix it up properly at the end.
3386 */
3387 skb->protocol = eth->h_proto;
3388
3389out:
3390 return skb;
3391}
3392EXPORT_SYMBOL(napi_frags_skb);
3393
3394gro_result_t napi_gro_frags(struct napi_struct *napi)
3395{
3396 struct sk_buff *skb = napi_frags_skb(napi);
3397
3398 if (!skb)
3399 return GRO_DROP;
3400
3401 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3402}
3403EXPORT_SYMBOL(napi_gro_frags);
3404
3405/*
3406 * net_rps_action sends any pending IPIs for RPS.
3407 * Note: called with local irq disabled, but exits with local irq enabled.
3408 */
3409static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3410{
3411#ifdef CONFIG_RPS
3412 struct softnet_data *remsd = sd->rps_ipi_list;
3413
3414 if (remsd) {
3415 sd->rps_ipi_list = NULL;
3416
3417 local_irq_enable();
3418
3419 /* Send pending IPIs to kick RPS processing on remote CPUs. */
3420 while (remsd) {
3421 struct softnet_data *next = remsd->rps_ipi_next;
3422
3423 if (cpu_online(remsd->cpu))
3424 __smp_call_function_single(remsd->cpu,
3425 &remsd->csd, 0);
3426 remsd = next;
3427 }
3428 } else
3429#endif
3430 local_irq_enable();
3431}
3432
3433static int process_backlog(struct napi_struct *napi, int quota)
3434{
3435 int work = 0;
3436 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3437
3438#ifdef CONFIG_RPS
3439 /* Check if we have pending IPIs; it's better to send them now
3440 * rather than waiting for net_rx_action() to end.
3441 */
3442 if (sd->rps_ipi_list) {
3443 local_irq_disable();
3444 net_rps_action_and_irq_enable(sd);
3445 }
3446#endif
3447 napi->weight = weight_p;
3448 local_irq_disable();
3449 while (work < quota) {
3450 struct sk_buff *skb;
3451 unsigned int qlen;
3452
3453 while ((skb = __skb_dequeue(&sd->process_queue))) {
3454 local_irq_enable();
3455 __netif_receive_skb(skb);
3456 local_irq_disable();
3457 input_queue_head_incr(sd);
3458 if (++work >= quota) {
3459 local_irq_enable();
3460 return work;
3461 }
3462 }
3463
3464 rps_lock(sd);
3465 qlen = skb_queue_len(&sd->input_pkt_queue);
3466 if (qlen)
3467 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3468 &sd->process_queue);
3469
3470 if (qlen < quota - work) {
3471 /*
3472 * Inline a custom version of __napi_complete().
3473 * Only the current CPU owns and manipulates this napi,
3474 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3475 * so we can use a plain write instead of clear_bit(),
3476 * and we don't need an smp_mb() memory barrier.
3477 */
3478 list_del(&napi->poll_list);
3479 napi->state = 0;
3480
3481 quota = work + qlen;
3482 }
3483 rps_unlock(sd);
3484 }
3485 local_irq_enable();
3486
3487 return work;
3488}
3489
3490/**
3491 * __napi_schedule - schedule for receive
3492 * @n: entry to schedule
3493 *
3494 * The entry's receive function will be scheduled to run
3495 */
3496void __napi_schedule(struct napi_struct *n)
3497{
3498 unsigned long flags;
3499
3500 local_irq_save(flags);
3501 ____napi_schedule(&__get_cpu_var(softnet_data), n);
3502 local_irq_restore(flags);
3503}
3504EXPORT_SYMBOL(__napi_schedule);
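
/*
 * A sketch (not from the original file) of the canonical NAPI pattern built
 * on __napi_schedule(), napi_complete() and netif_napi_add().  The private
 * structure, the helper names and the weight of 64 are assumptions for
 * illustration; real drivers also mask/unmask their RX interrupts around
 * these calls.
 */
struct example_napi_priv {
	struct napi_struct napi;
	struct net_device *dev;
};

/* Called from the device's hard interrupt handler, after masking further
 * RX interrupts in hardware (device specific).
 */
static void example_rx_irq(struct example_napi_priv *priv)
{
	if (napi_schedule_prep(&priv->napi))
		__napi_schedule(&priv->napi);
}

static int example_poll(struct napi_struct *napi, int budget)
{
	int work_done = 0;

	/* ... pull up to "budget" packets from the RX ring and feed them to
	 * netif_receive_skb()/napi_gro_receive(), counting work_done ...
	 */

	if (work_done < budget) {
		napi_complete(napi);
		/* Re-enable RX interrupts in hardware here. */
	}
	return work_done;
}

/* In the driver's probe/setup path:
 *
 *	netif_napi_add(dev, &priv->napi, example_poll, 64);
 *
 * followed by napi_enable(&priv->napi) when the device is opened.
 */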
3505
3506void __napi_complete(struct napi_struct *n)
3507{
3508 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3509 BUG_ON(n->gro_list);
3510
3511 list_del(&n->poll_list);
3512 smp_mb__before_clear_bit();
3513 clear_bit(NAPI_STATE_SCHED, &n->state);
3514}
3515EXPORT_SYMBOL(__napi_complete);
3516
3517void napi_complete(struct napi_struct *n)
3518{
3519 unsigned long flags;
3520
3521 /*
3522 * Don't let napi dequeue from the CPU poll list,
3523 * just in case it's running on a different CPU.
3524 */
3525 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3526 return;
3527
3528 napi_gro_flush(n);
3529 local_irq_save(flags);
3530 __napi_complete(n);
3531 local_irq_restore(flags);
3532}
3533EXPORT_SYMBOL(napi_complete);
3534
3535void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3536 int (*poll)(struct napi_struct *, int), int weight)
3537{
3538 INIT_LIST_HEAD(&napi->poll_list);
3539 napi->gro_count = 0;
3540 napi->gro_list = NULL;
3541 napi->skb = NULL;
3542 napi->poll = poll;
3543 napi->weight = weight;
3544 list_add(&napi->dev_list, &dev->napi_list);
3545 napi->dev = dev;
3546#ifdef CONFIG_NETPOLL
3547 spin_lock_init(&napi->poll_lock);
3548 napi->poll_owner = -1;
3549#endif
3550 set_bit(NAPI_STATE_SCHED, &napi->state);
3551}
3552EXPORT_SYMBOL(netif_napi_add);
3553
3554void netif_napi_del(struct napi_struct *napi)
3555{
3556 struct sk_buff *skb, *next;
3557
3558 list_del_init(&napi->dev_list);
3559 napi_free_frags(napi);
3560
3561 for (skb = napi->gro_list; skb; skb = next) {
3562 next = skb->next;
3563 skb->next = NULL;
3564 kfree_skb(skb);
3565 }
3566
3567 napi->gro_list = NULL;
3568 napi->gro_count = 0;
3569}
3570EXPORT_SYMBOL(netif_napi_del);
3571
3572static void net_rx_action(struct softirq_action *h)
3573{
3574 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3575 unsigned long time_limit = jiffies + 2;
3576 int budget = netdev_budget;
3577 void *have;
3578
3579 local_irq_disable();
3580
3581 while (!list_empty(&sd->poll_list)) {
3582 struct napi_struct *n;
3583 int work, weight;
3584
3585 /* If the softirq window is exhausted then punt.
3586 * Allow this to run for 2 jiffies, which allows
3587 * an average latency of 1.5/HZ.
3588 */
3589 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3590 goto softnet_break;
3591
3592 local_irq_enable();
3593
3594 /* Even though interrupts have been re-enabled, this
3595 * access is safe because interrupts can only add new
3596 * entries to the tail of this list, and only ->poll()
3597 * calls can remove this head entry from the list.
3598 */
3599 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3600
3601 have = netpoll_poll_lock(n);
3602
3603 weight = n->weight;
3604
3605 /* This NAPI_STATE_SCHED test is for avoiding a race
3606 * with netpoll's poll_napi(). Only the entity which
3607 * obtains the lock and sees NAPI_STATE_SCHED set will
3608 * actually make the ->poll() call. Therefore we avoid
3609 * accidentally calling ->poll() when NAPI is not scheduled.
3610 */
3611 work = 0;
3612 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3613 work = n->poll(n, weight);
3614 trace_napi_poll(n);
3615 }
3616
3617 WARN_ON_ONCE(work > weight);
3618
3619 budget -= work;
3620
3621 local_irq_disable();
3622
3623 /* Drivers must not modify the NAPI state if they
3624 * consume the entire weight. In such cases this code
3625 * still "owns" the NAPI instance and therefore can
3626 * move the instance around on the list at-will.
3627 */
3628 if (unlikely(work == weight)) {
3629 if (unlikely(napi_disable_pending(n))) {
3630 local_irq_enable();
3631 napi_complete(n);
3632 local_irq_disable();
3633 } else
3634 list_move_tail(&n->poll_list, &sd->poll_list);
3635 }
3636
3637 netpoll_poll_unlock(have);
3638 }
3639out:
3640 net_rps_action_and_irq_enable(sd);
3641
3642#ifdef CONFIG_NET_DMA
3643 /*
3644 * There may not be any more sk_buffs coming right now, so push
3645 * any pending DMA copies to hardware
3646 */
3647 dma_issue_pending_all();
3648#endif
3649
3650 return;
3651
3652softnet_break:
3653 sd->time_squeeze++;
3654 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3655 goto out;
3656}
3657
3658static gifconf_func_t *gifconf_list[NPROTO];
3659
3660/**
3661 * register_gifconf - register a SIOCGIF handler
3662 * @family: Address family
3663 * @gifconf: Function handler
3664 *
3665 * Register protocol dependent address dumping routines. The handler
3666 * that is passed must not be freed or reused until it has been replaced
3667 * by another handler.
3668 */
3669int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3670{
3671 if (family >= NPROTO)
3672 return -EINVAL;
3673 gifconf_list[family] = gifconf;
3674 return 0;
3675}
3676EXPORT_SYMBOL(register_gifconf);
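
/*
 * A sketch (not from the original file) of what a gifconf handler looks
 * like, loosely modelled on inet_gifconf().  "PF_EXAMPLE" and the
 * single-entry behaviour are hypothetical, for illustration only.
 */
static int example_gifconf(struct net_device *dev, char __user *buf, int len)
{
	struct ifreq ifr;

	/* A NULL buffer means dev_ifconf() only wants the size we need. */
	if (!buf)
		return sizeof(ifr);

	if (len < (int) sizeof(ifr))
		return 0;

	memset(&ifr, 0, sizeof(ifr));
	strcpy(ifr.ifr_name, dev->name);
	if (copy_to_user(buf, &ifr, sizeof(ifr)))
		return -EFAULT;

	return sizeof(ifr);	/* bytes written for this device */
}

/* Registered once at protocol init time, e.g.:
 *	register_gifconf(PF_EXAMPLE, example_gifconf);
 */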
3677
3678
3679/*
3680 * Map an interface index to its name (SIOCGIFNAME)
3681 */
3682
3683/*
3684 * We need this ioctl for efficient implementation of the
3685 * if_indextoname() function required by the IPv6 API. Without
3686 * it, we would have to search all the interfaces to find a
3687 * match. --pb
3688 */
3689
3690static int dev_ifname(struct net *net, struct ifreq __user *arg)
3691{
3692 struct net_device *dev;
3693 struct ifreq ifr;
3694
3695 /*
3696 * Fetch the caller's info block.
3697 */
3698
3699 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3700 return -EFAULT;
3701
3702 rcu_read_lock();
3703 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3704 if (!dev) {
3705 rcu_read_unlock();
3706 return -ENODEV;
3707 }
3708
3709 strcpy(ifr.ifr_name, dev->name);
3710 rcu_read_unlock();
3711
3712 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3713 return -EFAULT;
3714 return 0;
3715}
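
/*
 * A user-space sketch (not from the original file) of how this ioctl is
 * consumed, which is essentially what if_indextoname(3) does:
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	ifr.ifr_ifindex = 1;			// index to resolve
 *	if (fd >= 0 && ioctl(fd, SIOCGIFNAME, &ifr) == 0)
 *		printf("ifindex 1 is %s\n", ifr.ifr_name);
 *	close(fd);
 */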
3716
3717/*
3718 * Perform a SIOCGIFCONF call. This structure will change
3719 * size eventually, and there is nothing I can do about it.
3720 * Thus we will need a 'compatibility mode'.
3721 */
3722
3723static int dev_ifconf(struct net *net, char __user *arg)
3724{
3725 struct ifconf ifc;
3726 struct net_device *dev;
3727 char __user *pos;
3728 int len;
3729 int total;
3730 int i;
3731
3732 /*
3733 * Fetch the caller's info block.
3734 */
3735
3736 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3737 return -EFAULT;
3738
3739 pos = ifc.ifc_buf;
3740 len = ifc.ifc_len;
3741
3742 /*
3743 * Loop over the interfaces, and write an info block for each.
3744 */
3745
3746 total = 0;
3747 for_each_netdev(net, dev) {
3748 for (i = 0; i < NPROTO; i++) {
3749 if (gifconf_list[i]) {
3750 int done;
3751 if (!pos)
3752 done = gifconf_list[i](dev, NULL, 0);
3753 else
3754 done = gifconf_list[i](dev, pos + total,
3755 len - total);
3756 if (done < 0)
3757 return -EFAULT;
3758 total += done;
3759 }
3760 }
3761 }
3762
3763 /*
3764 * All done. Write the updated control block back to the caller.
3765 */
3766 ifc.ifc_len = total;
3767
3768 /*
3769 * Both BSD and Solaris return 0 here, so we do too.
3770 */
3771 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3772}
3773
3774#ifdef CONFIG_PROC_FS
3775/*
3776 * This is invoked by the /proc filesystem handler to display a device
3777 * in detail.
3778 */
3779void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3780 __acquires(RCU)
3781{
3782 struct net *net = seq_file_net(seq);
3783 loff_t off;
3784 struct net_device *dev;
3785
3786 rcu_read_lock();
3787 if (!*pos)
3788 return SEQ_START_TOKEN;
3789
3790 off = 1;
3791 for_each_netdev_rcu(net, dev)
3792 if (off++ == *pos)
3793 return dev;
3794
3795 return NULL;
3796}
3797
3798void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3799{
3800 struct net_device *dev = (v == SEQ_START_TOKEN) ?
3801 first_net_device(seq_file_net(seq)) :
3802 next_net_device((struct net_device *)v);
3803
3804 ++*pos;
3805 return rcu_dereference(dev);
3806}
3807
3808void dev_seq_stop(struct seq_file *seq, void *v)
3809 __releases(RCU)
3810{
3811 rcu_read_unlock();
3812}
3813
3814static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3815{
3816 struct rtnl_link_stats64 temp;
3817 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
3818
3819 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3820 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
3821 dev->name, stats->rx_bytes, stats->rx_packets,
3822 stats->rx_errors,
3823 stats->rx_dropped + stats->rx_missed_errors,
3824 stats->rx_fifo_errors,
3825 stats->rx_length_errors + stats->rx_over_errors +
3826 stats->rx_crc_errors + stats->rx_frame_errors,
3827 stats->rx_compressed, stats->multicast,
3828 stats->tx_bytes, stats->tx_packets,
3829 stats->tx_errors, stats->tx_dropped,
3830 stats->tx_fifo_errors, stats->collisions,
3831 stats->tx_carrier_errors +
3832 stats->tx_aborted_errors +
3833 stats->tx_window_errors +
3834 stats->tx_heartbeat_errors,
3835 stats->tx_compressed);
3836}
3837
3838/*
3839 * Called from the PROCfs module. This now uses the new arbitrarily sized
3840 * /proc/net interface to create /proc/net/dev.
3841 */
3842static int dev_seq_show(struct seq_file *seq, void *v)
3843{
3844 if (v == SEQ_START_TOKEN)
3845 seq_puts(seq, "Inter-| Receive "
3846 " | Transmit\n"
3847 " face |bytes packets errs drop fifo frame "
3848 "compressed multicast|bytes packets errs "
3849 "drop fifo colls carrier compressed\n");
3850 else
3851 dev_seq_printf_stats(seq, v);
3852 return 0;
3853}
3854
3855static struct softnet_data *softnet_get_online(loff_t *pos)
3856{
3857 struct softnet_data *sd = NULL;
3858
3859 while (*pos < nr_cpu_ids)
3860 if (cpu_online(*pos)) {
3861 sd = &per_cpu(softnet_data, *pos);
3862 break;
3863 } else
3864 ++*pos;
3865 return sd;
3866}
3867
3868static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3869{
3870 return softnet_get_online(pos);
3871}
3872
3873static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3874{
3875 ++*pos;
3876 return softnet_get_online(pos);
3877}
3878
3879static void softnet_seq_stop(struct seq_file *seq, void *v)
3880{
3881}
3882
3883static int softnet_seq_show(struct seq_file *seq, void *v)
3884{
3885 struct softnet_data *sd = v;
3886
3887 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3888 sd->processed, sd->dropped, sd->time_squeeze, 0,
3889 0, 0, 0, 0, /* was fastroute */
3890 sd->cpu_collision, sd->received_rps);
3891 return 0;
3892}
3893
3894static const struct seq_operations dev_seq_ops = {
3895 .start = dev_seq_start,
3896 .next = dev_seq_next,
3897 .stop = dev_seq_stop,
3898 .show = dev_seq_show,
3899};
3900
3901static int dev_seq_open(struct inode *inode, struct file *file)
3902{
3903 return seq_open_net(inode, file, &dev_seq_ops,
3904 sizeof(struct seq_net_private));
3905}
3906
3907static const struct file_operations dev_seq_fops = {
3908 .owner = THIS_MODULE,
3909 .open = dev_seq_open,
3910 .read = seq_read,
3911 .llseek = seq_lseek,
3912 .release = seq_release_net,
3913};
3914
3915static const struct seq_operations softnet_seq_ops = {
3916 .start = softnet_seq_start,
3917 .next = softnet_seq_next,
3918 .stop = softnet_seq_stop,
3919 .show = softnet_seq_show,
3920};
3921
3922static int softnet_seq_open(struct inode *inode, struct file *file)
3923{
3924 return seq_open(file, &softnet_seq_ops);
3925}
3926
3927static const struct file_operations softnet_seq_fops = {
3928 .owner = THIS_MODULE,
3929 .open = softnet_seq_open,
3930 .read = seq_read,
3931 .llseek = seq_lseek,
3932 .release = seq_release,
3933};
3934
3935static void *ptype_get_idx(loff_t pos)
3936{
3937 struct packet_type *pt = NULL;
3938 loff_t i = 0;
3939 int t;
3940
3941 list_for_each_entry_rcu(pt, &ptype_all, list) {
3942 if (i == pos)
3943 return pt;
3944 ++i;
3945 }
3946
3947 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3948 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3949 if (i == pos)
3950 return pt;
3951 ++i;
3952 }
3953 }
3954 return NULL;
3955}
3956
3957static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3958 __acquires(RCU)
3959{
3960 rcu_read_lock();
3961 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3962}
3963
3964static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3965{
3966 struct packet_type *pt;
3967 struct list_head *nxt;
3968 int hash;
3969
3970 ++*pos;
3971 if (v == SEQ_START_TOKEN)
3972 return ptype_get_idx(0);
3973
3974 pt = v;
3975 nxt = pt->list.next;
3976 if (pt->type == htons(ETH_P_ALL)) {
3977 if (nxt != &ptype_all)
3978 goto found;
3979 hash = 0;
3980 nxt = ptype_base[0].next;
3981 } else
3982 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3983
3984 while (nxt == &ptype_base[hash]) {
3985 if (++hash >= PTYPE_HASH_SIZE)
3986 return NULL;
3987 nxt = ptype_base[hash].next;
3988 }
3989found:
3990 return list_entry(nxt, struct packet_type, list);
3991}
3992
3993static void ptype_seq_stop(struct seq_file *seq, void *v)
3994 __releases(RCU)
3995{
3996 rcu_read_unlock();
3997}
3998
3999static int ptype_seq_show(struct seq_file *seq, void *v)
4000{
4001 struct packet_type *pt = v;
4002
4003 if (v == SEQ_START_TOKEN)
4004 seq_puts(seq, "Type Device Function\n");
4005 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4006 if (pt->type == htons(ETH_P_ALL))
4007 seq_puts(seq, "ALL ");
4008 else
4009 seq_printf(seq, "%04x", ntohs(pt->type));
4010
4011 seq_printf(seq, " %-8s %pF\n",
4012 pt->dev ? pt->dev->name : "", pt->func);
4013 }
4014
4015 return 0;
4016}
4017
4018static const struct seq_operations ptype_seq_ops = {
4019 .start = ptype_seq_start,
4020 .next = ptype_seq_next,
4021 .stop = ptype_seq_stop,
4022 .show = ptype_seq_show,
4023};
4024
4025static int ptype_seq_open(struct inode *inode, struct file *file)
4026{
4027 return seq_open_net(inode, file, &ptype_seq_ops,
4028 sizeof(struct seq_net_private));
4029}
4030
4031static const struct file_operations ptype_seq_fops = {
4032 .owner = THIS_MODULE,
4033 .open = ptype_seq_open,
4034 .read = seq_read,
4035 .llseek = seq_lseek,
4036 .release = seq_release_net,
4037};
4038
4039
4040static int __net_init dev_proc_net_init(struct net *net)
4041{
4042 int rc = -ENOMEM;
4043
4044 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4045 goto out;
4046 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4047 goto out_dev;
4048 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4049 goto out_softnet;
4050
4051 if (wext_proc_init(net))
4052 goto out_ptype;
4053 rc = 0;
4054out:
4055 return rc;
4056out_ptype:
4057 proc_net_remove(net, "ptype");
4058out_softnet:
4059 proc_net_remove(net, "softnet_stat");
4060out_dev:
4061 proc_net_remove(net, "dev");
4062 goto out;
4063}
4064
4065static void __net_exit dev_proc_net_exit(struct net *net)
4066{
4067 wext_proc_exit(net);
4068
4069 proc_net_remove(net, "ptype");
4070 proc_net_remove(net, "softnet_stat");
4071 proc_net_remove(net, "dev");
4072}
4073
4074static struct pernet_operations __net_initdata dev_proc_ops = {
4075 .init = dev_proc_net_init,
4076 .exit = dev_proc_net_exit,
4077};
4078
4079static int __init dev_proc_init(void)
4080{
4081 return register_pernet_subsys(&dev_proc_ops);
4082}
4083#else
4084#define dev_proc_init() 0
4085#endif /* CONFIG_PROC_FS */
4086
4087
4088/**
4089 * netdev_set_master - set up master/slave pair
4090 * @slave: slave device
4091 * @master: new master device
4092 *
4093 * Changes the master device of the slave. Pass %NULL to break the
4094 * bonding. The caller must hold the RTNL semaphore. On a failure
4095 * a negative errno code is returned. On success the reference counts
4096 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4097 * function returns zero.
4098 */
4099int netdev_set_master(struct net_device *slave, struct net_device *master)
4100{
4101 struct net_device *old = slave->master;
4102
4103 ASSERT_RTNL();
4104
4105 if (master) {
4106 if (old)
4107 return -EBUSY;
4108 dev_hold(master);
4109 }
4110
4111 slave->master = master;
4112
4113 if (old) {
4114 synchronize_net();
4115 dev_put(old);
4116 }
4117 if (master)
4118 slave->flags |= IFF_SLAVE;
4119 else
4120 slave->flags &= ~IFF_SLAVE;
4121
4122 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4123 return 0;
4124}
4125EXPORT_SYMBOL(netdev_set_master);
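
/*
 * Illustrative sketch (hypothetical bonding-style driver code): how a
 * master/slave driver might pair and unpair devices with
 * netdev_set_master().  The function names here are made up; the caller
 * is assumed to hold the RTNL semaphore, as required above.
 */
static int __maybe_unused example_enslave(struct net_device *bond_dev,
					  struct net_device *slave_dev)
{
	int err;

	ASSERT_RTNL();

	err = netdev_set_master(slave_dev, bond_dev);
	if (err)
		return err;	/* -EBUSY if the slave already has a master */

	/* driver specific slave setup would follow here */
	return 0;
}

static void __maybe_unused example_release(struct net_device *slave_dev)
{
	ASSERT_RTNL();
	netdev_set_master(slave_dev, NULL);	/* break the pairing */
}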
4126
4127static void dev_change_rx_flags(struct net_device *dev, int flags)
4128{
4129 const struct net_device_ops *ops = dev->netdev_ops;
4130
4131 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4132 ops->ndo_change_rx_flags(dev, flags);
4133}
4134
4135static int __dev_set_promiscuity(struct net_device *dev, int inc)
4136{
4137 unsigned short old_flags = dev->flags;
4138 uid_t uid;
4139 gid_t gid;
4140
4141 ASSERT_RTNL();
4142
4143 dev->flags |= IFF_PROMISC;
4144 dev->promiscuity += inc;
4145 if (dev->promiscuity == 0) {
4146 /*
4147 * Avoid overflow.
4148 * If inc caused the counter to overflow, leave promisc untouched and return an error.
4149 */
4150 if (inc < 0)
4151 dev->flags &= ~IFF_PROMISC;
4152 else {
4153 dev->promiscuity -= inc;
4154 printk(KERN_WARNING "%s: promiscuity counter overflowed, "
4155 "set promiscuity failed; promiscuity feature "
4156 "of the device might be broken.\n", dev->name);
4157 return -EOVERFLOW;
4158 }
4159 }
4160 if (dev->flags != old_flags) {
4161 printk(KERN_INFO "device %s %s promiscuous mode\n",
4162 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4163 "left");
4164 if (audit_enabled) {
4165 current_uid_gid(&uid, &gid);
4166 audit_log(current->audit_context, GFP_ATOMIC,
4167 AUDIT_ANOM_PROMISCUOUS,
4168 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4169 dev->name, (dev->flags & IFF_PROMISC),
4170 (old_flags & IFF_PROMISC),
4171 audit_get_loginuid(current),
4172 uid, gid,
4173 audit_get_sessionid(current));
4174 }
4175
4176 dev_change_rx_flags(dev, IFF_PROMISC);
4177 }
4178 return 0;
4179}
4180
4181/**
4182 * dev_set_promiscuity - update promiscuity count on a device
4183 * @dev: device
4184 * @inc: modifier
4185 *
4186 * Add or remove promiscuity from a device. While the count in the device
4187 * remains above zero the interface remains promiscuous. Once it hits zero
4188 * the device reverts back to normal filtering operation. A negative inc
4189 * value is used to drop promiscuity on the device.
4190 * Return 0 if successful or a negative errno code on error.
4191 */
4192int dev_set_promiscuity(struct net_device *dev, int inc)
4193{
4194 unsigned short old_flags = dev->flags;
4195 int err;
4196
4197 err = __dev_set_promiscuity(dev, inc);
4198 if (err < 0)
4199 return err;
4200 if (dev->flags != old_flags)
4201 dev_set_rx_mode(dev);
4202 return err;
4203}
4204EXPORT_SYMBOL(dev_set_promiscuity);
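
/*
 * Illustrative sketch (hypothetical capture-style user): promiscuity is a
 * reference count, so every +1 passed to dev_set_promiscuity() must later
 * be balanced by a -1.  __dev_set_promiscuity() asserts the RTNL semaphore,
 * hence the locking here.
 */
static int __maybe_unused example_capture_bind(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);	/* take one promiscuity reference */
	rtnl_unlock();

	return err;
}

static void __maybe_unused example_capture_unbind(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);		/* drop our reference again */
	rtnl_unlock();
}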
4205
4206/**
4207 * dev_set_allmulti - update allmulti count on a device
4208 * @dev: device
4209 * @inc: modifier
4210 *
4211 * Add or remove reception of all multicast frames to a device. While the
4212 * count in the device remains above zero the interface remains listening
4213 * to all multicast frames. Once it hits zero the device reverts back to normal
4214 * filtering operation. A negative @inc value is used to drop the counter
4215 * when releasing a resource needing all multicasts.
4216 * Return 0 if successful or a negative errno code on error.
4217 */
4218
4219int dev_set_allmulti(struct net_device *dev, int inc)
4220{
4221 unsigned short old_flags = dev->flags;
4222
4223 ASSERT_RTNL();
4224
4225 dev->flags |= IFF_ALLMULTI;
4226 dev->allmulti += inc;
4227 if (dev->allmulti == 0) {
4228 /*
4229 * Avoid overflow.
4230 * If inc caused the counter to overflow, leave allmulti untouched and return an error.
4231 */
4232 if (inc < 0)
4233 dev->flags &= ~IFF_ALLMULTI;
4234 else {
4235 dev->allmulti -= inc;
4236 printk(KERN_WARNING "%s: allmulti counter overflowed, "
4237 "set allmulti failed; the allmulti feature of the "
4238 "device might be broken.\n", dev->name);
4239 return -EOVERFLOW;
4240 }
4241 }
4242 if (dev->flags ^ old_flags) {
4243 dev_change_rx_flags(dev, IFF_ALLMULTI);
4244 dev_set_rx_mode(dev);
4245 }
4246 return 0;
4247}
4248EXPORT_SYMBOL(dev_set_allmulti);
4249
4250/*
4251 * Upload unicast and multicast address lists to device and
4252 * configure RX filtering. When the device doesn't support unicast
4253 * filtering it is put in promiscuous mode while unicast addresses
4254 * are present.
4255 */
4256void __dev_set_rx_mode(struct net_device *dev)
4257{
4258 const struct net_device_ops *ops = dev->netdev_ops;
4259
4260 /* dev_open will call this function so the list will stay sane. */
4261 if (!(dev->flags&IFF_UP))
4262 return;
4263
4264 if (!netif_device_present(dev))
4265 return;
4266
4267 if (ops->ndo_set_rx_mode)
4268 ops->ndo_set_rx_mode(dev);
4269 else {
4270 /* Unicast address changes may only happen under the rtnl,
4271 * therefore calling __dev_set_promiscuity here is safe.
4272 */
4273 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4274 __dev_set_promiscuity(dev, 1);
4275 dev->uc_promisc = 1;
4276 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4277 __dev_set_promiscuity(dev, -1);
4278 dev->uc_promisc = 0;
4279 }
4280
4281 if (ops->ndo_set_multicast_list)
4282 ops->ndo_set_multicast_list(dev);
4283 }
4284}
4285
4286void dev_set_rx_mode(struct net_device *dev)
4287{
4288 netif_addr_lock_bh(dev);
4289 __dev_set_rx_mode(dev);
4290 netif_addr_unlock_bh(dev);
4291}
4292
4293/**
4294 * dev_get_flags - get flags reported to userspace
4295 * @dev: device
4296 *
4297 * Get the combination of flag bits exported through APIs to userspace.
4298 */
4299unsigned dev_get_flags(const struct net_device *dev)
4300{
4301 unsigned flags;
4302
4303 flags = (dev->flags & ~(IFF_PROMISC |
4304 IFF_ALLMULTI |
4305 IFF_RUNNING |
4306 IFF_LOWER_UP |
4307 IFF_DORMANT)) |
4308 (dev->gflags & (IFF_PROMISC |
4309 IFF_ALLMULTI));
4310
4311 if (netif_running(dev)) {
4312 if (netif_oper_up(dev))
4313 flags |= IFF_RUNNING;
4314 if (netif_carrier_ok(dev))
4315 flags |= IFF_LOWER_UP;
4316 if (netif_dormant(dev))
4317 flags |= IFF_DORMANT;
4318 }
4319
4320 return flags;
4321}
4322EXPORT_SYMBOL(dev_get_flags);
4323
4324int __dev_change_flags(struct net_device *dev, unsigned int flags)
4325{
4326 int old_flags = dev->flags;
4327 int ret;
4328
4329 ASSERT_RTNL();
4330
4331 /*
4332 * Set the flags on our device.
4333 */
4334
4335 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4336 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4337 IFF_AUTOMEDIA)) |
4338 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4339 IFF_ALLMULTI));
4340
4341 /*
4342 * Load in the correct multicast list now the flags have changed.
4343 */
4344
4345 if ((old_flags ^ flags) & IFF_MULTICAST)
4346 dev_change_rx_flags(dev, IFF_MULTICAST);
4347
4348 dev_set_rx_mode(dev);
4349
4350 /*
4351 * Have we downed the interface? We handle IFF_UP ourselves
4352 * according to user attempts to set it, rather than blindly
4353 * setting it.
4354 */
4355
4356 ret = 0;
4357 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
4358 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4359
4360 if (!ret)
4361 dev_set_rx_mode(dev);
4362 }
4363
4364 if ((flags ^ dev->gflags) & IFF_PROMISC) {
4365 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4366
4367 dev->gflags ^= IFF_PROMISC;
4368 dev_set_promiscuity(dev, inc);
4369 }
4370
4371 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4372 is important. Some (broken) drivers set IFF_PROMISC when
4373 IFF_ALLMULTI is requested, without asking us and without reporting it.
4374 */
4375 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4376 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4377
4378 dev->gflags ^= IFF_ALLMULTI;
4379 dev_set_allmulti(dev, inc);
4380 }
4381
4382 return ret;
4383}
4384
4385void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4386{
4387 unsigned int changes = dev->flags ^ old_flags;
4388
4389 if (changes & IFF_UP) {
4390 if (dev->flags & IFF_UP)
4391 call_netdevice_notifiers(NETDEV_UP, dev);
4392 else
4393 call_netdevice_notifiers(NETDEV_DOWN, dev);
4394 }
4395
4396 if (dev->flags & IFF_UP &&
4397 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4398 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4399}
4400
4401/**
4402 * dev_change_flags - change device settings
4403 * @dev: device
4404 * @flags: device state flags
4405 *
4406 * Change settings on a device based on the given state flags. The flags are
4407 * in the userspace exported format.
4408 */
4409int dev_change_flags(struct net_device *dev, unsigned flags)
4410{
4411 int ret, changes;
4412 int old_flags = dev->flags;
4413
4414 ret = __dev_change_flags(dev, flags);
4415 if (ret < 0)
4416 return ret;
4417
4418 changes = old_flags ^ dev->flags;
4419 if (changes)
4420 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4421
4422 __dev_notify_flags(dev, old_flags);
4423 return ret;
4424}
4425EXPORT_SYMBOL(dev_change_flags);
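
/*
 * Illustrative sketch (hypothetical helper): bringing an interface
 * administratively up from kernel code by turning on IFF_UP through
 * dev_change_flags().  dev_change_flags() is called under the RTNL
 * semaphore elsewhere in this file, so the sketch takes it too.
 */
static int __maybe_unused example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev->flags | IFF_UP);
	rtnl_unlock();

	return err;
}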
4426
4427/**
4428 * dev_set_mtu - Change maximum transfer unit
4429 * @dev: device
4430 * @new_mtu: new transfer unit
4431 *
4432 * Change the maximum transfer size of the network device.
4433 */
4434int dev_set_mtu(struct net_device *dev, int new_mtu)
4435{
4436 const struct net_device_ops *ops = dev->netdev_ops;
4437 int err;
4438
4439 if (new_mtu == dev->mtu)
4440 return 0;
4441
4442 /* MTU must be positive. */
4443 if (new_mtu < 0)
4444 return -EINVAL;
4445
4446 if (!netif_device_present(dev))
4447 return -ENODEV;
4448
4449 err = 0;
4450 if (ops->ndo_change_mtu)
4451 err = ops->ndo_change_mtu(dev, new_mtu);
4452 else
4453 dev->mtu = new_mtu;
4454
4455 if (!err && dev->flags & IFF_UP)
4456 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4457 return err;
4458}
4459EXPORT_SYMBOL(dev_set_mtu);
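
/*
 * Illustrative sketch (hypothetical helper, 9000 is an arbitrary value):
 * changing the MTU from kernel code.  dev_set_mtu() is called under the
 * RTNL semaphore elsewhere in this file, so the sketch takes it too; the
 * driver's ndo_change_mtu() may still reject the requested size.
 */
static int __maybe_unused example_set_jumbo_mtu(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 9000);
	rtnl_unlock();

	return err;
}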
4460
4461/**
4462 * dev_set_mac_address - Change Media Access Control Address
4463 * @dev: device
4464 * @sa: new address
4465 *
4466 * Change the hardware (MAC) address of the device
4467 */
4468int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4469{
4470 const struct net_device_ops *ops = dev->netdev_ops;
4471 int err;
4472
4473 if (!ops->ndo_set_mac_address)
4474 return -EOPNOTSUPP;
4475 if (sa->sa_family != dev->type)
4476 return -EINVAL;
4477 if (!netif_device_present(dev))
4478 return -ENODEV;
4479 err = ops->ndo_set_mac_address(dev, sa);
4480 if (!err)
4481 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4482 return err;
4483}
4484EXPORT_SYMBOL(dev_set_mac_address);
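
/*
 * Illustrative sketch (hypothetical helper): programming a new hardware
 * address.  The sockaddr carries the address in sa_data and the device
 * type (e.g. ARPHRD_ETHER) in sa_family, matching the checks above.
 */
static int __maybe_unused example_set_mac(struct net_device *dev,
					  const unsigned char *addr)
{
	struct sockaddr sa;
	int err;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, addr, dev->addr_len);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();

	return err;
}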
4485
4486/*
4487 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4488 */
4489static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4490{
4491 int err;
4492 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4493
4494 if (!dev)
4495 return -ENODEV;
4496
4497 switch (cmd) {
4498 case SIOCGIFFLAGS: /* Get interface flags */
4499 ifr->ifr_flags = (short) dev_get_flags(dev);
4500 return 0;
4501
4502 case SIOCGIFMETRIC: /* Get the metric on the interface
4503 (currently unused) */
4504 ifr->ifr_metric = 0;
4505 return 0;
4506
4507 case SIOCGIFMTU: /* Get the MTU of a device */
4508 ifr->ifr_mtu = dev->mtu;
4509 return 0;
4510
4511 case SIOCGIFHWADDR:
4512 if (!dev->addr_len)
4513 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4514 else
4515 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4516 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4517 ifr->ifr_hwaddr.sa_family = dev->type;
4518 return 0;
4519
4520 case SIOCGIFSLAVE:
4521 err = -EINVAL;
4522 break;
4523
4524 case SIOCGIFMAP:
4525 ifr->ifr_map.mem_start = dev->mem_start;
4526 ifr->ifr_map.mem_end = dev->mem_end;
4527 ifr->ifr_map.base_addr = dev->base_addr;
4528 ifr->ifr_map.irq = dev->irq;
4529 ifr->ifr_map.dma = dev->dma;
4530 ifr->ifr_map.port = dev->if_port;
4531 return 0;
4532
4533 case SIOCGIFINDEX:
4534 ifr->ifr_ifindex = dev->ifindex;
4535 return 0;
4536
4537 case SIOCGIFTXQLEN:
4538 ifr->ifr_qlen = dev->tx_queue_len;
4539 return 0;
4540
4541 default:
4542 /* dev_ioctl() should ensure this case
4543 * is never reached
4544 */
4545 WARN_ON(1);
4546 err = -EINVAL;
4547 break;
4548
4549 }
4550 return err;
4551}
4552
4553/*
4554 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4555 */
4556static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4557{
4558 int err;
4559 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4560 const struct net_device_ops *ops;
4561
4562 if (!dev)
4563 return -ENODEV;
4564
4565 ops = dev->netdev_ops;
4566
4567 switch (cmd) {
4568 case SIOCSIFFLAGS: /* Set interface flags */
4569 return dev_change_flags(dev, ifr->ifr_flags);
4570
4571 case SIOCSIFMETRIC: /* Set the metric on the interface
4572 (currently unused) */
4573 return -EOPNOTSUPP;
4574
4575 case SIOCSIFMTU: /* Set the MTU of a device */
4576 return dev_set_mtu(dev, ifr->ifr_mtu);
4577
4578 case SIOCSIFHWADDR:
4579 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4580
4581 case SIOCSIFHWBROADCAST:
4582 if (ifr->ifr_hwaddr.sa_family != dev->type)
4583 return -EINVAL;
4584 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4585 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4586 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4587 return 0;
4588
4589 case SIOCSIFMAP:
4590 if (ops->ndo_set_config) {
4591 if (!netif_device_present(dev))
4592 return -ENODEV;
4593 return ops->ndo_set_config(dev, &ifr->ifr_map);
4594 }
4595 return -EOPNOTSUPP;
4596
4597 case SIOCADDMULTI:
4598 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4599 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4600 return -EINVAL;
4601 if (!netif_device_present(dev))
4602 return -ENODEV;
4603 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4604
4605 case SIOCDELMULTI:
4606 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4607 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4608 return -EINVAL;
4609 if (!netif_device_present(dev))
4610 return -ENODEV;
4611 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4612
4613 case SIOCSIFTXQLEN:
4614 if (ifr->ifr_qlen < 0)
4615 return -EINVAL;
4616 dev->tx_queue_len = ifr->ifr_qlen;
4617 return 0;
4618
4619 case SIOCSIFNAME:
4620 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4621 return dev_change_name(dev, ifr->ifr_newname);
4622
4623 /*
4624 * Unknown or private ioctl
4625 */
4626 default:
4627 if ((cmd >= SIOCDEVPRIVATE &&
4628 cmd <= SIOCDEVPRIVATE + 15) ||
4629 cmd == SIOCBONDENSLAVE ||
4630 cmd == SIOCBONDRELEASE ||
4631 cmd == SIOCBONDSETHWADDR ||
4632 cmd == SIOCBONDSLAVEINFOQUERY ||
4633 cmd == SIOCBONDINFOQUERY ||
4634 cmd == SIOCBONDCHANGEACTIVE ||
4635 cmd == SIOCGMIIPHY ||
4636 cmd == SIOCGMIIREG ||
4637 cmd == SIOCSMIIREG ||
4638 cmd == SIOCBRADDIF ||
4639 cmd == SIOCBRDELIF ||
4640 cmd == SIOCSHWTSTAMP ||
4641 cmd == SIOCWANDEV) {
4642 err = -EOPNOTSUPP;
4643 if (ops->ndo_do_ioctl) {
4644 if (netif_device_present(dev))
4645 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4646 else
4647 err = -ENODEV;
4648 }
4649 } else
4650 err = -EINVAL;
4651
4652 }
4653 return err;
4654}
4655
4656/*
4657 * This function handles all "interface"-type I/O control requests. The actual
4658 * 'doing' part of this is dev_ifsioc above.
4659 */
4660
4661/**
4662 * dev_ioctl - network device ioctl
4663 * @net: the applicable net namespace
4664 * @cmd: command to issue
4665 * @arg: pointer to a struct ifreq in user space
4666 *
4667 * Issue ioctl functions to devices. This is normally called by the
4668 * user space syscall interfaces but can sometimes be useful for
4669 * other purposes. The return value is the return from the syscall if
4670 * positive or a negative errno code on error.
4671 */
4672
4673int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4674{
4675 struct ifreq ifr;
4676 int ret;
4677 char *colon;
4678
4679 /* One special case: SIOCGIFCONF takes an ifconf argument
4680 and requires a shared lock, because it sleeps writing
4681 to user space.
4682 */
4683
4684 if (cmd == SIOCGIFCONF) {
4685 rtnl_lock();
4686 ret = dev_ifconf(net, (char __user *) arg);
4687 rtnl_unlock();
4688 return ret;
4689 }
4690 if (cmd == SIOCGIFNAME)
4691 return dev_ifname(net, (struct ifreq __user *)arg);
4692
4693 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4694 return -EFAULT;
4695
4696 ifr.ifr_name[IFNAMSIZ-1] = 0;
4697
4698 colon = strchr(ifr.ifr_name, ':');
4699 if (colon)
4700 *colon = 0;
4701
4702 /*
4703 * See which interface the caller is talking about.
4704 */
4705
4706 switch (cmd) {
4707 /*
4708 * These ioctl calls:
4709 * - can be done by all.
4710 * - atomic and do not require locking.
4711 * - return a value
4712 */
4713 case SIOCGIFFLAGS:
4714 case SIOCGIFMETRIC:
4715 case SIOCGIFMTU:
4716 case SIOCGIFHWADDR:
4717 case SIOCGIFSLAVE:
4718 case SIOCGIFMAP:
4719 case SIOCGIFINDEX:
4720 case SIOCGIFTXQLEN:
4721 dev_load(net, ifr.ifr_name);
4722 rcu_read_lock();
4723 ret = dev_ifsioc_locked(net, &ifr, cmd);
4724 rcu_read_unlock();
4725 if (!ret) {
4726 if (colon)
4727 *colon = ':';
4728 if (copy_to_user(arg, &ifr,
4729 sizeof(struct ifreq)))
4730 ret = -EFAULT;
4731 }
4732 return ret;
4733
4734 case SIOCETHTOOL:
4735 dev_load(net, ifr.ifr_name);
4736 rtnl_lock();
4737 ret = dev_ethtool(net, &ifr);
4738 rtnl_unlock();
4739 if (!ret) {
4740 if (colon)
4741 *colon = ':';
4742 if (copy_to_user(arg, &ifr,
4743 sizeof(struct ifreq)))
4744 ret = -EFAULT;
4745 }
4746 return ret;
4747
4748 /*
4749 * These ioctl calls:
4750 * - require superuser power.
4751 * - require strict serialization.
4752 * - return a value
4753 */
4754 case SIOCGMIIPHY:
4755 case SIOCGMIIREG:
4756 case SIOCSIFNAME:
4757 if (!capable(CAP_NET_ADMIN))
4758 return -EPERM;
4759 dev_load(net, ifr.ifr_name);
4760 rtnl_lock();
4761 ret = dev_ifsioc(net, &ifr, cmd);
4762 rtnl_unlock();
4763 if (!ret) {
4764 if (colon)
4765 *colon = ':';
4766 if (copy_to_user(arg, &ifr,
4767 sizeof(struct ifreq)))
4768 ret = -EFAULT;
4769 }
4770 return ret;
4771
4772 /*
4773 * These ioctl calls:
4774 * - require superuser power.
4775 * - require strict serialization.
4776 * - do not return a value
4777 */
4778 case SIOCSIFFLAGS:
4779 case SIOCSIFMETRIC:
4780 case SIOCSIFMTU:
4781 case SIOCSIFMAP:
4782 case SIOCSIFHWADDR:
4783 case SIOCSIFSLAVE:
4784 case SIOCADDMULTI:
4785 case SIOCDELMULTI:
4786 case SIOCSIFHWBROADCAST:
4787 case SIOCSIFTXQLEN:
4788 case SIOCSMIIREG:
4789 case SIOCBONDENSLAVE:
4790 case SIOCBONDRELEASE:
4791 case SIOCBONDSETHWADDR:
4792 case SIOCBONDCHANGEACTIVE:
4793 case SIOCBRADDIF:
4794 case SIOCBRDELIF:
4795 case SIOCSHWTSTAMP:
4796 if (!capable(CAP_NET_ADMIN))
4797 return -EPERM;
4798 /* fall through */
4799 case SIOCBONDSLAVEINFOQUERY:
4800 case SIOCBONDINFOQUERY:
4801 dev_load(net, ifr.ifr_name);
4802 rtnl_lock();
4803 ret = dev_ifsioc(net, &ifr, cmd);
4804 rtnl_unlock();
4805 return ret;
4806
4807 case SIOCGIFMEM:
4808 /* Get the per device memory space. We can add this but
4809 * currently do not support it */
4810 case SIOCSIFMEM:
4811 /* Set the per device memory buffer space.
4812 * Not applicable in our case */
4813 case SIOCSIFLINK:
4814 return -EINVAL;
4815
4816 /*
4817 * Unknown or private ioctl.
4818 */
4819 default:
4820 if (cmd == SIOCWANDEV ||
4821 (cmd >= SIOCDEVPRIVATE &&
4822 cmd <= SIOCDEVPRIVATE + 15)) {
4823 dev_load(net, ifr.ifr_name);
4824 rtnl_lock();
4825 ret = dev_ifsioc(net, &ifr, cmd);
4826 rtnl_unlock();
4827 if (!ret && copy_to_user(arg, &ifr,
4828 sizeof(struct ifreq)))
4829 ret = -EFAULT;
4830 return ret;
4831 }
4832 /* Take care of Wireless Extensions */
4833 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4834 return wext_handle_ioctl(net, &ifr, cmd, arg);
4835 return -EINVAL;
4836 }
4837}
4838
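/*
 * Illustrative user-space counterpart, shown as a comment because it does
 * not belong in this translation unit: the SIOCGIFMTU branch handled above
 * is reached through an ioctl() on any socket, for example:
 *
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <net/if.h>
 *
 *	static int print_mtu(const char *ifname)
 *	{
 *		struct ifreq ifr;
 *		int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *		if (fd < 0)
 *			return -1;
 *		memset(&ifr, 0, sizeof(ifr));
 *		strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
 *		if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *			printf("%s: mtu %d\n", ifname, ifr.ifr_mtu);
 *		close(fd);
 *		return 0;
 *	}
 */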
4839
4840/**
4841 * dev_new_index - allocate an ifindex
4842 * @net: the applicable net namespace
4843 *
4844 * Returns a suitable unique value for a new device interface
4845 * number. The caller must hold the rtnl semaphore or the
4846 * dev_base_lock to be sure it remains unique.
4847 */
4848static int dev_new_index(struct net *net)
4849{
4850 static int ifindex;
4851 for (;;) {
4852 if (++ifindex <= 0)
4853 ifindex = 1;
4854 if (!__dev_get_by_index(net, ifindex))
4855 return ifindex;
4856 }
4857}
4858
4859/* Delayed registration/unregistration */
4860static LIST_HEAD(net_todo_list);
4861
4862static void net_set_todo(struct net_device *dev)
4863{
4864 list_add_tail(&dev->todo_list, &net_todo_list);
4865}
4866
4867static void rollback_registered_many(struct list_head *head)
4868{
4869 struct net_device *dev, *tmp;
4870
4871 BUG_ON(dev_boot_phase);
4872 ASSERT_RTNL();
4873
4874 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4875 /* Some devices call this without ever having been registered,
4876 * in order to unwind a failed initialization. Remove those
4877 * devices and proceed with the remaining.
4878 */
4879 if (dev->reg_state == NETREG_UNINITIALIZED) {
4880 pr_debug("unregister_netdevice: device %s/%p never "
4881 "was registered\n", dev->name, dev);
4882
4883 WARN_ON(1);
4884 list_del(&dev->unreg_list);
4885 continue;
4886 }
4887
4888 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4889
4890 /* If device is running, close it first. */
4891 dev_close(dev);
4892
4893 /* And unlink it from device chain. */
4894 unlist_netdevice(dev);
4895
4896 dev->reg_state = NETREG_UNREGISTERING;
4897 }
4898
4899 synchronize_net();
4900
4901 list_for_each_entry(dev, head, unreg_list) {
4902 /* Shutdown queueing discipline. */
4903 dev_shutdown(dev);
4904
4905
4906 /* Notify protocols, that we are about to destroy
4907 this device. They should clean all the things.
4908 */
4909 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4910
4911 if (!dev->rtnl_link_ops ||
4912 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4913 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4914
4915 /*
4916 * Flush the unicast and multicast chains
4917 */
4918 dev_uc_flush(dev);
4919 dev_mc_flush(dev);
4920
4921 if (dev->netdev_ops->ndo_uninit)
4922 dev->netdev_ops->ndo_uninit(dev);
4923
4924 /* Notifier chain MUST detach us from master device. */
4925 WARN_ON(dev->master);
4926
4927 /* Remove entries from kobject tree */
4928 netdev_unregister_kobject(dev);
4929 }
4930
4931 /* Process any work delayed until the end of the batch */
4932 dev = list_first_entry(head, struct net_device, unreg_list);
4933 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
4934
4935 rcu_barrier();
4936
4937 list_for_each_entry(dev, head, unreg_list)
4938 dev_put(dev);
4939}
4940
4941static void rollback_registered(struct net_device *dev)
4942{
4943 LIST_HEAD(single);
4944
4945 list_add(&dev->unreg_list, &single);
4946 rollback_registered_many(&single);
4947}
4948
4949unsigned long netdev_fix_features(unsigned long features, const char *name)
4950{
4951 /* Fix illegal SG+CSUM combinations. */
4952 if ((features & NETIF_F_SG) &&
4953 !(features & NETIF_F_ALL_CSUM)) {
4954 if (name)
4955 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4956 "checksum feature.\n", name);
4957 features &= ~NETIF_F_SG;
4958 }
4959
4960 /* TSO requires that SG is present as well. */
4961 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4962 if (name)
4963 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4964 "SG feature.\n", name);
4965 features &= ~NETIF_F_TSO;
4966 }
4967
4968 if (features & NETIF_F_UFO) {
4969 if (!(features & NETIF_F_GEN_CSUM)) {
4970 if (name)
4971 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4972 "since no NETIF_F_HW_CSUM feature.\n",
4973 name);
4974 features &= ~NETIF_F_UFO;
4975 }
4976
4977 if (!(features & NETIF_F_SG)) {
4978 if (name)
4979 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4980 "since no NETIF_F_SG feature.\n", name);
4981 features &= ~NETIF_F_UFO;
4982 }
4983 }
4984
4985 return features;
4986}
4987EXPORT_SYMBOL(netdev_fix_features);
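
/*
 * Illustrative sketch (hypothetical feature set): letting the helper above
 * strip combinations the stack cannot honour.  Because no NETIF_F_ALL_CSUM
 * bit is set in the wanted mask, NETIF_F_SG is dropped first and
 * NETIF_F_TSO is then dropped because it depends on NETIF_F_SG.
 */
static void __maybe_unused example_init_features(struct net_device *dev)
{
	unsigned long wanted = NETIF_F_SG | NETIF_F_TSO;

	dev->features = netdev_fix_features(wanted, dev->name);
}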
4988
4989/**
4990 * netif_stacked_transfer_operstate - transfer operstate
4991 * @rootdev: the root or lower level device to transfer state from
4992 * @dev: the device to transfer operstate to
4993 *
4994 * Transfer operational state from root to device. This is normally
4995 * called when a stacking relationship exists between the root
4996 * device and the device (a leaf device).
4997 */
4998void netif_stacked_transfer_operstate(const struct net_device *rootdev,
4999 struct net_device *dev)
5000{
5001 if (rootdev->operstate == IF_OPER_DORMANT)
5002 netif_dormant_on(dev);
5003 else
5004 netif_dormant_off(dev);
5005
5006 if (netif_carrier_ok(rootdev)) {
5007 if (!netif_carrier_ok(dev))
5008 netif_carrier_on(dev);
5009 } else {
5010 if (netif_carrier_ok(dev))
5011 netif_carrier_off(dev);
5012 }
5013}
5014EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5015
5016static int netif_alloc_rx_queues(struct net_device *dev)
5017{
5018#ifdef CONFIG_RPS
5019 unsigned int i, count = dev->num_rx_queues;
5020 struct netdev_rx_queue *rx;
5021
5022 BUG_ON(count < 1);
5023
5024 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5025 if (!rx) {
5026 pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5027 return -ENOMEM;
5028 }
5029 dev->_rx = rx;
5030
5031 /*
5032 * Set a pointer to first element in the array which holds the
5033 * reference count.
5034 */
5035 for (i = 0; i < count; i++)
5036 rx[i].first = rx;
5037#endif
5038 return 0;
5039}
5040
5041static int netif_alloc_netdev_queues(struct net_device *dev)
5042{
5043 unsigned int count = dev->num_tx_queues;
5044 struct netdev_queue *tx;
5045
5046 BUG_ON(count < 1);
5047
5048 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5049 if (!tx) {
5050 pr_err("netdev: Unable to allocate %u tx queues.\n",
5051 count);
5052 return -ENOMEM;
5053 }
5054 dev->_tx = tx;
5055 return 0;
5056}
5057
5058static void netdev_init_one_queue(struct net_device *dev,
5059 struct netdev_queue *queue,
5060 void *_unused)
5061{
5062 queue->dev = dev;
5063
5064 /* Initialize queue lock */
5065 spin_lock_init(&queue->_xmit_lock);
5066 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5067 queue->xmit_lock_owner = -1;
5068}
5069
5070static void netdev_init_queues(struct net_device *dev)
5071{
5072 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5073 spin_lock_init(&dev->tx_global_lock);
5074}
5075
5076/**
5077 * register_netdevice - register a network device
5078 * @dev: device to register
5079 *
5080 * Take a completed network device structure and add it to the kernel
5081 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5082 * chain. 0 is returned on success. A negative errno code is returned
5083 * on a failure to set up the device, or if the name is a duplicate.
5084 *
5085 * Callers must hold the rtnl semaphore. You may want
5086 * register_netdev() instead of this.
5087 *
5088 * BUGS:
5089 * The locking appears insufficient to guarantee two parallel registers
5090 * will not get the same name.
5091 */
5092
5093int register_netdevice(struct net_device *dev)
5094{
5095 int ret;
5096 struct net *net = dev_net(dev);
5097
5098 BUG_ON(dev_boot_phase);
5099 ASSERT_RTNL();
5100
5101 might_sleep();
5102
5103 /* When net_device structures are persistent, this will be fatal. */
5104 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5105 BUG_ON(!net);
5106
5107 spin_lock_init(&dev->addr_list_lock);
5108 netdev_set_addr_lockdep_class(dev);
5109
5110 dev->iflink = -1;
5111
5112 ret = netif_alloc_rx_queues(dev);
5113 if (ret)
5114 goto out;
5115
5116 ret = netif_alloc_netdev_queues(dev);
5117 if (ret)
5118 goto out;
5119
5120 netdev_init_queues(dev);
5121
5122 /* Init, if this function is available */
5123 if (dev->netdev_ops->ndo_init) {
5124 ret = dev->netdev_ops->ndo_init(dev);
5125 if (ret) {
5126 if (ret > 0)
5127 ret = -EIO;
5128 goto out;
5129 }
5130 }
5131
5132 ret = dev_get_valid_name(dev, dev->name, 0);
5133 if (ret)
5134 goto err_uninit;
5135
5136 dev->ifindex = dev_new_index(net);
5137 if (dev->iflink == -1)
5138 dev->iflink = dev->ifindex;
5139
5140 /* Fix illegal checksum combinations */
5141 if ((dev->features & NETIF_F_HW_CSUM) &&
5142 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5143 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5144 dev->name);
5145 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5146 }
5147
5148 if ((dev->features & NETIF_F_NO_CSUM) &&
5149 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5150 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5151 dev->name);
5152 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5153 }
5154
5155 dev->features = netdev_fix_features(dev->features, dev->name);
5156
5157 /* Enable software GSO if SG is supported. */
5158 if (dev->features & NETIF_F_SG)
5159 dev->features |= NETIF_F_GSO;
5160
5161 /* Enable GRO and NETIF_F_HIGHDMA for vlans by default;
5162 * vlan_dev_init() will do the dev->features check, so these features
5163 * are enabled only if supported by the underlying device.
5164 */
5165 dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5166
5167 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5168 ret = notifier_to_errno(ret);
5169 if (ret)
5170 goto err_uninit;
5171
5172 ret = netdev_register_kobject(dev);
5173 if (ret)
5174 goto err_uninit;
5175 dev->reg_state = NETREG_REGISTERED;
5176
5177 /*
5178 * Default initial state at registration is that the
5179 * device is present.
5180 */
5181
5182 set_bit(__LINK_STATE_PRESENT, &dev->state);
5183
5184 dev_init_scheduler(dev);
5185 dev_hold(dev);
5186 list_netdevice(dev);
5187
5188 /* Notify protocols, that a new device appeared. */
5189 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5190 ret = notifier_to_errno(ret);
5191 if (ret) {
5192 rollback_registered(dev);
5193 dev->reg_state = NETREG_UNREGISTERED;
5194 }
5195 /*
5196 * Prevent userspace races by waiting until the network
5197 * device is fully setup before sending notifications.
5198 */
5199 if (!dev->rtnl_link_ops ||
5200 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5201 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5202
5203out:
5204 return ret;
5205
5206err_uninit:
5207 if (dev->netdev_ops->ndo_uninit)
5208 dev->netdev_ops->ndo_uninit(dev);
5209 goto out;
5210}
5211EXPORT_SYMBOL(register_netdevice);
5212
5213/**
5214 * init_dummy_netdev - init a dummy network device for NAPI
5215 * @dev: device to init
5216 *
5217 * This takes a network device structure and initializes the minimum
5218 * number of fields so it can be used to schedule NAPI polls without
5219 * registering a full blown interface. This is to be used by drivers
5220 * that need to tie several hardware interfaces to a single NAPI
5221 * poll scheduler due to HW limitations.
5222 */
5223int init_dummy_netdev(struct net_device *dev)
5224{
5225 /* Clear everything. Note we don't initialize spinlocks
5226 * as they aren't supposed to be taken by any of the
5227 * NAPI code and this dummy netdev is supposed to be
5228 * only ever used for NAPI polls
5229 */
5230 memset(dev, 0, sizeof(struct net_device));
5231
5232 /* make sure we BUG if trying to hit standard
5233 * register/unregister code path
5234 */
5235 dev->reg_state = NETREG_DUMMY;
5236
5237 /* NAPI wants this */
5238 INIT_LIST_HEAD(&dev->napi_list);
5239
5240 /* a dummy interface is started by default */
5241 set_bit(__LINK_STATE_PRESENT, &dev->state);
5242 set_bit(__LINK_STATE_START, &dev->state);
5243
5244 /* Note : We don't allocate pcpu_refcnt for dummy devices,
5245 * because users of this 'device' don't need to change
5246 * its refcount.
5247 */
5248
5249 return 0;
5250}
5251EXPORT_SYMBOL_GPL(init_dummy_netdev);
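
/*
 * Illustrative sketch (hypothetical per-channel structure and poll routine):
 * a driver with several hardware channels but no per-channel net_device can
 * hang its NAPI contexts off a single dummy netdev initialised as above.
 */
struct example_channel {
	struct napi_struct napi;
};

static int example_poll(struct napi_struct *napi, int budget)
{
	/* a real driver would process up to @budget packets here */
	napi_complete(napi);
	return 0;
}

static void __maybe_unused example_setup_napi(struct net_device *dummy,
					      struct example_channel *ch)
{
	init_dummy_netdev(dummy);
	netif_napi_add(dummy, &ch->napi, example_poll, 64);
	napi_enable(&ch->napi);
}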
5252
5253
5254/**
5255 * register_netdev - register a network device
5256 * @dev: device to register
5257 *
5258 * Take a completed network device structure and add it to the kernel
5259 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5260 * chain. 0 is returned on success. A negative errno code is returned
5261 * on a failure to set up the device, or if the name is a duplicate.
5262 *
5263 * This is a wrapper around register_netdevice that takes the rtnl semaphore
5264 * and expands the device name if you passed a format string to
5265 * alloc_netdev.
5266 */
5267int register_netdev(struct net_device *dev)
5268{
5269 int err;
5270
5271 rtnl_lock();
5272
5273 /*
5274 * If the name is a format string the caller wants us to do a
5275 * name allocation.
5276 */
5277 if (strchr(dev->name, '%')) {
5278 err = dev_alloc_name(dev, dev->name);
5279 if (err < 0)
5280 goto out;
5281 }
5282
5283 err = register_netdevice(dev);
5284out:
5285 rtnl_unlock();
5286 return err;
5287}
5288EXPORT_SYMBOL(register_netdev);
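
/*
 * Illustrative sketch (hypothetical driver code; assumes <linux/etherdevice.h>
 * for alloc_etherdev()): the usual probe-time pairing of allocation and
 * registration.  The ops structure is left empty here; a real driver fills
 * in ndo_open(), ndo_start_xmit() and friends.
 */
struct example_priv {
	int dummy;
};

static const struct net_device_ops example_netdev_ops = {
	/* a real driver fills in .ndo_open, .ndo_start_xmit, ... here */
};

static struct net_device * __maybe_unused example_probe(void)
{
	struct net_device *dev = alloc_etherdev(sizeof(struct example_priv));

	if (!dev)
		return NULL;

	dev->netdev_ops = &example_netdev_ops;

	if (register_netdev(dev)) {	/* takes the RTNL itself */
		free_netdev(dev);
		return NULL;
	}
	return dev;
}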
5289
5290int netdev_refcnt_read(const struct net_device *dev)
5291{
5292 int i, refcnt = 0;
5293
5294 for_each_possible_cpu(i)
5295 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5296 return refcnt;
5297}
5298EXPORT_SYMBOL(netdev_refcnt_read);
5299
5300/*
5301 * netdev_wait_allrefs - wait until all references are gone.
5302 *
5303 * This is called when unregistering network devices.
5304 *
5305 * Any protocol or device that holds a reference should register
5306 * for netdevice notification, and cleanup and put back the
5307 * reference if they receive an UNREGISTER event.
5308 * We can get stuck here if buggy protocols don't correctly
5309 * call dev_put.
5310 */
5311static void netdev_wait_allrefs(struct net_device *dev)
5312{
5313 unsigned long rebroadcast_time, warning_time;
5314 int refcnt;
5315
5316 linkwatch_forget_dev(dev);
5317
5318 rebroadcast_time = warning_time = jiffies;
5319 refcnt = netdev_refcnt_read(dev);
5320
5321 while (refcnt != 0) {
5322 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5323 rtnl_lock();
5324
5325 /* Rebroadcast unregister notification */
5326 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5327 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5328 * should have already handled it the first time */
5329
5330 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5331 &dev->state)) {
5332 /* We must not have linkwatch events
5333 * pending on unregister. If this
5334 * happens, we simply run the queue
5335 * unscheduled, resulting in a noop
5336 * for this device.
5337 */
5338 linkwatch_run_queue();
5339 }
5340
5341 __rtnl_unlock();
5342
5343 rebroadcast_time = jiffies;
5344 }
5345
5346 msleep(250);
5347
5348 refcnt = netdev_refcnt_read(dev);
5349
5350 if (time_after(jiffies, warning_time + 10 * HZ)) {
5351 printk(KERN_EMERG "unregister_netdevice: "
5352 "waiting for %s to become free. Usage "
5353 "count = %d\n",
5354 dev->name, refcnt);
5355 warning_time = jiffies;
5356 }
5357 }
5358}
5359
5360/* The sequence is:
5361 *
5362 * rtnl_lock();
5363 * ...
5364 * register_netdevice(x1);
5365 * register_netdevice(x2);
5366 * ...
5367 * unregister_netdevice(y1);
5368 * unregister_netdevice(y2);
5369 * ...
5370 * rtnl_unlock();
5371 * free_netdev(y1);
5372 * free_netdev(y2);
5373 *
5374 * We are invoked by rtnl_unlock().
5375 * This allows us to deal with problems:
5376 * 1) We can delete sysfs objects which invoke hotplug
5377 * without deadlocking with linkwatch via keventd.
5378 * 2) Since we run with the RTNL semaphore not held, we can sleep
5379 * safely in order to wait for the netdev refcnt to drop to zero.
5380 *
5381 * We must not return until all unregister events added during
5382 * the interval the lock was held have been completed.
5383 */
5384void netdev_run_todo(void)
5385{
5386 struct list_head list;
5387
5388 /* Snapshot list, allow later requests */
5389 list_replace_init(&net_todo_list, &list);
5390
5391 __rtnl_unlock();
5392
5393 while (!list_empty(&list)) {
5394 struct net_device *dev
5395 = list_first_entry(&list, struct net_device, todo_list);
5396 list_del(&dev->todo_list);
5397
5398 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5399 printk(KERN_ERR "network todo '%s' but state %d\n",
5400 dev->name, dev->reg_state);
5401 dump_stack();
5402 continue;
5403 }
5404
5405 dev->reg_state = NETREG_UNREGISTERED;
5406
5407 on_each_cpu(flush_backlog, dev, 1);
5408
5409 netdev_wait_allrefs(dev);
5410
5411 /* paranoia */
5412 BUG_ON(netdev_refcnt_read(dev));
5413 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5414 WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5415 WARN_ON(dev->dn_ptr);
5416
5417 if (dev->destructor)
5418 dev->destructor(dev);
5419
5420 /* Free network device */
5421 kobject_put(&dev->dev.kobj);
5422 }
5423}
5424
5425/**
5426 * dev_txq_stats_fold - fold tx_queues stats
5427 * @dev: device to get statistics from
5428 * @stats: struct rtnl_link_stats64 to hold results
5429 */
5430void dev_txq_stats_fold(const struct net_device *dev,
5431 struct rtnl_link_stats64 *stats)
5432{
5433 u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5434 unsigned int i;
5435 struct netdev_queue *txq;
5436
5437 for (i = 0; i < dev->num_tx_queues; i++) {
5438 txq = netdev_get_tx_queue(dev, i);
5439 spin_lock_bh(&txq->_xmit_lock);
5440 tx_bytes += txq->tx_bytes;
5441 tx_packets += txq->tx_packets;
5442 tx_dropped += txq->tx_dropped;
5443 spin_unlock_bh(&txq->_xmit_lock);
5444 }
5445 if (tx_bytes || tx_packets || tx_dropped) {
5446 stats->tx_bytes = tx_bytes;
5447 stats->tx_packets = tx_packets;
5448 stats->tx_dropped = tx_dropped;
5449 }
5450}
5451EXPORT_SYMBOL(dev_txq_stats_fold);
5452
5453/* Convert net_device_stats to rtnl_link_stats64. They have the same
5454 * fields in the same order, with only the type differing.
5455 */
5456static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5457 const struct net_device_stats *netdev_stats)
5458{
5459#if BITS_PER_LONG == 64
5460 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5461 memcpy(stats64, netdev_stats, sizeof(*stats64));
5462#else
5463 size_t i, n = sizeof(*stats64) / sizeof(u64);
5464 const unsigned long *src = (const unsigned long *)netdev_stats;
5465 u64 *dst = (u64 *)stats64;
5466
5467 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5468 sizeof(*stats64) / sizeof(u64));
5469 for (i = 0; i < n; i++)
5470 dst[i] = src[i];
5471#endif
5472}
5473
5474/**
5475 * dev_get_stats - get network device statistics
5476 * @dev: device to get statistics from
5477 * @storage: place to store stats
5478 *
5479 * Get network statistics from device. Return @storage.
5480 * The device driver may provide its own method by setting
5481 * dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
5482 * otherwise the internal statistics structure is used.
5483 */
5484struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5485 struct rtnl_link_stats64 *storage)
5486{
5487 const struct net_device_ops *ops = dev->netdev_ops;
5488
5489 if (ops->ndo_get_stats64) {
5490 memset(storage, 0, sizeof(*storage));
5491 ops->ndo_get_stats64(dev, storage);
5492 } else if (ops->ndo_get_stats) {
5493 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5494 } else {
5495 netdev_stats_to_stats64(storage, &dev->stats);
5496 dev_txq_stats_fold(dev, storage);
5497 }
5498 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5499 return storage;
5500}
5501EXPORT_SYMBOL(dev_get_stats);
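
/*
 * Illustrative sketch (hypothetical helper): a typical caller pattern.  The
 * on-stack rtnl_link_stats64 is the @storage argument described above; the
 * returned pointer is simply that storage.
 */
static void __maybe_unused example_log_rx_packets(struct net_device *dev)
{
	struct rtnl_link_stats64 temp;
	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);

	netdev_info(dev, "rx_packets: %llu\n",
		    (unsigned long long)stats->rx_packets);
}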
5502
5503struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5504{
5505 struct netdev_queue *queue = dev_ingress_queue(dev);
5506
5507#ifdef CONFIG_NET_CLS_ACT
5508 if (queue)
5509 return queue;
5510 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5511 if (!queue)
5512 return NULL;
5513 netdev_init_one_queue(dev, queue, NULL);
5514 queue->qdisc = &noop_qdisc;
5515 queue->qdisc_sleeping = &noop_qdisc;
5516 rcu_assign_pointer(dev->ingress_queue, queue);
5517#endif
5518 return queue;
5519}
5520
5521/**
5522 * alloc_netdev_mq - allocate network device
5523 * @sizeof_priv: size of private data to allocate space for
5524 * @name: device name format string
5525 * @setup: callback to initialize device
5526 * @queue_count: the number of subqueues to allocate
5527 *
5528 * Allocates a struct net_device with private data area for driver use
5529 * and performs basic initialization. Also allocates subqueue structs
5530 * for each queue on the device at the end of the netdevice.
5531 */
5532struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5533 void (*setup)(struct net_device *), unsigned int queue_count)
5534{
5535 struct net_device *dev;
5536 size_t alloc_size;
5537 struct net_device *p;
5538
5539 BUG_ON(strlen(name) >= sizeof(dev->name));
5540
5541 if (queue_count < 1) {
5542 pr_err("alloc_netdev: Unable to allocate device "
5543 "with zero queues.\n");
5544 return NULL;
5545 }
5546
5547 alloc_size = sizeof(struct net_device);
5548 if (sizeof_priv) {
5549 /* ensure 32-byte alignment of private area */
5550 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5551 alloc_size += sizeof_priv;
5552 }
5553 /* ensure 32-byte alignment of whole construct */
5554 alloc_size += NETDEV_ALIGN - 1;
5555
5556 p = kzalloc(alloc_size, GFP_KERNEL);
5557 if (!p) {
5558 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5559 return NULL;
5560 }
5561
5562 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5563 dev->padded = (char *)dev - (char *)p;
5564
5565 dev->pcpu_refcnt = alloc_percpu(int);
5566 if (!dev->pcpu_refcnt)
5567 goto free_p;
5568
5569 if (dev_addr_init(dev))
5570 goto free_pcpu;
5571
5572 dev_mc_init(dev);
5573 dev_uc_init(dev);
5574
5575 dev_net_set(dev, &init_net);
5576
5577 dev->num_tx_queues = queue_count;
5578 dev->real_num_tx_queues = queue_count;
5579
5580#ifdef CONFIG_RPS
5581 dev->num_rx_queues = queue_count;
5582 dev->real_num_rx_queues = queue_count;
5583#endif
5584
5585 dev->gso_max_size = GSO_MAX_SIZE;
5586
5587 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5588 dev->ethtool_ntuple_list.count = 0;
5589 INIT_LIST_HEAD(&dev->napi_list);
5590 INIT_LIST_HEAD(&dev->unreg_list);
5591 INIT_LIST_HEAD(&dev->link_watch_list);
5592 dev->priv_flags = IFF_XMIT_DST_RELEASE;
5593 setup(dev);
5594 strcpy(dev->name, name);
5595 return dev;
5596
5597free_pcpu:
5598 free_percpu(dev->pcpu_refcnt);
5599free_p:
5600 kfree(p);
5601 return NULL;
5602}
5603EXPORT_SYMBOL(alloc_netdev_mq);
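
/*
 * Illustrative sketch (hypothetical values; assumes <linux/etherdevice.h>
 * for ether_setup()): allocating an Ethernet-style device with four TX
 * queues and no private area.  The "%d" in the name is expanded later by
 * register_netdev()/dev_alloc_name().
 */
static struct net_device * __maybe_unused example_alloc_mq(void)
{
	return alloc_netdev_mq(0, "ex%d", ether_setup, 4);
}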
5604
5605/**
5606 * free_netdev - free network device
5607 * @dev: device
5608 *
5609 * This function does the last stage of destroying an allocated device
5610 * interface. The reference to the device object is released.
5611 * If this is the last reference then it will be freed.
5612 */
5613void free_netdev(struct net_device *dev)
5614{
5615 struct napi_struct *p, *n;
5616
5617 release_net(dev_net(dev));
5618
5619 kfree(dev->_tx);
5620
5621 kfree(rcu_dereference_raw(dev->ingress_queue));
5622
5623 /* Flush device addresses */
5624 dev_addr_flush(dev);
5625
5626 /* Clear ethtool n-tuple list */
5627 ethtool_ntuple_flush(dev);
5628
5629 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5630 netif_napi_del(p);
5631
5632 free_percpu(dev->pcpu_refcnt);
5633 dev->pcpu_refcnt = NULL;
5634
5635 /* Compatibility with error handling in drivers */
5636 if (dev->reg_state == NETREG_UNINITIALIZED) {
5637 kfree((char *)dev - dev->padded);
5638 return;
5639 }
5640
5641 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5642 dev->reg_state = NETREG_RELEASED;
5643
5644 /* will free via device release */
5645 put_device(&dev->dev);
5646}
5647EXPORT_SYMBOL(free_netdev);
5648
5649/**
5650 * synchronize_net - Synchronize with packet receive processing
5651 *
5652 * Wait for packets currently being received to be done.
5653 * Does not block later packets from starting.
5654 */
5655void synchronize_net(void)
5656{
5657 might_sleep();
5658 synchronize_rcu();
5659}
5660EXPORT_SYMBOL(synchronize_net);
5661
5662/**
5663 * unregister_netdevice_queue - remove device from the kernel
5664 * @dev: device
5665 * @head: list
5666 *
5667 * This function shuts down a device interface and removes it
5668 * from the kernel tables.
5669 * If head is not NULL, the device is queued to be unregistered later.
5670 *
5671 * Callers must hold the rtnl semaphore. You may want
5672 * unregister_netdev() instead of this.
5673 */
5674
5675void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5676{
5677 ASSERT_RTNL();
5678
5679 if (head) {
5680 list_move_tail(&dev->unreg_list, head);
5681 } else {
5682 rollback_registered(dev);
5683 /* Finish processing unregister after unlock */
5684 net_set_todo(dev);
5685 }
5686}
5687EXPORT_SYMBOL(unregister_netdevice_queue);
5688
5689/**
5690 * unregister_netdevice_many - unregister many devices
5691 * @head: list of devices
5692 */
5693void unregister_netdevice_many(struct list_head *head)
5694{
5695 struct net_device *dev;
5696
5697 if (!list_empty(head)) {
5698 rollback_registered_many(head);
5699 list_for_each_entry(dev, head, unreg_list)
5700 net_set_todo(dev);
5701 }
5702}
5703EXPORT_SYMBOL(unregister_netdevice_many);
5704
5705/**
5706 * unregister_netdev - remove device from the kernel
5707 * @dev: device
5708 *
5709 * This function shuts down a device interface and removes it
5710 * from the kernel tables.
5711 *
5712 * This is just a wrapper for unregister_netdevice that takes
5713 * the rtnl semaphore. In general you want to use this and not
5714 * unregister_netdevice.
5715 */
5716void unregister_netdev(struct net_device *dev)
5717{
5718 rtnl_lock();
5719 unregister_netdevice(dev);
5720 rtnl_unlock();
5721}
5722EXPORT_SYMBOL(unregister_netdev);
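
/*
 * Illustrative sketch (hypothetical driver code): the remove path matching
 * the probe sketch near register_netdev() above.
 */
static void __maybe_unused example_remove(struct net_device *dev)
{
	unregister_netdev(dev);		/* takes the RTNL itself */
	free_netdev(dev);		/* releases the last reference */
}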
5723
5724/**
5725 * dev_change_net_namespace - move device to a different network namespace
5726 * @dev: device
5727 * @net: network namespace
5728 * @pat: If not NULL name pattern to try if the current device name
5729 * is already taken in the destination network namespace.
5730 *
5731 * This function shuts down a device interface and moves it
5732 * to a new network namespace. On success 0 is returned, on
5733 * a failure a netagive errno code is returned.
5734 *
5735 * Callers must hold the rtnl semaphore.
5736 */
5737
5738int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5739{
5740 int err;
5741
5742 ASSERT_RTNL();
5743
5744 /* Don't allow namespace local devices to be moved. */
5745 err = -EINVAL;
5746 if (dev->features & NETIF_F_NETNS_LOCAL)
5747 goto out;
5748
5749 /* Ensure the device has been registered */
5750 err = -EINVAL;
5751 if (dev->reg_state != NETREG_REGISTERED)
5752 goto out;
5753
5754 /* Get out if there is nothing to do */
5755 err = 0;
5756 if (net_eq(dev_net(dev), net))
5757 goto out;
5758
5759 /* Pick the destination device name, and ensure
5760 * we can use it in the destination network namespace.
5761 */
5762 err = -EEXIST;
5763 if (__dev_get_by_name(net, dev->name)) {
5764 /* We get here if we can't use the current device name */
5765 if (!pat)
5766 goto out;
5767 if (dev_get_valid_name(dev, pat, 1))
5768 goto out;
5769 }
5770
5771 /*
5772 * And now a mini version of register_netdevice and unregister_netdevice.
5773 */
5774
5775 /* If device is running close it first. */
5776 dev_close(dev);
5777
5778 /* And unlink it from device chain */
5779 err = -ENODEV;
5780 unlist_netdevice(dev);
5781
5782 synchronize_net();
5783
5784 /* Shutdown queueing discipline. */
5785 dev_shutdown(dev);
5786
5787 /* Notify protocols, that we are about to destroy
5788 this device. They should clean all the things.
5789
5790 Note that dev->reg_state stays at NETREG_REGISTERED.
5791 This is wanted because this way 8021q and macvlan know
5792 the device is just moving and can keep their slaves up.
5793 */
5794 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5795 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5796
5797 /*
5798 * Flush the unicast and multicast chains
5799 */
5800 dev_uc_flush(dev);
5801 dev_mc_flush(dev);
5802
5803 /* Actually switch the network namespace */
5804 dev_net_set(dev, net);
5805
5806 /* If there is an ifindex conflict assign a new one */
5807 if (__dev_get_by_index(net, dev->ifindex)) {
5808 int iflink = (dev->iflink == dev->ifindex);
5809 dev->ifindex = dev_new_index(net);
5810 if (iflink)
5811 dev->iflink = dev->ifindex;
5812 }
5813
5814 /* Fixup kobjects */
5815 err = device_rename(&dev->dev, dev->name);
5816 WARN_ON(err);
5817
5818 /* Add the device back in the hashes */
5819 list_netdevice(dev);
5820
5821 /* Notify protocols, that a new device appeared. */
5822 call_netdevice_notifiers(NETDEV_REGISTER, dev);
5823
5824 /*
5825 * Prevent userspace races by waiting until the network
5826 * device is fully setup before sending notifications.
5827 */
5828 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5829
5830 synchronize_net();
5831 err = 0;
5832out:
5833 return err;
5834}
5835EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5836
5837static int dev_cpu_callback(struct notifier_block *nfb,
5838 unsigned long action,
5839 void *ocpu)
5840{
5841 struct sk_buff **list_skb;
5842 struct sk_buff *skb;
5843 unsigned int cpu, oldcpu = (unsigned long)ocpu;
5844 struct softnet_data *sd, *oldsd;
5845
5846 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5847 return NOTIFY_OK;
5848
5849 local_irq_disable();
5850 cpu = smp_processor_id();
5851 sd = &per_cpu(softnet_data, cpu);
5852 oldsd = &per_cpu(softnet_data, oldcpu);
5853
5854 /* Find end of our completion_queue. */
5855 list_skb = &sd->completion_queue;
5856 while (*list_skb)
5857 list_skb = &(*list_skb)->next;
5858 /* Append completion queue from offline CPU. */
5859 *list_skb = oldsd->completion_queue;
5860 oldsd->completion_queue = NULL;
5861
5862 /* Append output queue from offline CPU. */
5863 if (oldsd->output_queue) {
5864 *sd->output_queue_tailp = oldsd->output_queue;
5865 sd->output_queue_tailp = oldsd->output_queue_tailp;
5866 oldsd->output_queue = NULL;
5867 oldsd->output_queue_tailp = &oldsd->output_queue;
5868 }
5869
5870 raise_softirq_irqoff(NET_TX_SOFTIRQ);
5871 local_irq_enable();
5872
5873 /* Process offline CPU's input_pkt_queue */
5874 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
5875 netif_rx(skb);
5876 input_queue_head_incr(oldsd);
5877 }
5878 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
5879 netif_rx(skb);
5880 input_queue_head_incr(oldsd);
5881 }
5882
5883 return NOTIFY_OK;
5884}
5885
5886
5887/**
5888 * netdev_increment_features - increment feature set by one
5889 * @all: current feature set
5890 * @one: new feature set
5891 * @mask: mask feature set
5892 *
5893 * Computes a new feature set after adding a device with feature set
5894 * @one to the master device with current feature set @all. Will not
5895 * enable anything that is off in @mask. Returns the new feature set.
5896 */
5897unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5898 unsigned long mask)
5899{
5900 /* If device needs checksumming, downgrade to it. */
5901 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5902 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5903 else if (mask & NETIF_F_ALL_CSUM) {
5904 /* If one device supports v4/v6 checksumming, set for all. */
5905 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5906 !(all & NETIF_F_GEN_CSUM)) {
5907 all &= ~NETIF_F_ALL_CSUM;
5908 all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5909 }
5910
5911 /* If one device supports hw checksumming, set for all. */
5912 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5913 all &= ~NETIF_F_ALL_CSUM;
5914 all |= NETIF_F_HW_CSUM;
5915 }
5916 }
5917
5918 one |= NETIF_F_ALL_CSUM;
5919
5920 one |= all & NETIF_F_ONE_FOR_ALL;
5921 all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
5922 all |= one & mask & NETIF_F_ONE_FOR_ALL;
5923
5924 return all;
5925}
5926EXPORT_SYMBOL(netdev_increment_features);
5927
5928static struct hlist_head *netdev_create_hash(void)
5929{
5930 int i;
5931 struct hlist_head *hash;
5932
5933 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5934 if (hash != NULL)
5935 for (i = 0; i < NETDEV_HASHENTRIES; i++)
5936 INIT_HLIST_HEAD(&hash[i]);
5937
5938 return hash;
5939}
5940
5941/* Initialize per network namespace state */
5942static int __net_init netdev_init(struct net *net)
5943{
5944 INIT_LIST_HEAD(&net->dev_base_head);
5945
5946 net->dev_name_head = netdev_create_hash();
5947 if (net->dev_name_head == NULL)
5948 goto err_name;
5949
5950 net->dev_index_head = netdev_create_hash();
5951 if (net->dev_index_head == NULL)
5952 goto err_idx;
5953
5954 return 0;
5955
5956err_idx:
5957 kfree(net->dev_name_head);
5958err_name:
5959 return -ENOMEM;
5960}
5961
5962/**
5963 * netdev_drivername - network driver for the device
5964 * @dev: network device
5965 * @buffer: buffer for resulting name
5966 * @len: size of buffer
5967 *
5968 * Determine network driver for device.
5969 */
5970char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5971{
5972 const struct device_driver *driver;
5973 const struct device *parent;
5974
5975 if (len <= 0 || !buffer)
5976 return buffer;
5977 buffer[0] = 0;
5978
5979 parent = dev->dev.parent;
5980
5981 if (!parent)
5982 return buffer;
5983
5984 driver = parent->driver;
5985 if (driver && driver->name)
5986 strlcpy(buffer, driver->name, len);
5987 return buffer;
5988}
5989
5990static int __netdev_printk(const char *level, const struct net_device *dev,
5991 struct va_format *vaf)
5992{
5993 int r;
5994
5995 if (dev && dev->dev.parent)
5996 r = dev_printk(level, dev->dev.parent, "%s: %pV",
5997 netdev_name(dev), vaf);
5998 else if (dev)
5999 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6000 else
6001 r = printk("%s(NULL net_device): %pV", level, vaf);
6002
6003 return r;
6004}
6005
6006int netdev_printk(const char *level, const struct net_device *dev,
6007 const char *format, ...)
6008{
6009 struct va_format vaf;
6010 va_list args;
6011 int r;
6012
6013 va_start(args, format);
6014
6015 vaf.fmt = format;
6016 vaf.va = &args;
6017
6018 r = __netdev_printk(level, dev, &vaf);
6019 va_end(args);
6020
6021 return r;
6022}
6023EXPORT_SYMBOL(netdev_printk);
6024
6025#define define_netdev_printk_level(func, level) \
6026int func(const struct net_device *dev, const char *fmt, ...) \
6027{ \
6028 int r; \
6029 struct va_format vaf; \
6030 va_list args; \
6031 \
6032 va_start(args, fmt); \
6033 \
6034 vaf.fmt = fmt; \
6035 vaf.va = &args; \
6036 \
6037 r = __netdev_printk(level, dev, &vaf); \
6038 va_end(args); \
6039 \
6040 return r; \
6041} \
6042EXPORT_SYMBOL(func);
6043
6044define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6045define_netdev_printk_level(netdev_alert, KERN_ALERT);
6046define_netdev_printk_level(netdev_crit, KERN_CRIT);
6047define_netdev_printk_level(netdev_err, KERN_ERR);
6048define_netdev_printk_level(netdev_warn, KERN_WARNING);
6049define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6050define_netdev_printk_level(netdev_info, KERN_INFO);
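
/*
 * Illustrative sketch (hypothetical link-state reporter): how a driver
 * typically uses the level helpers defined above; the output is prefixed
 * with the driver and device name automatically.
 */
static void __maybe_unused example_report_link(struct net_device *dev, bool up)
{
	if (up)
		netdev_info(dev, "link is up\n");
	else
		netdev_warn(dev, "link is down\n");
}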
6051
6052static void __net_exit netdev_exit(struct net *net)
6053{
6054 kfree(net->dev_name_head);
6055 kfree(net->dev_index_head);
6056}
6057
6058static struct pernet_operations __net_initdata netdev_net_ops = {
6059 .init = netdev_init,
6060 .exit = netdev_exit,
6061};
6062
6063static void __net_exit default_device_exit(struct net *net)
6064{
6065 struct net_device *dev, *aux;
6066 /*
6067 * Push all migratable network devices back to the
6068 * initial network namespace
6069 */
6070 rtnl_lock();
6071 for_each_netdev_safe(net, dev, aux) {
6072 int err;
6073 char fb_name[IFNAMSIZ];
6074
6075 /* Ignore unmovable devices (e.g. loopback) */
6076 if (dev->features & NETIF_F_NETNS_LOCAL)
6077 continue;
6078
6079 /* Leave virtual devices for the generic cleanup */
6080 if (dev->rtnl_link_ops)
6081 continue;
6082
6083 /* Push remaining network devices to init_net */
6084 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6085 err = dev_change_net_namespace(dev, &init_net, fb_name);
6086 if (err) {
6087 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6088 __func__, dev->name, err);
6089 BUG();
6090 }
6091 }
6092 rtnl_unlock();
6093}
6094
6095static void __net_exit default_device_exit_batch(struct list_head *net_list)
6096{
6097 /* At exit all network devices must be removed from a network
6098 * namespace. Do this in the reverse order of registration.
6099 * Do this across as many network namespaces as possible to
6100 * improve batching efficiency.
6101 */
6102 struct net_device *dev;
6103 struct net *net;
6104 LIST_HEAD(dev_kill_list);
6105
6106 rtnl_lock();
6107 list_for_each_entry(net, net_list, exit_list) {
6108 for_each_netdev_reverse(net, dev) {
6109 if (dev->rtnl_link_ops)
6110 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6111 else
6112 unregister_netdevice_queue(dev, &dev_kill_list);
6113 }
6114 }
6115 unregister_netdevice_many(&dev_kill_list);
6116 rtnl_unlock();
6117}
6118
6119static struct pernet_operations __net_initdata default_device_ops = {
6120 .exit = default_device_exit,
6121 .exit_batch = default_device_exit_batch,
6122};
6123
6124/*
6125 * Initialize the DEV module. At boot time this walks the device list and
6126 * unhooks any devices that fail to initialise (normally hardware not
6127 * present) and leaves us with a valid list of present and active devices.
6128 *
6129 */
6130
6131/*
6132 * This is called single threaded during boot, so no need
6133 * to take the rtnl semaphore.
6134 */
6135static int __init net_dev_init(void)
6136{
6137 int i, rc = -ENOMEM;
6138
6139 BUG_ON(!dev_boot_phase);
6140
6141 if (dev_proc_init())
6142 goto out;
6143
6144 if (netdev_kobject_init())
6145 goto out;
6146
6147 INIT_LIST_HEAD(&ptype_all);
6148 for (i = 0; i < PTYPE_HASH_SIZE; i++)
6149 INIT_LIST_HEAD(&ptype_base[i]);
6150
6151 if (register_pernet_subsys(&netdev_net_ops))
6152 goto out;
6153
6154 /*
6155 * Initialise the packet receive queues.
6156 */
6157
6158 for_each_possible_cpu(i) {
6159 struct softnet_data *sd = &per_cpu(softnet_data, i);
6160
6161 memset(sd, 0, sizeof(*sd));
6162 skb_queue_head_init(&sd->input_pkt_queue);
6163 skb_queue_head_init(&sd->process_queue);
6164 sd->completion_queue = NULL;
6165 INIT_LIST_HEAD(&sd->poll_list);
6166 sd->output_queue = NULL;
6167 sd->output_queue_tailp = &sd->output_queue;
6168#ifdef CONFIG_RPS
6169 sd->csd.func = rps_trigger_softirq;
6170 sd->csd.info = sd;
6171 sd->csd.flags = 0;
6172 sd->cpu = i;
6173#endif
6174
6175 sd->backlog.poll = process_backlog;
6176 sd->backlog.weight = weight_p;
6177 sd->backlog.gro_list = NULL;
6178 sd->backlog.gro_count = 0;
6179 }
6180
6181 dev_boot_phase = 0;
6182
6183 /* The loopback device is special: if any other network device
6184 * is present in a network namespace, the loopback device must
6185 * be present too. Since we now dynamically allocate and free the
6186 * loopback device, ensure this invariant is maintained by
6187 * keeping the loopback device as the first device on the
6188 * list of network devices, so that the loopback device
6189 * is the first device that appears and the last network device
6190 * that disappears.
6191 */
6192 if (register_pernet_device(&loopback_net_ops))
6193 goto out;
6194
6195 if (register_pernet_device(&default_device_ops))
6196 goto out;
6197
6198 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6199 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6200
6201 hotcpu_notifier(dev_cpu_callback, 0);
6202 dst_init();
6203 dev_mcast_init();
6204 rc = 0;
6205out:
6206 return rc;
6207}
6208
6209subsys_initcall(net_dev_init);
6210
6211static int __init initialize_hashrnd(void)
6212{
6213 get_random_bytes(&hashrnd, sizeof(hashrnd));
6214 return 0;
6215}
6216
6217late_initcall_sync(initialize_hashrnd);
6218