/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *	Paul Rusty Russell	:	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/wext.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <linux/pci.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/*
 *	The list of packet types we will receive (as opposed to discard)
 *	and the routines to invoke.
 *
 *	Why 16. Because with 16 the only overlap we get on a hash of the
 *	low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 *             sure which should go first, but I bet it won't make much
 *             difference if we are running VLANs.  The good news is that
 *             this protocol won't be in the list unless compiled in, so
 *             the average user (w/out VLANs) will not be adversely affected.
 *             --BLG
 *
 *		0800	IP
 *		8100    802.1Q VLAN
 *		0001	802.3
 *		0002	AX.25
 *		0004	802.2
 *		8035	RARP
 *		0005	SNAP
 *		0805	X.25
 *		0806	ARP
 *		8137	IPX
 *		0009	Localtalk
 *		86DD	IPv6
 */

#define PTYPE_HASH_SIZE	(16)
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;	/* Taps */

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);
	return 0;
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
	 ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
	 "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 *							--ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return &ptype_all;
	else
		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
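
/*
 * Usage sketch: a minimal protocol tap built on dev_add_pack() and removed
 * again with dev_remove_pack() below.  The identifiers example_rcv and
 * example_ptype are made-up names, not existing kernel symbols.
 *
 *	static int example_rcv(struct sk_buff *skb, struct net_device *dev,
 *			       struct packet_type *pt,
 *			       struct net_device *orig_dev)
 *	{
 *		// each matching handler gets its own clone of the frame
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type example_ptype __read_mostly = {
 *		.type	= htons(ETH_P_ALL),	// or a specific ethertype
 *		.func	= example_rcv,
 *	};
 *
 *	dev_add_pack(&example_ptype);
 *	...
 *	dev_remove_pack(&example_ptype);	// sleeps; pt may be freed afterwards
 */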

/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine to
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq	= s[i].map.irq;
			dev->base_addr	= s[i].map.base_addr;
			dev->mem_start	= s[i].map.mem_start;
			dev->mem_end	= s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);

/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
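
/*
 * Usage sketch: the refcounted lookup pairs with dev_put() once the caller
 * is done with the device (example_poke_device is a made-up name).
 *
 *	static int example_poke_device(struct net *net, const char *ifname)
 *	{
 *		struct net_device *dev = dev_get_by_name(net, ifname);
 *
 *		if (!dev)
 *			return -ENODEV;
 *		// ... use dev; the held reference keeps it from going away ...
 *		dev_put(dev);
 *		return 0;
 *	}
 */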

/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	dev_getbyhwaddr - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device. The caller must hold the
 *	rtnl semaphore. The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking
 *
 *	BUGS:
 *	If the API was consistent this would be __dev_get_by_hwaddr
 */

struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
{
	struct net_device *dev;

	ASSERT_RTNL();

	for_each_netdev(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	dev_get_by_flags_rcu - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rcu_read_lock(), and result refcount is unchanged.
 */

struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
				    unsigned short mask)
{
	struct net_device *dev, *ret;

	ret = NULL;
	for_each_netdev_rcu(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(dev_get_by_flags_rcu);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
int dev_valid_name(const char *name)
{
	if (*name == '\0')
		return 0;
	if (strlen(name) >= IFNAMSIZ)
		return 0;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return 0;

	while (*name) {
		if (*name == '/' || isspace(*name))
			return 0;
		name++;
	}
	return 1;
}
EXPORT_SYMBOL(dev_valid_name);
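
/*
 * For reference: names like "eth0" or "wan%d" pass this check (the '%' is
 * acceptable here; format expansion is dev_alloc_name()'s job), while "",
 * ".", "..", names of IFNAMSIZ or more characters, and names containing
 * '/' or whitespace are rejected.
 */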

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/*  avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
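
/*
 * Usage sketch: a driver that wants the next free "foo%d" slot before
 * registering the device ("foo%d" and the error label are illustrative).
 *
 *	err = dev_alloc_name(dev, "foo%d");	// e.g. expands to "foo0", "foo1", ...
 *	if (err < 0)
 *		goto free_netdev;
 *	// dev->name now holds the expanded name; err is the unit number
 */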

static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
{
	struct net *net;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (fmt && strchr(name, '%'))
		return dev_alloc_name(dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d"
 *	for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
		return 0;

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(dev, newname, 1);
	if (err < 0)
		return err;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		return ret;
	}

	write_lock_bh(&dev_base_lock);
	hlist_del(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			printk(KERN_ERR
			       "%s: name change rollback failed: %d.\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device,
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		if (dev->ifalias) {
			kfree(dev->ifalias);
			dev->ifalias = NULL;
		}
		return 0;
	}

	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!dev->ifalias)
		return -ENOMEM;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}


/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifiers(NETDEV_CHANGE, dev);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
	}
}
EXPORT_SYMBOL(netdev_state_change);

int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	return call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);

/**
 *	dev_load	- load a network module
 *	@net: the applicable net namespace
 *	@name: name of interface
 *
 *	If a network interface is not present and the process has suitable
 *	privileges this function loads the module. If module loading is not
 *	available in this kernel then it becomes a nop.
 */

void dev_load(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	rcu_read_unlock();

	if (!dev && capable(CAP_NET_ADMIN))
		request_module("%s", name);
}
EXPORT_SYMBOL(dev_load);

static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	/*
	 *	Is it even present?
	 */
	if (!netif_device_present(dev))
		return -ENODEV;

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	/*
	 *	Call device private open method
	 */
	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	/*
	 *	If it went open OK then:
	 */

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		/*
		 *	Set the flags.
		 */
		dev->flags |= IFF_UP;

		/*
		 *	Enable NET_DMA
		 */
		net_dmaengine_get();

		/*
		 *	Initialize multicasting status
		 */
		dev_set_rx_mode(dev);

		/*
		 *	Wakeup transmit queue engine
		 */
		dev_activate(dev);
	}

	return ret;
}

/**
 *	dev_open	- prepare an interface for use.
 *	@dev:	device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	/*
	 *	Is it already up?
	 */
	if (dev->flags & IFF_UP)
		return 0;

	/*
	 *	Open device
	 */
	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	/*
	 *	... and announce new interface.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);

static int __dev_close(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	ASSERT_RTNL();
	might_sleep();

	/*
	 *	Tell people we are going down, so that they can
	 *	prepare to death, when device is still operating.
	 */
	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

	clear_bit(__LINK_STATE_START, &dev->state);

	/* Synchronize to scheduled poll. We cannot touch poll list,
	 * it can be even on different cpu. So just clear netif_running().
	 *
	 * dev->stop() will invoke napi_disable() on all of it's
	 * napi_struct instances on this device.
	 */
	smp_mb__after_clear_bit(); /* Commit netif_running(). */

	dev_deactivate(dev);

	/*
	 *	Call the device specific close. This cannot fail.
	 *	Only if device is UP
	 *
	 *	We allow it to be called even after a DETACH hot-plug
	 *	event.
	 */
	if (ops->ndo_stop)
		ops->ndo_stop(dev);

	/*
	 *	Device is now down.
	 */

	dev->flags &= ~IFF_UP;

	/*
	 *	Shutdown NET_DMA
	 */
	net_dmaengine_put();

	return 0;
}

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	if (!(dev->flags & IFF_UP))
		return 0;

	__dev_close(dev);

	/*
	 * Tell people we are down
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_DOWN, dev);

	return 0;
}
EXPORT_SYMBOL(dev_close);


/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
	    dev->ethtool_ops->set_flags) {
		u32 flags = dev->ethtool_ops->get_flags(dev);
		if (flags & ETH_FLAG_LRO) {
			flags &= ~ETH_FLAG_LRO;
			dev->ethtool_ops->set_flags(dev, flags);
		}
	}
	WARN_ON(dev->features & NETIF_F_LRO);
}
EXPORT_SYMBOL(dev_disable_lro);


static int dev_boot_phase = 1;

/*
 *	Device change register/unregister. These are not inline or static
 *	as we export them to the world.
 */

/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 *	When registered all registration and up events are replayed
 *	to the new notifier to allow device to have a race free
 *	view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			nb->notifier_call(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				break;

			if (dev->flags & IFF_UP) {
				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
				nb->notifier_call(nb, NETDEV_DOWN, dev);
			}
			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
		}
	}

	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
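
/*
 * Usage sketch: a subsystem watching device events (example_netdev_event
 * and example_nb are made-up names).  Registration replays NETDEV_REGISTER
 * and NETDEV_UP for devices that already exist, as described above.
 *
 *	static int example_netdev_event(struct notifier_block *nb,
 *					unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *			// dev just came up
 *			break;
 *		case NETDEV_GOING_DOWN:
 *			// dev is about to go down
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block example_nb = {
 *		.notifier_call = example_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&example_nb);
 *	...
 *	unregister_netdevice_notifier(&example_nb);
 */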

/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	ASSERT_RTNL();
	return raw_notifier_call_chain(&netdev_chain, val, dev);
}

/* When > 0 there are consumers of rx skb time stamps */
static atomic_t netstamp_needed = ATOMIC_INIT(0);

void net_enable_timestamp(void)
{
	atomic_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
	atomic_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);

static inline void net_timestamp_set(struct sk_buff *skb)
{
	if (atomic_read(&netstamp_needed))
		__net_timestamp(skb);
	else
		skb->tstamp.tv64 = 0;
}

static inline void net_timestamp_check(struct sk_buff *skb)
{
	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
		__net_timestamp(skb);
}

/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	skb_orphan(skb);
	nf_reset(skb);

	if (!(dev->flags & IFF_UP) ||
	    (skb->len > (dev->mtu + dev->hard_header_len))) {
		kfree_skb(skb);
		return NET_RX_DROP;
	}
	skb_set_dev(skb, dev);
	skb->tstamp.tv64 = 0;
	skb->pkt_type = PACKET_HOST;
	skb->protocol = eth_type_trans(skb, dev);
	return netif_rx(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
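
/*
 * Usage sketch: a pair-style virtual driver handing frames from one end's
 * start_xmit to its peer's receive path (example_xmit, example_priv and the
 * ->peer field are made-up; veth-like drivers follow this pattern).
 *
 *	static netdev_tx_t example_xmit(struct sk_buff *skb,
 *					struct net_device *dev)
 *	{
 *		struct example_priv *priv = netdev_priv(dev);
 *
 *		// dev_forward_skb() scrubs the skb and calls netif_rx()
 *		dev_forward_skb(priv->peer, skb);
 *		return NETDEV_TX_OK;
 *	}
 */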

/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;

#ifdef CONFIG_NET_CLS_ACT
	if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
		net_timestamp_set(skb);
#else
	net_timestamp_set(skb);
#endif

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if ((ptype->dev == dev || !ptype->dev) &&
		    (ptype->af_packet_priv == NULL ||
		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
			if (!skb2)
				break;

			/* skb->nh should be correctly
			   set by sender, so that the second statement is
			   just protection against buggy protocols.
			 */
			skb_reset_mac_header(skb2);

			if (skb_network_header(skb2) < skb2->data ||
			    skb2->network_header > skb2->tail) {
				if (net_ratelimit())
					printk(KERN_CRIT "protocol %04x is "
					       "buggy, dev %s\n",
					       ntohs(skb2->protocol),
					       dev->name);
				skb_reset_network_header(skb2);
			}

			skb2->transport_header = skb2->network_header;
			skb2->pkt_type = PACKET_OUTGOING;
			ptype->func(skb2, skb->dev, ptype, skb->dev);
		}
	}
	rcu_read_unlock();
}

/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
 */
void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
	unsigned int real_num = dev->real_num_tx_queues;

	if (unlikely(txq > dev->num_tx_queues))
		;
	else if (txq > real_num)
		dev->real_num_tx_queues = txq;
	else if (txq < real_num) {
		dev->real_num_tx_queues = txq;
		qdisc_reset_all_tx_gt(dev, txq);
	}
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);
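
/*
 * Usage sketch: a multiqueue driver that allocated num_tx_queues up front
 * but only brings up the rings it actually found (MAX_QUEUES, priv and
 * found are made-up names).
 *
 *	dev = alloc_etherdev_mq(sizeof(*priv), MAX_QUEUES);
 *	...
 *	netif_set_real_num_tx_queues(dev, found);	// found <= MAX_QUEUES
 */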

static inline void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = &__get_cpu_var(softnet_data);
	q->next_sched = NULL;
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}

void __netif_schedule(struct Qdisc *q)
{
	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
		__netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);

1589
bea3348e 1590void dev_kfree_skb_irq(struct sk_buff *skb)
56079431 1591{
3578b0c8 1592 if (atomic_dec_and_test(&skb->users)) {
bea3348e
SH
1593 struct softnet_data *sd;
1594 unsigned long flags;
56079431 1595
bea3348e
SH
1596 local_irq_save(flags);
1597 sd = &__get_cpu_var(softnet_data);
1598 skb->next = sd->completion_queue;
1599 sd->completion_queue = skb;
1600 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1601 local_irq_restore(flags);
1602 }
56079431 1603}
bea3348e 1604EXPORT_SYMBOL(dev_kfree_skb_irq);
56079431
DV
1605
1606void dev_kfree_skb_any(struct sk_buff *skb)
1607{
1608 if (in_irq() || irqs_disabled())
1609 dev_kfree_skb_irq(skb);
1610 else
1611 dev_kfree_skb(skb);
1612}
1613EXPORT_SYMBOL(dev_kfree_skb_any);
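
/*
 * For reference: TX completion paths that may run in hardirq context or
 * with IRQs disabled use dev_kfree_skb_irq() or dev_kfree_skb_any() rather
 * than dev_kfree_skb(); the skbs are queued on softnet_data and reaped by
 * the TX softirq instead of being freed directly.
 */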


/**
 * netif_device_detach - mark device as removed
 * @dev: network device
 *
 * Mark device as removed from system and therefore no longer available.
 */
void netif_device_detach(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_stop_all_queues(dev);
	}
}
EXPORT_SYMBOL(netif_device_detach);

/**
 * netif_device_attach - mark device as attached
 * @dev: network device
 *
 * Mark device as attached from system and restart if needed.
 */
void netif_device_attach(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_wake_all_queues(dev);
		__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_device_attach);
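
/*
 * Usage sketch: PCI suspend/resume hooks typically bracket the power
 * transition with these markers (example_suspend and example_resume are
 * made-up names).
 *
 *	static int example_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(dev);	// stop queues, mark absent
 *		// ... save state, power down ...
 *		return 0;
 *	}
 *
 *	static int example_resume(struct pci_dev *pdev)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		// ... power up, restore state ...
 *		netif_device_attach(dev);	// restart queues if running
 *		return 0;
 *	}
 */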
static bool can_checksum_protocol(unsigned long features, __be16 protocol)
{
	return ((features & NETIF_F_GEN_CSUM) ||
		((features & NETIF_F_IP_CSUM) &&
		 protocol == htons(ETH_P_IP)) ||
		((features & NETIF_F_IPV6_CSUM) &&
		 protocol == htons(ETH_P_IPV6)) ||
		((features & NETIF_F_FCOE_CRC) &&
		 protocol == htons(ETH_P_FCOE)));
}

static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
{
	if (can_checksum_protocol(dev->features, skb->protocol))
		return true;

	if (skb->protocol == htons(ETH_P_8021Q)) {
		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
		if (can_checksum_protocol(dev->features & dev->vlan_features,
					  veh->h_vlan_encapsulated_proto))
			return true;
	}

	return false;
}

/**
 * skb_dev_set -- assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * If an skb is owned by a device already, we have to reset
 * all data private to the namespace a device belongs to
 * before assigning it a new device.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	skb_dst_drop(skb);
	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}
	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */

/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		/* Let GSO fix up the checksum. */
		goto out_set_summed;
	}

	offset = skb->csum_start - skb_headroom(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
1da177e4 1741
f6a78bfc
HX
1742/**
1743 * skb_gso_segment - Perform segmentation on skb.
1744 * @skb: buffer to segment
576a30eb 1745 * @features: features for the output path (see dev->features)
f6a78bfc
HX
1746 *
1747 * This function segments the given skb and returns a list of segments.
576a30eb
HX
1748 *
1749 * It may return NULL if the skb requires no segmentation. This is
1750 * only possible when GSO is used for verifying header integrity.
f6a78bfc 1751 */
576a30eb 1752struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
f6a78bfc
HX
1753{
1754 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1755 struct packet_type *ptype;
252e3346 1756 __be16 type = skb->protocol;
a430a43d 1757 int err;
f6a78bfc 1758
459a98ed 1759 skb_reset_mac_header(skb);
b0e380b1 1760 skb->mac_len = skb->network_header - skb->mac_header;
f6a78bfc
HX
1761 __skb_pull(skb, skb->mac_len);
1762
67fd1a73
HX
1763 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1764 struct net_device *dev = skb->dev;
1765 struct ethtool_drvinfo info = {};
1766
1767 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1768 dev->ethtool_ops->get_drvinfo(dev, &info);
1769
1770 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1771 "ip_summed=%d",
1772 info.driver, dev ? dev->features : 0L,
1773 skb->sk ? skb->sk->sk_route_caps : 0L,
1774 skb->len, skb->data_len, skb->ip_summed);
1775
a430a43d
HX
1776 if (skb_header_cloned(skb) &&
1777 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1778 return ERR_PTR(err);
1779 }
1780
f6a78bfc 1781 rcu_read_lock();
82d8a867
PE
1782 list_for_each_entry_rcu(ptype,
1783 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
f6a78bfc 1784 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
84fa7933 1785 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
a430a43d
HX
1786 err = ptype->gso_send_check(skb);
1787 segs = ERR_PTR(err);
1788 if (err || skb_gso_ok(skb, features))
1789 break;
d56f90a7
ACM
1790 __skb_push(skb, (skb->data -
1791 skb_network_header(skb)));
a430a43d 1792 }
576a30eb 1793 segs = ptype->gso_segment(skb, features);
f6a78bfc
HX
1794 break;
1795 }
1796 }
1797 rcu_read_unlock();
1798
98e399f8 1799 __skb_push(skb, skb->data - skb_mac_header(skb));
576a30eb 1800
f6a78bfc
HX
1801 return segs;
1802}
f6a78bfc
HX
1803EXPORT_SYMBOL(skb_gso_segment);
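/*
 * Illustrative sketch (not part of the original file): how a caller without
 * hardware GSO support might segment an skb with skb_gso_segment() and send
 * each resulting segment; dev_gso_segment()/dev_hard_start_xmit() below use
 * the same primitive.  my_xmit_one() is a hypothetical per-segment sender.
 */
#if 0	/* example only */
static int my_software_gso_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct sk_buff *segs;

	segs = skb_gso_segment(skb, dev->features);
	if (IS_ERR(segs))
		return PTR_ERR(segs);

	if (segs) {
		/* The original skb has been split up and is no longer needed. */
		kfree_skb(skb);
		skb = segs;
	}
	/* NULL means only header verification was requested; send skb as-is. */

	while (skb) {
		struct sk_buff *next = skb->next;

		skb->next = NULL;
		my_xmit_one(skb, dev);		/* hypothetical */
		skb = next;
	}
	return 0;
}
#endif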
1804
fb286bb2
HX
1805/* Take action when hardware reception checksum errors are detected. */
1806#ifdef CONFIG_BUG
1807void netdev_rx_csum_fault(struct net_device *dev)
1808{
1809 if (net_ratelimit()) {
4ec93edb 1810 printk(KERN_ERR "%s: hw csum failure.\n",
246a4212 1811 dev ? dev->name : "<unknown>");
fb286bb2
HX
1812 dump_stack();
1813 }
1814}
1815EXPORT_SYMBOL(netdev_rx_csum_fault);
1816#endif
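/*
 * Illustrative sketch (not part of the original file): a driver that trusted
 * the hardware checksum (CHECKSUM_UNNECESSARY) but re-verified it in software
 * and found a mismatch could report the event with netdev_rx_csum_fault()
 * and fall back to software checksumming.  my_sw_csum_check() is hypothetical.
 */
#if 0	/* example only */
static void my_verify_rx_csum(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_UNNECESSARY && my_sw_csum_check(skb)) {
		netdev_rx_csum_fault(skb->dev);
		skb->ip_summed = CHECKSUM_NONE;	/* let the stack checksum it */
	}
}
#endif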
1817
1da177e4
LT
1818/* Actually, we should eliminate this check as soon as we know that:
 1819 * 1. An IOMMU is present and allows mapping all of the memory.
1820 * 2. No high memory really exists on this machine.
1821 */
1822
9092c658 1823static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1da177e4 1824{
3d3a8533 1825#ifdef CONFIG_HIGHMEM
1da177e4 1826 int i;
5acbbd42
FT
1827 if (!(dev->features & NETIF_F_HIGHDMA)) {
1828 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1829 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1830 return 1;
1831 }
1da177e4 1832
5acbbd42
FT
1833 if (PCI_DMA_BUS_IS_PHYS) {
1834 struct device *pdev = dev->dev.parent;
1da177e4 1835
9092c658
ED
1836 if (!pdev)
1837 return 0;
5acbbd42
FT
1838 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1839 dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1840 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1841 return 1;
1842 }
1843 }
3d3a8533 1844#endif
1da177e4
LT
1845 return 0;
1846}
1da177e4 1847
f6a78bfc
HX
1848struct dev_gso_cb {
1849 void (*destructor)(struct sk_buff *skb);
1850};
1851
1852#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1853
1854static void dev_gso_skb_destructor(struct sk_buff *skb)
1855{
1856 struct dev_gso_cb *cb;
1857
1858 do {
1859 struct sk_buff *nskb = skb->next;
1860
1861 skb->next = nskb->next;
1862 nskb->next = NULL;
1863 kfree_skb(nskb);
1864 } while (skb->next);
1865
1866 cb = DEV_GSO_CB(skb);
1867 if (cb->destructor)
1868 cb->destructor(skb);
1869}
1870
1871/**
1872 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1873 * @skb: buffer to segment
1874 *
1875 * This function segments the given skb and stores the list of segments
1876 * in skb->next.
1877 */
1878static int dev_gso_segment(struct sk_buff *skb)
1879{
1880 struct net_device *dev = skb->dev;
1881 struct sk_buff *segs;
576a30eb
HX
1882 int features = dev->features & ~(illegal_highdma(dev, skb) ?
1883 NETIF_F_SG : 0);
1884
1885 segs = skb_gso_segment(skb, features);
1886
1887 /* Verifying header integrity only. */
1888 if (!segs)
1889 return 0;
f6a78bfc 1890
801678c5 1891 if (IS_ERR(segs))
f6a78bfc
HX
1892 return PTR_ERR(segs);
1893
1894 skb->next = segs;
1895 DEV_GSO_CB(skb)->destructor = skb->destructor;
1896 skb->destructor = dev_gso_skb_destructor;
1897
1898 return 0;
1899}
1900
fc6055a5
ED
1901/*
1902 * Try to orphan skb early, right before transmission by the device.
2244d07b
OH
1903 * We cannot orphan skb if tx timestamp is requested or the sk-reference
1904 * is needed on driver level for other reasons, e.g. see net/can/raw.c
fc6055a5
ED
1905 */
1906static inline void skb_orphan_try(struct sk_buff *skb)
1907{
87fd308c
ED
1908 struct sock *sk = skb->sk;
1909
2244d07b 1910 if (sk && !skb_shinfo(skb)->tx_flags) {
87fd308c
ED
1911 /* skb_tx_hash() won't be able to get sk.
 1912 * We copy sk_hash into skb->rxhash.
1913 */
1914 if (!skb->rxhash)
1915 skb->rxhash = sk->sk_hash;
fc6055a5 1916 skb_orphan(skb);
87fd308c 1917 }
fc6055a5
ED
1918}
1919
6afff0ca
JF
1920/*
1921 * Returns true if either:
1922 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
1923 * 2. skb is fragmented and the device does not support SG, or if
1924 * at least one of fragments is in highmem and device does not
1925 * support DMA from it.
1926 */
1927static inline int skb_needs_linearize(struct sk_buff *skb,
1928 struct net_device *dev)
1929{
1930 return skb_is_nonlinear(skb) &&
21dc3301 1931 ((skb_has_frag_list(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
6afff0ca
JF
1932 (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
1933 illegal_highdma(dev, skb))));
1934}
1935
fd2ea0a7
DM
1936int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1937 struct netdev_queue *txq)
f6a78bfc 1938{
00829823 1939 const struct net_device_ops *ops = dev->netdev_ops;
572a9d7b 1940 int rc = NETDEV_TX_OK;
00829823 1941
f6a78bfc 1942 if (likely(!skb->next)) {
9be9a6b9 1943 if (!list_empty(&ptype_all))
f6a78bfc
HX
1944 dev_queue_xmit_nit(skb, dev);
1945
93f154b5
ED
1946 /*
 1947 * If the device doesn't need skb->dst, release it right now while
 1948 * it's hot in this cpu cache
1949 */
adf30907
ED
1950 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1951 skb_dst_drop(skb);
1952
fc6055a5 1953 skb_orphan_try(skb);
9ccb8975
DM
1954
1955 if (netif_needs_gso(dev, skb)) {
1956 if (unlikely(dev_gso_segment(skb)))
1957 goto out_kfree_skb;
1958 if (skb->next)
1959 goto gso;
6afff0ca
JF
1960 } else {
1961 if (skb_needs_linearize(skb, dev) &&
1962 __skb_linearize(skb))
1963 goto out_kfree_skb;
1964
1965 /* If packet is not checksummed and device does not
1966 * support checksumming for this protocol, complete
1967 * checksumming here.
1968 */
1969 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1970 skb_set_transport_header(skb, skb->csum_start -
1971 skb_headroom(skb));
1972 if (!dev_can_checksum(dev, skb) &&
1973 skb_checksum_help(skb))
1974 goto out_kfree_skb;
1975 }
9ccb8975
DM
1976 }
1977
ac45f602 1978 rc = ops->ndo_start_xmit(skb, dev);
ec634fe3 1979 if (rc == NETDEV_TX_OK)
08baf561 1980 txq_trans_update(txq);
ac45f602 1981 return rc;
f6a78bfc
HX
1982 }
1983
576a30eb 1984gso:
f6a78bfc
HX
1985 do {
1986 struct sk_buff *nskb = skb->next;
f6a78bfc
HX
1987
1988 skb->next = nskb->next;
1989 nskb->next = NULL;
068a2de5
KK
1990
1991 /*
 1992 * If the device doesn't need nskb->dst, release it right now while
 1993 * it's hot in this cpu cache
1994 */
1995 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1996 skb_dst_drop(nskb);
1997
00829823 1998 rc = ops->ndo_start_xmit(nskb, dev);
ec634fe3 1999 if (unlikely(rc != NETDEV_TX_OK)) {
572a9d7b
PM
2000 if (rc & ~NETDEV_TX_MASK)
2001 goto out_kfree_gso_skb;
f54d9e8d 2002 nskb->next = skb->next;
f6a78bfc
HX
2003 skb->next = nskb;
2004 return rc;
2005 }
08baf561 2006 txq_trans_update(txq);
fd2ea0a7 2007 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
f54d9e8d 2008 return NETDEV_TX_BUSY;
f6a78bfc 2009 } while (skb->next);
4ec93edb 2010
572a9d7b
PM
2011out_kfree_gso_skb:
2012 if (likely(skb->next == NULL))
2013 skb->destructor = DEV_GSO_CB(skb)->destructor;
f6a78bfc
HX
2014out_kfree_skb:
2015 kfree_skb(skb);
572a9d7b 2016 return rc;
f6a78bfc
HX
2017}
2018
0a9627f2 2019static u32 hashrnd __read_mostly;
b6b2fed1 2020
9247744e 2021u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
8f0f2223 2022{
7019298a 2023 u32 hash;
b6b2fed1 2024
513de11b
DM
2025 if (skb_rx_queue_recorded(skb)) {
2026 hash = skb_get_rx_queue(skb);
d1b19dff 2027 while (unlikely(hash >= dev->real_num_tx_queues))
513de11b
DM
2028 hash -= dev->real_num_tx_queues;
2029 return hash;
2030 }
ec581f6a
ED
2031
2032 if (skb->sk && skb->sk->sk_hash)
7019298a 2033 hash = skb->sk->sk_hash;
ec581f6a 2034 else
87fd308c 2035 hash = (__force u16) skb->protocol ^ skb->rxhash;
0a9627f2 2036 hash = jhash_1word(hash, hashrnd);
b6b2fed1
DM
2037
2038 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
8f0f2223 2039}
9247744e 2040EXPORT_SYMBOL(skb_tx_hash);
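/*
 * Note with a worked example (not part of the original file): the final line
 * of skb_tx_hash() maps the 32-bit jhash value onto [0, real_num_tx_queues)
 * without a modulo by computing ((u64)hash * n) >> 32.  For instance, with
 * n = 4 queues and hash = 0x40000000 (one quarter of the 32-bit range),
 * (0x40000000ULL * 4) >> 32 = 1, i.e. the flow lands on queue 1.
 */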
8f0f2223 2041
ed04642f
ED
2042static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2043{
2044 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2045 if (net_ratelimit()) {
7a161ea9
ED
2046 pr_warning("%s selects TX queue %d, but "
2047 "real number of TX queues is %d\n",
2048 dev->name, queue_index, dev->real_num_tx_queues);
ed04642f
ED
2049 }
2050 return 0;
2051 }
2052 return queue_index;
2053}
2054
e8a0464c
DM
2055static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2056 struct sk_buff *skb)
2057{
b0f77d0e 2058 int queue_index;
a4ee3ce3
KK
2059 struct sock *sk = skb->sk;
2060
b0f77d0e
TH
2061 queue_index = sk_tx_queue_get(sk);
2062 if (queue_index < 0) {
a4ee3ce3
KK
2063 const struct net_device_ops *ops = dev->netdev_ops;
2064
2065 if (ops->ndo_select_queue) {
2066 queue_index = ops->ndo_select_queue(dev, skb);
ed04642f 2067 queue_index = dev_cap_txqueue(dev, queue_index);
a4ee3ce3
KK
2068 } else {
2069 queue_index = 0;
2070 if (dev->real_num_tx_queues > 1)
2071 queue_index = skb_tx_hash(dev, skb);
fd2ea0a7 2072
8728c544 2073 if (sk) {
87eb3670 2074 struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);
8728c544
ED
2075
2076 if (dst && skb_dst(skb) == dst)
2077 sk_tx_queue_set(sk, queue_index);
2078 }
a4ee3ce3
KK
2079 }
2080 }
eae792b7 2081
fd2ea0a7
DM
2082 skb_set_queue_mapping(skb, queue_index);
2083 return netdev_get_tx_queue(dev, queue_index);
e8a0464c
DM
2084}
2085
bbd8a0d3
KK
2086static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2087 struct net_device *dev,
2088 struct netdev_queue *txq)
2089{
2090 spinlock_t *root_lock = qdisc_lock(q);
79640a4c 2091 bool contended = qdisc_is_running(q);
bbd8a0d3
KK
2092 int rc;
2093
79640a4c
ED
2094 /*
2095 * Heuristic to force contended enqueues to serialize on a
2096 * separate lock before trying to get qdisc main lock.
2097 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2098 * and dequeue packets faster.
2099 */
2100 if (unlikely(contended))
2101 spin_lock(&q->busylock);
2102
bbd8a0d3
KK
2103 spin_lock(root_lock);
2104 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2105 kfree_skb(skb);
2106 rc = NET_XMIT_DROP;
2107 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
bc135b23 2108 qdisc_run_begin(q)) {
bbd8a0d3
KK
2109 /*
2110 * This is a work-conserving queue; there are no old skbs
2111 * waiting to be sent out; and the qdisc is not running -
2112 * xmit the skb directly.
2113 */
7fee226a
ED
2114 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2115 skb_dst_force(skb);
bbd8a0d3 2116 __qdisc_update_bstats(q, skb->len);
79640a4c
ED
2117 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2118 if (unlikely(contended)) {
2119 spin_unlock(&q->busylock);
2120 contended = false;
2121 }
bbd8a0d3 2122 __qdisc_run(q);
79640a4c 2123 } else
bc135b23 2124 qdisc_run_end(q);
bbd8a0d3
KK
2125
2126 rc = NET_XMIT_SUCCESS;
2127 } else {
7fee226a 2128 skb_dst_force(skb);
bbd8a0d3 2129 rc = qdisc_enqueue_root(skb, q);
79640a4c
ED
2130 if (qdisc_run_begin(q)) {
2131 if (unlikely(contended)) {
2132 spin_unlock(&q->busylock);
2133 contended = false;
2134 }
2135 __qdisc_run(q);
2136 }
bbd8a0d3
KK
2137 }
2138 spin_unlock(root_lock);
79640a4c
ED
2139 if (unlikely(contended))
2140 spin_unlock(&q->busylock);
bbd8a0d3
KK
2141 return rc;
2142}
2143
d29f749e
DJ
2144/**
2145 * dev_queue_xmit - transmit a buffer
2146 * @skb: buffer to transmit
2147 *
2148 * Queue a buffer for transmission to a network device. The caller must
2149 * have set the device and priority and built the buffer before calling
2150 * this function. The function can be called from an interrupt.
2151 *
2152 * A negative errno code is returned on a failure. A success does not
2153 * guarantee the frame will be transmitted as it may be dropped due
2154 * to congestion or traffic shaping.
2155 *
2156 * -----------------------------------------------------------------------------------
2157 * I notice this method can also return errors from the queue disciplines,
2158 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2159 * be positive.
2160 *
2161 * Regardless of the return value, the skb is consumed, so it is currently
2162 * difficult to retry a send to this method. (You can bump the ref count
2163 * before sending to hold a reference for retry if you are careful.)
2164 *
2165 * When calling this method, interrupts MUST be enabled. This is because
2166 * the BH enable code must have IRQs enabled so that it will not deadlock.
2167 * --BLG
2168 */
1da177e4
LT
2169int dev_queue_xmit(struct sk_buff *skb)
2170{
2171 struct net_device *dev = skb->dev;
dc2b4847 2172 struct netdev_queue *txq;
1da177e4
LT
2173 struct Qdisc *q;
2174 int rc = -ENOMEM;
2175
4ec93edb
YH
2176 /* Disable soft irqs for various locks below. Also
2177 * stops preemption for RCU.
1da177e4 2178 */
4ec93edb 2179 rcu_read_lock_bh();
1da177e4 2180
eae792b7 2181 txq = dev_pick_tx(dev, skb);
a898def2 2182 q = rcu_dereference_bh(txq->qdisc);
37437bb2 2183
1da177e4 2184#ifdef CONFIG_NET_CLS_ACT
d1b19dff 2185 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1da177e4
LT
2186#endif
2187 if (q->enqueue) {
bbd8a0d3 2188 rc = __dev_xmit_skb(skb, q, dev, txq);
37437bb2 2189 goto out;
1da177e4
LT
2190 }
2191
2192 /* The device has no queue. Common case for software devices:
2193 loopback, all sorts of tunnels...
2194
932ff279
HX
2195 Really, it is unlikely that netif_tx_lock protection is necessary
 2196 here. (e.g. loopback and IP tunnels are clean, ignoring statistics
1da177e4
LT
2197 counters.)
 2198 However, it is possible that they rely on the protection
 2199 we provide here.
2200
 2201 Check this and take the lock. It is not prone to deadlocks.
 2202 Alternatively, shoot the noqueue qdisc entirely; that is even simpler 8)
2203 */
2204 if (dev->flags & IFF_UP) {
2205 int cpu = smp_processor_id(); /* ok because BHs are off */
2206
c773e847 2207 if (txq->xmit_lock_owner != cpu) {
1da177e4 2208
c773e847 2209 HARD_TX_LOCK(dev, txq, cpu);
1da177e4 2210
fd2ea0a7 2211 if (!netif_tx_queue_stopped(txq)) {
572a9d7b
PM
2212 rc = dev_hard_start_xmit(skb, dev, txq);
2213 if (dev_xmit_complete(rc)) {
c773e847 2214 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
2215 goto out;
2216 }
2217 }
c773e847 2218 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
2219 if (net_ratelimit())
2220 printk(KERN_CRIT "Virtual device %s asks to "
2221 "queue packet!\n", dev->name);
2222 } else {
2223 /* Recursion is detected! It is possible,
2224 * unfortunately */
2225 if (net_ratelimit())
2226 printk(KERN_CRIT "Dead loop on virtual device "
2227 "%s, fix it urgently!\n", dev->name);
2228 }
2229 }
2230
2231 rc = -ENETDOWN;
d4828d85 2232 rcu_read_unlock_bh();
1da177e4 2233
1da177e4
LT
2234 kfree_skb(skb);
2235 return rc;
2236out:
d4828d85 2237 rcu_read_unlock_bh();
1da177e4
LT
2238 return rc;
2239}
d1b19dff 2240EXPORT_SYMBOL(dev_queue_xmit);
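/*
 * Illustrative sketch (not part of the original file): a minimal way a kernel
 * module might hand a pre-built frame to dev_queue_xmit().  The payload,
 * dst_mac and the choice of ETH_P_IP are hypothetical placeholders; real
 * senders normally go through the protocol layers rather than calling this
 * directly.
 */
#if 0	/* example only */
static int my_send_raw_frame(struct net_device *dev, const u8 *dst_mac,
			     const void *payload, unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_reset_network_header(skb);
	memcpy(skb_put(skb, len), payload, len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);	/* hypothetical payload type */
	skb->priority = 0;

	if (dev_hard_header(skb, dev, ETH_P_IP, dst_mac, NULL, skb->len) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/* The skb is consumed regardless of the return value. */
	return dev_queue_xmit(skb);
}
#endif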
1da177e4
LT
2241
2242
2243/*=======================================================================
2244 Receiver routines
2245 =======================================================================*/
2246
6b2bedc3 2247int netdev_max_backlog __read_mostly = 1000;
3b098e2d 2248int netdev_tstamp_prequeue __read_mostly = 1;
6b2bedc3
SH
2249int netdev_budget __read_mostly = 300;
2250int weight_p __read_mostly = 64; /* old backlog weight */
1da177e4 2251
eecfd7c4
ED
2252/* Called with irq disabled */
2253static inline void ____napi_schedule(struct softnet_data *sd,
2254 struct napi_struct *napi)
2255{
2256 list_add_tail(&napi->poll_list, &sd->poll_list);
2257 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2258}
2259
0a9627f2 2260/*
bfb564e7
KK
2261 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2262 * and src/dst port numbers. Returns a non-zero hash number on success
2263 * and 0 on failure.
0a9627f2 2264 */
bfb564e7 2265__u32 __skb_get_rxhash(struct sk_buff *skb)
0a9627f2 2266{
12fcdefb 2267 int nhoff, hash = 0, poff;
0a9627f2
TH
2268 struct ipv6hdr *ip6;
2269 struct iphdr *ip;
0a9627f2 2270 u8 ip_proto;
8c52d509
CG
2271 u32 addr1, addr2, ihl;
2272 union {
2273 u32 v32;
2274 u16 v16[2];
2275 } ports;
0a9627f2 2276
bfb564e7 2277 nhoff = skb_network_offset(skb);
0a9627f2
TH
2278
2279 switch (skb->protocol) {
2280 case __constant_htons(ETH_P_IP):
bfb564e7 2281 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
0a9627f2
TH
2282 goto done;
2283
1003489e 2284 ip = (struct iphdr *) (skb->data + nhoff);
dbe5775b
CG
2285 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2286 ip_proto = 0;
2287 else
2288 ip_proto = ip->protocol;
b249dcb8
ED
2289 addr1 = (__force u32) ip->saddr;
2290 addr2 = (__force u32) ip->daddr;
0a9627f2
TH
2291 ihl = ip->ihl;
2292 break;
2293 case __constant_htons(ETH_P_IPV6):
bfb564e7 2294 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
0a9627f2
TH
2295 goto done;
2296
1003489e 2297 ip6 = (struct ipv6hdr *) (skb->data + nhoff);
0a9627f2 2298 ip_proto = ip6->nexthdr;
b249dcb8
ED
2299 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2300 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
0a9627f2
TH
2301 ihl = (40 >> 2);
2302 break;
2303 default:
2304 goto done;
2305 }
bfb564e7 2306
12fcdefb
CG
2307 ports.v32 = 0;
2308 poff = proto_ports_offset(ip_proto);
2309 if (poff >= 0) {
2310 nhoff += ihl * 4 + poff;
2311 if (pskb_may_pull(skb, nhoff + 4)) {
2312 ports.v32 = * (__force u32 *) (skb->data + nhoff);
8c52d509
CG
2313 if (ports.v16[1] < ports.v16[0])
2314 swap(ports.v16[0], ports.v16[1]);
b249dcb8 2315 }
0a9627f2
TH
2316 }
2317
b249dcb8
ED
2318 /* get a consistent hash (same value on both flow directions) */
2319 if (addr2 < addr1)
2320 swap(addr1, addr2);
0a9627f2 2321
bfb564e7
KK
2322 hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2323 if (!hash)
2324 hash = 1;
2325
2326done:
2327 return hash;
2328}
2329EXPORT_SYMBOL(__skb_get_rxhash);
2330
2331#ifdef CONFIG_RPS
2332
2333/* One global table that all flow-based protocols share. */
2334struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
2335EXPORT_SYMBOL(rps_sock_flow_table);
2336
2337/*
2338 * get_rps_cpu is called from netif_receive_skb and returns the target
2339 * CPU from the RPS map of the receiving queue for a given skb.
2340 * rcu_read_lock must be held on entry.
2341 */
2342static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2343 struct rps_dev_flow **rflowp)
2344{
2345 struct netdev_rx_queue *rxqueue;
2346 struct rps_map *map;
2347 struct rps_dev_flow_table *flow_table;
2348 struct rps_sock_flow_table *sock_flow_table;
2349 int cpu = -1;
2350 u16 tcpu;
2351
2352 if (skb_rx_queue_recorded(skb)) {
2353 u16 index = skb_get_rx_queue(skb);
2354 if (unlikely(index >= dev->num_rx_queues)) {
2355 WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
2356 "on queue %u, but number of RX queues is %u\n",
2357 dev->name, index, dev->num_rx_queues);
2358 goto done;
2359 }
2360 rxqueue = dev->_rx + index;
2361 } else
2362 rxqueue = dev->_rx;
2363
2364 if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
2365 goto done;
2366
2d47b459 2367 skb_reset_network_header(skb);
bfb564e7
KK
2368 if (!skb_get_rxhash(skb))
2369 goto done;
2370
fec5e652
TH
2371 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2372 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2373 if (flow_table && sock_flow_table) {
2374 u16 next_cpu;
2375 struct rps_dev_flow *rflow;
2376
2377 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2378 tcpu = rflow->cpu;
2379
2380 next_cpu = sock_flow_table->ents[skb->rxhash &
2381 sock_flow_table->mask];
2382
2383 /*
2384 * If the desired CPU (where last recvmsg was done) is
2385 * different from current CPU (one in the rx-queue flow
2386 * table entry), switch if one of the following holds:
2387 * - Current CPU is unset (equal to RPS_NO_CPU).
2388 * - Current CPU is offline.
2389 * - The current CPU's queue tail has advanced beyond the
2390 * last packet that was enqueued using this table entry.
2391 * This guarantees that all previous packets for the flow
 2392 * have been dequeued, thus preserving in-order delivery.
2393 */
2394 if (unlikely(tcpu != next_cpu) &&
2395 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2396 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2397 rflow->last_qtail)) >= 0)) {
2398 tcpu = rflow->cpu = next_cpu;
2399 if (tcpu != RPS_NO_CPU)
2400 rflow->last_qtail = per_cpu(softnet_data,
2401 tcpu).input_queue_head;
2402 }
2403 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2404 *rflowp = rflow;
2405 cpu = tcpu;
2406 goto done;
2407 }
2408 }
2409
0a9627f2
TH
2410 map = rcu_dereference(rxqueue->rps_map);
2411 if (map) {
fec5e652 2412 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
0a9627f2
TH
2413
2414 if (cpu_online(tcpu)) {
2415 cpu = tcpu;
2416 goto done;
2417 }
2418 }
2419
2420done:
0a9627f2
TH
2421 return cpu;
2422}
2423
0a9627f2 2424/* Called from hardirq (IPI) context */
e36fa2f7 2425static void rps_trigger_softirq(void *data)
0a9627f2 2426{
e36fa2f7
ED
2427 struct softnet_data *sd = data;
2428
eecfd7c4 2429 ____napi_schedule(sd, &sd->backlog);
dee42870 2430 sd->received_rps++;
0a9627f2 2431}
e36fa2f7 2432
fec5e652 2433#endif /* CONFIG_RPS */
0a9627f2 2434
e36fa2f7
ED
2435/*
2436 * Check if this softnet_data structure belongs to another cpu.
 2437 * If so, queue it on our IPI list and return 1;
 2438 * otherwise return 0.
2439 */
2440static int rps_ipi_queued(struct softnet_data *sd)
2441{
2442#ifdef CONFIG_RPS
2443 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2444
2445 if (sd != mysd) {
2446 sd->rps_ipi_next = mysd->rps_ipi_list;
2447 mysd->rps_ipi_list = sd;
2448
2449 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2450 return 1;
2451 }
2452#endif /* CONFIG_RPS */
2453 return 0;
2454}
2455
0a9627f2
TH
2456/*
2457 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2458 * queue (may be a remote CPU queue).
2459 */
fec5e652
TH
2460static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2461 unsigned int *qtail)
0a9627f2 2462{
e36fa2f7 2463 struct softnet_data *sd;
0a9627f2
TH
2464 unsigned long flags;
2465
e36fa2f7 2466 sd = &per_cpu(softnet_data, cpu);
0a9627f2
TH
2467
2468 local_irq_save(flags);
0a9627f2 2469
e36fa2f7 2470 rps_lock(sd);
6e7676c1
CG
2471 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2472 if (skb_queue_len(&sd->input_pkt_queue)) {
0a9627f2 2473enqueue:
e36fa2f7 2474 __skb_queue_tail(&sd->input_pkt_queue, skb);
76cc8b13 2475 input_queue_tail_incr_save(sd, qtail);
e36fa2f7 2476 rps_unlock(sd);
152102c7 2477 local_irq_restore(flags);
0a9627f2
TH
2478 return NET_RX_SUCCESS;
2479 }
2480
ebda37c2
ED
2481 /* Schedule NAPI for backlog device
2482 * We can use non atomic operation since we own the queue lock
2483 */
2484 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
e36fa2f7 2485 if (!rps_ipi_queued(sd))
eecfd7c4 2486 ____napi_schedule(sd, &sd->backlog);
0a9627f2
TH
2487 }
2488 goto enqueue;
2489 }
2490
dee42870 2491 sd->dropped++;
e36fa2f7 2492 rps_unlock(sd);
0a9627f2 2493
0a9627f2
TH
2494 local_irq_restore(flags);
2495
2496 kfree_skb(skb);
2497 return NET_RX_DROP;
2498}
1da177e4 2499
1da177e4
LT
2500/**
2501 * netif_rx - post buffer to the network code
2502 * @skb: buffer to post
2503 *
2504 * This function receives a packet from a device driver and queues it for
2505 * the upper (protocol) levels to process. It always succeeds. The buffer
2506 * may be dropped during processing for congestion control or by the
2507 * protocol layers.
2508 *
2509 * return values:
2510 * NET_RX_SUCCESS (no congestion)
1da177e4
LT
2511 * NET_RX_DROP (packet was dropped)
2512 *
2513 */
2514
2515int netif_rx(struct sk_buff *skb)
2516{
b0e28f1e 2517 int ret;
1da177e4
LT
2518
2519 /* if netpoll wants it, pretend we never saw it */
2520 if (netpoll_rx(skb))
2521 return NET_RX_DROP;
2522
3b098e2d
ED
2523 if (netdev_tstamp_prequeue)
2524 net_timestamp_check(skb);
1da177e4 2525
df334545 2526#ifdef CONFIG_RPS
b0e28f1e 2527 {
fec5e652 2528 struct rps_dev_flow voidflow, *rflow = &voidflow;
b0e28f1e
ED
2529 int cpu;
2530
cece1945 2531 preempt_disable();
b0e28f1e 2532 rcu_read_lock();
fec5e652
TH
2533
2534 cpu = get_rps_cpu(skb->dev, skb, &rflow);
b0e28f1e
ED
2535 if (cpu < 0)
2536 cpu = smp_processor_id();
fec5e652
TH
2537
2538 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2539
b0e28f1e 2540 rcu_read_unlock();
cece1945 2541 preempt_enable();
b0e28f1e 2542 }
1e94d72f 2543#else
fec5e652
TH
2544 {
2545 unsigned int qtail;
2546 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2547 put_cpu();
2548 }
1e94d72f 2549#endif
b0e28f1e 2550 return ret;
1da177e4 2551}
d1b19dff 2552EXPORT_SYMBOL(netif_rx);
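/*
 * Illustrative sketch (not part of the original file): how a simple non-NAPI
 * driver's receive interrupt might feed a frame into the stack via netif_rx().
 * struct my_priv, priv->netdev and my_copy_frame() are hypothetical driver
 * details.
 */
#if 0	/* example only */
static void my_legacy_rx(struct my_priv *priv, unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(priv->netdev, len);
	if (!skb) {
		priv->netdev->stats.rx_dropped++;
		return;
	}

	my_copy_frame(priv, skb_put(skb, len));	/* copy frame out of the NIC */
	skb->protocol = eth_type_trans(skb, priv->netdev);

	netif_rx(skb);	/* queue to the per-cpu backlog; safe from hard irq */
}
#endif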
1da177e4
LT
2553
2554int netif_rx_ni(struct sk_buff *skb)
2555{
2556 int err;
2557
2558 preempt_disable();
2559 err = netif_rx(skb);
2560 if (local_softirq_pending())
2561 do_softirq();
2562 preempt_enable();
2563
2564 return err;
2565}
1da177e4
LT
2566EXPORT_SYMBOL(netif_rx_ni);
2567
1da177e4
LT
2568static void net_tx_action(struct softirq_action *h)
2569{
2570 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2571
2572 if (sd->completion_queue) {
2573 struct sk_buff *clist;
2574
2575 local_irq_disable();
2576 clist = sd->completion_queue;
2577 sd->completion_queue = NULL;
2578 local_irq_enable();
2579
2580 while (clist) {
2581 struct sk_buff *skb = clist;
2582 clist = clist->next;
2583
547b792c 2584 WARN_ON(atomic_read(&skb->users));
1da177e4
LT
2585 __kfree_skb(skb);
2586 }
2587 }
2588
2589 if (sd->output_queue) {
37437bb2 2590 struct Qdisc *head;
1da177e4
LT
2591
2592 local_irq_disable();
2593 head = sd->output_queue;
2594 sd->output_queue = NULL;
a9cbd588 2595 sd->output_queue_tailp = &sd->output_queue;
1da177e4
LT
2596 local_irq_enable();
2597
2598 while (head) {
37437bb2
DM
2599 struct Qdisc *q = head;
2600 spinlock_t *root_lock;
2601
1da177e4
LT
2602 head = head->next_sched;
2603
5fb66229 2604 root_lock = qdisc_lock(q);
37437bb2 2605 if (spin_trylock(root_lock)) {
def82a1d
JP
2606 smp_mb__before_clear_bit();
2607 clear_bit(__QDISC_STATE_SCHED,
2608 &q->state);
37437bb2
DM
2609 qdisc_run(q);
2610 spin_unlock(root_lock);
1da177e4 2611 } else {
195648bb 2612 if (!test_bit(__QDISC_STATE_DEACTIVATED,
e8a83e10 2613 &q->state)) {
195648bb 2614 __netif_reschedule(q);
e8a83e10
JP
2615 } else {
2616 smp_mb__before_clear_bit();
2617 clear_bit(__QDISC_STATE_SCHED,
2618 &q->state);
2619 }
1da177e4
LT
2620 }
2621 }
2622 }
2623}
2624
6f05f629
SH
2625static inline int deliver_skb(struct sk_buff *skb,
2626 struct packet_type *pt_prev,
2627 struct net_device *orig_dev)
1da177e4
LT
2628{
2629 atomic_inc(&skb->users);
f2ccd8fa 2630 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1da177e4
LT
2631}
2632
ab95bfe0
JP
2633#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2634 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
da678292
MM
2635/* This hook is defined here for ATM LANE */
2636int (*br_fdb_test_addr_hook)(struct net_device *dev,
2637 unsigned char *addr) __read_mostly;
4fb019a0 2638EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
da678292 2639#endif
1da177e4 2640
1da177e4
LT
2641#ifdef CONFIG_NET_CLS_ACT
2642/* TODO: Maybe we should just force sch_ingress to be compiled in
 2643 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
 2644 * instructions (a compare and two extra stores) whenever it is not
 2645 * loaded but CONFIG_NET_CLS_ACT is enabled.
4ec93edb 2646 * NOTE: This doesn't stop any functionality; if you don't have
1da177e4
LT
 2647 * the ingress scheduler, you just can't add policies on ingress.
2648 *
2649 */
4ec93edb 2650static int ing_filter(struct sk_buff *skb)
1da177e4 2651{
1da177e4 2652 struct net_device *dev = skb->dev;
f697c3e8 2653 u32 ttl = G_TC_RTTL(skb->tc_verd);
555353cf
DM
2654 struct netdev_queue *rxq;
2655 int result = TC_ACT_OK;
2656 struct Qdisc *q;
4ec93edb 2657
de384830
SH
2658 if (unlikely(MAX_RED_LOOP < ttl++)) {
2659 if (net_ratelimit())
2660 pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
2661 skb->skb_iif, dev->ifindex);
f697c3e8
HX
2662 return TC_ACT_SHOT;
2663 }
1da177e4 2664
f697c3e8
HX
2665 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2666 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
1da177e4 2667
555353cf
DM
2668 rxq = &dev->rx_queue;
2669
83874000 2670 q = rxq->qdisc;
8d50b53d 2671 if (q != &noop_qdisc) {
83874000 2672 spin_lock(qdisc_lock(q));
a9312ae8
DM
2673 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2674 result = qdisc_enqueue_root(skb, q);
83874000
DM
2675 spin_unlock(qdisc_lock(q));
2676 }
f697c3e8
HX
2677
2678 return result;
2679}
86e65da9 2680
f697c3e8
HX
2681static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2682 struct packet_type **pt_prev,
2683 int *ret, struct net_device *orig_dev)
2684{
8d50b53d 2685 if (skb->dev->rx_queue.qdisc == &noop_qdisc)
f697c3e8 2686 goto out;
1da177e4 2687
f697c3e8
HX
2688 if (*pt_prev) {
2689 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2690 *pt_prev = NULL;
1da177e4
LT
2691 }
2692
f697c3e8
HX
2693 switch (ing_filter(skb)) {
2694 case TC_ACT_SHOT:
2695 case TC_ACT_STOLEN:
2696 kfree_skb(skb);
2697 return NULL;
2698 }
2699
2700out:
2701 skb->tc_verd = 0;
2702 return skb;
1da177e4
LT
2703}
2704#endif
2705
bc1d0411
PM
2706/*
2707 * netif_nit_deliver - deliver received packets to network taps
2708 * @skb: buffer
2709 *
2710 * This function is used to deliver incoming packets to network
2711 * taps. It should be used when the normal netif_receive_skb path
2712 * is bypassed, for example because of VLAN acceleration.
2713 */
2714void netif_nit_deliver(struct sk_buff *skb)
2715{
2716 struct packet_type *ptype;
2717
2718 if (list_empty(&ptype_all))
2719 return;
2720
2721 skb_reset_network_header(skb);
2722 skb_reset_transport_header(skb);
2723 skb->mac_len = skb->network_header - skb->mac_header;
2724
2725 rcu_read_lock();
2726 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2727 if (!ptype->dev || ptype->dev == skb->dev)
2728 deliver_skb(skb, ptype, skb->dev);
2729 }
2730 rcu_read_unlock();
2731}
2732
ab95bfe0
JP
2733/**
2734 * netdev_rx_handler_register - register receive handler
2735 * @dev: device to register a handler for
2736 * @rx_handler: receive handler to register
93e2c32b 2737 * @rx_handler_data: data pointer that is used by rx handler
ab95bfe0
JP
2738 *
 2739 * Register a receive handler for a device. This handler will then be
2740 * called from __netif_receive_skb. A negative errno code is returned
2741 * on a failure.
2742 *
2743 * The caller must hold the rtnl_mutex.
2744 */
2745int netdev_rx_handler_register(struct net_device *dev,
93e2c32b
JP
2746 rx_handler_func_t *rx_handler,
2747 void *rx_handler_data)
ab95bfe0
JP
2748{
2749 ASSERT_RTNL();
2750
2751 if (dev->rx_handler)
2752 return -EBUSY;
2753
93e2c32b 2754 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
ab95bfe0
JP
2755 rcu_assign_pointer(dev->rx_handler, rx_handler);
2756
2757 return 0;
2758}
2759EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
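/*
 * Illustrative sketch (not part of the original file): how a bridge/macvlan
 * style virtual device might claim a port's receive path with the rx_handler
 * hook registered above.  struct my_port, my_port_rx() and my_deliver() are
 * hypothetical.
 */
#if 0	/* example only */
static struct sk_buff *my_port_rx(struct sk_buff *skb)
{
	struct my_port *port = rcu_dereference(skb->dev->rx_handler_data);

	if (my_deliver(port, skb))	/* hypothetical: consumed the skb */
		return NULL;		/* __netif_receive_skb() stops here */

	return skb;			/* let normal processing continue */
}

static int my_port_attach(struct net_device *dev, struct my_port *port)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(dev, my_port_rx, port);
	rtnl_unlock();

	return err;
}
#endif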
2760
2761/**
2762 * netdev_rx_handler_unregister - unregister receive handler
2763 * @dev: device to unregister a handler from
2764 *
 2765 * Unregister a receive handler from a device.
2766 *
2767 * The caller must hold the rtnl_mutex.
2768 */
2769void netdev_rx_handler_unregister(struct net_device *dev)
2770{
2771
2772 ASSERT_RTNL();
2773 rcu_assign_pointer(dev->rx_handler, NULL);
93e2c32b 2774 rcu_assign_pointer(dev->rx_handler_data, NULL);
ab95bfe0
JP
2775}
2776EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2777
acbbc071
ED
2778static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2779 struct net_device *master)
2780{
2781 if (skb->pkt_type == PACKET_HOST) {
2782 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2783
2784 memcpy(dest, master->dev_addr, ETH_ALEN);
2785 }
2786}
2787
2788/* On bonding slaves other than the currently active slave, suppress
2789 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2790 * ARP on active-backup slaves with arp_validate enabled.
2791 */
2792int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2793{
2794 struct net_device *dev = skb->dev;
2795
2796 if (master->priv_flags & IFF_MASTER_ARPMON)
2797 dev->last_rx = jiffies;
2798
f350a0a8
JP
2799 if ((master->priv_flags & IFF_MASTER_ALB) &&
2800 (master->priv_flags & IFF_BRIDGE_PORT)) {
acbbc071
ED
2801 /* Do address unmangle. The local destination address
2802 * will be always the one master has. Provides the right
2803 * functionality in a bridge.
2804 */
2805 skb_bond_set_mac_by_master(skb, master);
2806 }
2807
2808 if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2809 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2810 skb->protocol == __cpu_to_be16(ETH_P_ARP))
2811 return 0;
2812
2813 if (master->priv_flags & IFF_MASTER_ALB) {
2814 if (skb->pkt_type != PACKET_BROADCAST &&
2815 skb->pkt_type != PACKET_MULTICAST)
2816 return 0;
2817 }
2818 if (master->priv_flags & IFF_MASTER_8023AD &&
2819 skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2820 return 0;
2821
2822 return 1;
2823 }
2824 return 0;
2825}
2826EXPORT_SYMBOL(__skb_bond_should_drop);
2827
10f744d2 2828static int __netif_receive_skb(struct sk_buff *skb)
1da177e4
LT
2829{
2830 struct packet_type *ptype, *pt_prev;
ab95bfe0 2831 rx_handler_func_t *rx_handler;
f2ccd8fa 2832 struct net_device *orig_dev;
0641e4fb 2833 struct net_device *master;
0d7a3681 2834 struct net_device *null_or_orig;
2df4a0fa 2835 struct net_device *orig_or_bond;
1da177e4 2836 int ret = NET_RX_DROP;
252e3346 2837 __be16 type;
1da177e4 2838
3b098e2d
ED
2839 if (!netdev_tstamp_prequeue)
2840 net_timestamp_check(skb);
81bbb3d4 2841
05532121
CG
2842 if (vlan_tx_tag_present(skb))
2843 vlan_hwaccel_do_receive(skb);
9b22ea56 2844
1da177e4 2845 /* if we've gotten here through NAPI, check netpoll */
bea3348e 2846 if (netpoll_receive_skb(skb))
1da177e4
LT
2847 return NET_RX_DROP;
2848
8964be4a
ED
2849 if (!skb->skb_iif)
2850 skb->skb_iif = skb->dev->ifindex;
86e65da9 2851
597a264b
JF
2852 /*
2853 * bonding note: skbs received on inactive slaves should only
2854 * be delivered to pkt handlers that are exact matches. Also
2855 * the deliver_no_wcard flag will be set. If packet handlers
2856 * are sensitive to duplicate packets these skbs will need to
2857 * be dropped at the handler. The vlan accel path may have
2858 * already set the deliver_no_wcard flag.
2859 */
0d7a3681 2860 null_or_orig = NULL;
cc9bd5ce 2861 orig_dev = skb->dev;
0641e4fb 2862 master = ACCESS_ONCE(orig_dev->master);
597a264b
JF
2863 if (skb->deliver_no_wcard)
2864 null_or_orig = orig_dev;
2865 else if (master) {
2866 if (skb_bond_should_drop(skb, master)) {
2867 skb->deliver_no_wcard = 1;
0d7a3681 2868 null_or_orig = orig_dev; /* deliver only exact match */
597a264b 2869 } else
0641e4fb 2870 skb->dev = master;
cc9bd5ce 2871 }
8f903c70 2872
27f39c73 2873 __this_cpu_inc(softnet_data.processed);
c1d2bbe1 2874 skb_reset_network_header(skb);
badff6d0 2875 skb_reset_transport_header(skb);
b0e380b1 2876 skb->mac_len = skb->network_header - skb->mac_header;
1da177e4
LT
2877
2878 pt_prev = NULL;
2879
2880 rcu_read_lock();
2881
2882#ifdef CONFIG_NET_CLS_ACT
2883 if (skb->tc_verd & TC_NCLS) {
2884 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2885 goto ncls;
2886 }
2887#endif
2888
2889 list_for_each_entry_rcu(ptype, &ptype_all, list) {
f982307f
JE
2890 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2891 ptype->dev == orig_dev) {
4ec93edb 2892 if (pt_prev)
f2ccd8fa 2893 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
2894 pt_prev = ptype;
2895 }
2896 }
2897
2898#ifdef CONFIG_NET_CLS_ACT
f697c3e8
HX
2899 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2900 if (!skb)
1da177e4 2901 goto out;
1da177e4
LT
2902ncls:
2903#endif
2904
ab95bfe0
JP
2905 /* Handle special case of bridge or macvlan */
2906 rx_handler = rcu_dereference(skb->dev->rx_handler);
2907 if (rx_handler) {
2908 if (pt_prev) {
2909 ret = deliver_skb(skb, pt_prev, orig_dev);
2910 pt_prev = NULL;
2911 }
2912 skb = rx_handler(skb);
2913 if (!skb)
2914 goto out;
2915 }
1da177e4 2916
1f3c8804
AG
2917 /*
2918 * Make sure frames received on VLAN interfaces stacked on
2919 * bonding interfaces still make their way to any base bonding
2920 * device that may have registered for a specific ptype. The
2921 * handler may have to adjust skb->dev and orig_dev.
1f3c8804 2922 */
2df4a0fa 2923 orig_or_bond = orig_dev;
1f3c8804
AG
2924 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
2925 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
2df4a0fa 2926 orig_or_bond = vlan_dev_real_dev(skb->dev);
1f3c8804
AG
2927 }
2928
1da177e4 2929 type = skb->protocol;
82d8a867
PE
2930 list_for_each_entry_rcu(ptype,
2931 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1f3c8804 2932 if (ptype->type == type && (ptype->dev == null_or_orig ||
ca8d9ea3 2933 ptype->dev == skb->dev || ptype->dev == orig_dev ||
2df4a0fa 2934 ptype->dev == orig_or_bond)) {
4ec93edb 2935 if (pt_prev)
f2ccd8fa 2936 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
2937 pt_prev = ptype;
2938 }
2939 }
2940
2941 if (pt_prev) {
f2ccd8fa 2942 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1da177e4
LT
2943 } else {
2944 kfree_skb(skb);
 2945 /* Jamal, now you will not be able to escape explaining
 2946 * to me how you were going to use this. :-)
2947 */
2948 ret = NET_RX_DROP;
2949 }
2950
2951out:
2952 rcu_read_unlock();
2953 return ret;
2954}
0a9627f2
TH
2955
2956/**
2957 * netif_receive_skb - process receive buffer from network
2958 * @skb: buffer to process
2959 *
2960 * netif_receive_skb() is the main receive data processing function.
2961 * It always succeeds. The buffer may be dropped during processing
2962 * for congestion control or by the protocol layers.
2963 *
2964 * This function may only be called from softirq context and interrupts
2965 * should be enabled.
2966 *
2967 * Return values (usually ignored):
2968 * NET_RX_SUCCESS: no congestion
2969 * NET_RX_DROP: packet was dropped
2970 */
2971int netif_receive_skb(struct sk_buff *skb)
2972{
3b098e2d
ED
2973 if (netdev_tstamp_prequeue)
2974 net_timestamp_check(skb);
2975
c1f19b51
RC
2976 if (skb_defer_rx_timestamp(skb))
2977 return NET_RX_SUCCESS;
2978
df334545 2979#ifdef CONFIG_RPS
3b098e2d
ED
2980 {
2981 struct rps_dev_flow voidflow, *rflow = &voidflow;
2982 int cpu, ret;
fec5e652 2983
3b098e2d
ED
2984 rcu_read_lock();
2985
2986 cpu = get_rps_cpu(skb->dev, skb, &rflow);
0a9627f2 2987
3b098e2d
ED
2988 if (cpu >= 0) {
2989 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2990 rcu_read_unlock();
2991 } else {
2992 rcu_read_unlock();
2993 ret = __netif_receive_skb(skb);
2994 }
0a9627f2 2995
3b098e2d 2996 return ret;
fec5e652 2997 }
1e94d72f
TH
2998#else
2999 return __netif_receive_skb(skb);
3000#endif
0a9627f2 3001}
d1b19dff 3002EXPORT_SYMBOL(netif_receive_skb);
1da177e4 3003
88751275
ED
3004/* Network device is going away, flush any packets still pending
3005 * Called with irqs disabled.
3006 */
152102c7 3007static void flush_backlog(void *arg)
6e583ce5 3008{
152102c7 3009 struct net_device *dev = arg;
e36fa2f7 3010 struct softnet_data *sd = &__get_cpu_var(softnet_data);
6e583ce5
SH
3011 struct sk_buff *skb, *tmp;
3012
e36fa2f7 3013 rps_lock(sd);
6e7676c1 3014 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
6e583ce5 3015 if (skb->dev == dev) {
e36fa2f7 3016 __skb_unlink(skb, &sd->input_pkt_queue);
6e583ce5 3017 kfree_skb(skb);
76cc8b13 3018 input_queue_head_incr(sd);
6e583ce5 3019 }
6e7676c1 3020 }
e36fa2f7 3021 rps_unlock(sd);
6e7676c1
CG
3022
3023 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3024 if (skb->dev == dev) {
3025 __skb_unlink(skb, &sd->process_queue);
3026 kfree_skb(skb);
76cc8b13 3027 input_queue_head_incr(sd);
6e7676c1
CG
3028 }
3029 }
6e583ce5
SH
3030}
3031
d565b0a1
HX
3032static int napi_gro_complete(struct sk_buff *skb)
3033{
3034 struct packet_type *ptype;
3035 __be16 type = skb->protocol;
3036 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3037 int err = -ENOENT;
3038
fc59f9a3
HX
3039 if (NAPI_GRO_CB(skb)->count == 1) {
3040 skb_shinfo(skb)->gso_size = 0;
d565b0a1 3041 goto out;
fc59f9a3 3042 }
d565b0a1
HX
3043
3044 rcu_read_lock();
3045 list_for_each_entry_rcu(ptype, head, list) {
3046 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3047 continue;
3048
3049 err = ptype->gro_complete(skb);
3050 break;
3051 }
3052 rcu_read_unlock();
3053
3054 if (err) {
3055 WARN_ON(&ptype->list == head);
3056 kfree_skb(skb);
3057 return NET_RX_SUCCESS;
3058 }
3059
3060out:
d565b0a1
HX
3061 return netif_receive_skb(skb);
3062}
3063
86cac58b 3064inline void napi_gro_flush(struct napi_struct *napi)
d565b0a1
HX
3065{
3066 struct sk_buff *skb, *next;
3067
3068 for (skb = napi->gro_list; skb; skb = next) {
3069 next = skb->next;
3070 skb->next = NULL;
3071 napi_gro_complete(skb);
3072 }
3073
4ae5544f 3074 napi->gro_count = 0;
d565b0a1
HX
3075 napi->gro_list = NULL;
3076}
86cac58b 3077EXPORT_SYMBOL(napi_gro_flush);
d565b0a1 3078
5b252f0c 3079enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
d565b0a1
HX
3080{
3081 struct sk_buff **pp = NULL;
3082 struct packet_type *ptype;
3083 __be16 type = skb->protocol;
3084 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
0da2afd5 3085 int same_flow;
d565b0a1 3086 int mac_len;
5b252f0c 3087 enum gro_result ret;
d565b0a1 3088
ce9e76c8 3089 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
d565b0a1
HX
3090 goto normal;
3091
21dc3301 3092 if (skb_is_gso(skb) || skb_has_frag_list(skb))
f17f5c91
HX
3093 goto normal;
3094
d565b0a1
HX
3095 rcu_read_lock();
3096 list_for_each_entry_rcu(ptype, head, list) {
d565b0a1
HX
3097 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3098 continue;
3099
86911732 3100 skb_set_network_header(skb, skb_gro_offset(skb));
d565b0a1
HX
3101 mac_len = skb->network_header - skb->mac_header;
3102 skb->mac_len = mac_len;
3103 NAPI_GRO_CB(skb)->same_flow = 0;
3104 NAPI_GRO_CB(skb)->flush = 0;
5d38a079 3105 NAPI_GRO_CB(skb)->free = 0;
d565b0a1 3106
d565b0a1
HX
3107 pp = ptype->gro_receive(&napi->gro_list, skb);
3108 break;
3109 }
3110 rcu_read_unlock();
3111
3112 if (&ptype->list == head)
3113 goto normal;
3114
0da2afd5 3115 same_flow = NAPI_GRO_CB(skb)->same_flow;
5d0d9be8 3116 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
0da2afd5 3117
d565b0a1
HX
3118 if (pp) {
3119 struct sk_buff *nskb = *pp;
3120
3121 *pp = nskb->next;
3122 nskb->next = NULL;
3123 napi_gro_complete(nskb);
4ae5544f 3124 napi->gro_count--;
d565b0a1
HX
3125 }
3126
0da2afd5 3127 if (same_flow)
d565b0a1
HX
3128 goto ok;
3129
4ae5544f 3130 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
d565b0a1 3131 goto normal;
d565b0a1 3132
4ae5544f 3133 napi->gro_count++;
d565b0a1 3134 NAPI_GRO_CB(skb)->count = 1;
86911732 3135 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
d565b0a1
HX
3136 skb->next = napi->gro_list;
3137 napi->gro_list = skb;
5d0d9be8 3138 ret = GRO_HELD;
d565b0a1 3139
ad0f9904 3140pull:
cb18978c
HX
3141 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3142 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3143
3144 BUG_ON(skb->end - skb->tail < grow);
3145
3146 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3147
3148 skb->tail += grow;
3149 skb->data_len -= grow;
3150
3151 skb_shinfo(skb)->frags[0].page_offset += grow;
3152 skb_shinfo(skb)->frags[0].size -= grow;
3153
3154 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3155 put_page(skb_shinfo(skb)->frags[0].page);
3156 memmove(skb_shinfo(skb)->frags,
3157 skb_shinfo(skb)->frags + 1,
e5093aec 3158 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
cb18978c 3159 }
ad0f9904
HX
3160 }
3161
d565b0a1 3162ok:
5d0d9be8 3163 return ret;
d565b0a1
HX
3164
3165normal:
ad0f9904
HX
3166 ret = GRO_NORMAL;
3167 goto pull;
5d38a079 3168}
96e93eab
HX
3169EXPORT_SYMBOL(dev_gro_receive);
3170
40d0802b 3171static inline gro_result_t
5b252f0c 3172__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
96e93eab
HX
3173{
3174 struct sk_buff *p;
3175
3176 for (p = napi->gro_list; p; p = p->next) {
40d0802b
ED
3177 unsigned long diffs;
3178
3179 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3180 diffs |= compare_ether_header(skb_mac_header(p),
f64f9e71 3181 skb_gro_mac_header(skb));
40d0802b 3182 NAPI_GRO_CB(p)->same_flow = !diffs;
96e93eab
HX
3183 NAPI_GRO_CB(p)->flush = 0;
3184 }
3185
3186 return dev_gro_receive(napi, skb);
3187}
5d38a079 3188
c7c4b3b6 3189gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5d38a079 3190{
5d0d9be8
HX
3191 switch (ret) {
3192 case GRO_NORMAL:
c7c4b3b6
BH
3193 if (netif_receive_skb(skb))
3194 ret = GRO_DROP;
3195 break;
5d38a079 3196
5d0d9be8 3197 case GRO_DROP:
5d0d9be8 3198 case GRO_MERGED_FREE:
5d38a079
HX
3199 kfree_skb(skb);
3200 break;
5b252f0c
BH
3201
3202 case GRO_HELD:
3203 case GRO_MERGED:
3204 break;
5d38a079
HX
3205 }
3206
c7c4b3b6 3207 return ret;
5d0d9be8
HX
3208}
3209EXPORT_SYMBOL(napi_skb_finish);
3210
78a478d0
HX
3211void skb_gro_reset_offset(struct sk_buff *skb)
3212{
3213 NAPI_GRO_CB(skb)->data_offset = 0;
3214 NAPI_GRO_CB(skb)->frag0 = NULL;
7489594c 3215 NAPI_GRO_CB(skb)->frag0_len = 0;
78a478d0 3216
78d3fd0b 3217 if (skb->mac_header == skb->tail &&
7489594c 3218 !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
78a478d0
HX
3219 NAPI_GRO_CB(skb)->frag0 =
3220 page_address(skb_shinfo(skb)->frags[0].page) +
3221 skb_shinfo(skb)->frags[0].page_offset;
7489594c
HX
3222 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3223 }
78a478d0
HX
3224}
3225EXPORT_SYMBOL(skb_gro_reset_offset);
3226
c7c4b3b6 3227gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5d0d9be8 3228{
86911732
HX
3229 skb_gro_reset_offset(skb);
3230
5d0d9be8 3231 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
d565b0a1
HX
3232}
3233EXPORT_SYMBOL(napi_gro_receive);
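/*
 * Illustrative sketch (not part of the original file): a GRO-aware driver
 * simply substitutes napi_gro_receive() for netif_receive_skb() in its
 * ->poll() routine, so receive aggregation happens transparently.  struct
 * my_priv, my_fetch_rx_skb() and my_enable_rx_irq() are hypothetical; the
 * registration half of this skeleton appears after netif_napi_add() below.
 */
#if 0	/* example only */
static int my_poll(struct napi_struct *napi, int budget)
{
	struct my_priv *priv = container_of(napi, struct my_priv, napi);
	int work_done = 0;

	while (work_done < budget) {
		struct sk_buff *skb = my_fetch_rx_skb(priv);	/* hypothetical */

		if (!skb)
			break;
		skb->protocol = eth_type_trans(skb, priv->netdev);
		napi_gro_receive(napi, skb);	/* GRO-aware delivery */
		work_done++;
	}

	if (work_done < budget) {
		napi_complete(napi);
		my_enable_rx_irq(priv);		/* hypothetical: re-arm the irq */
	}
	return work_done;
}
#endif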
3234
96e93eab
HX
3235void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3236{
96e93eab
HX
3237 __skb_pull(skb, skb_headlen(skb));
3238 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3239
3240 napi->skb = skb;
3241}
3242EXPORT_SYMBOL(napi_reuse_skb);
3243
76620aaf 3244struct sk_buff *napi_get_frags(struct napi_struct *napi)
5d38a079 3245{
5d38a079 3246 struct sk_buff *skb = napi->skb;
5d38a079
HX
3247
3248 if (!skb) {
89d71a66
ED
3249 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3250 if (skb)
3251 napi->skb = skb;
80595d59 3252 }
96e93eab
HX
3253 return skb;
3254}
76620aaf 3255EXPORT_SYMBOL(napi_get_frags);
96e93eab 3256
c7c4b3b6
BH
3257gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3258 gro_result_t ret)
96e93eab 3259{
5d0d9be8
HX
3260 switch (ret) {
3261 case GRO_NORMAL:
86911732 3262 case GRO_HELD:
e76b69cc 3263 skb->protocol = eth_type_trans(skb, skb->dev);
86911732 3264
c7c4b3b6
BH
3265 if (ret == GRO_HELD)
3266 skb_gro_pull(skb, -ETH_HLEN);
3267 else if (netif_receive_skb(skb))
3268 ret = GRO_DROP;
86911732 3269 break;
5d38a079 3270
5d0d9be8 3271 case GRO_DROP:
5d0d9be8
HX
3272 case GRO_MERGED_FREE:
3273 napi_reuse_skb(napi, skb);
3274 break;
5b252f0c
BH
3275
3276 case GRO_MERGED:
3277 break;
5d0d9be8 3278 }
5d38a079 3279
c7c4b3b6 3280 return ret;
5d38a079 3281}
5d0d9be8
HX
3282EXPORT_SYMBOL(napi_frags_finish);
3283
76620aaf
HX
3284struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3285{
3286 struct sk_buff *skb = napi->skb;
3287 struct ethhdr *eth;
a5b1cf28
HX
3288 unsigned int hlen;
3289 unsigned int off;
76620aaf
HX
3290
3291 napi->skb = NULL;
3292
3293 skb_reset_mac_header(skb);
3294 skb_gro_reset_offset(skb);
3295
a5b1cf28
HX
3296 off = skb_gro_offset(skb);
3297 hlen = off + sizeof(*eth);
3298 eth = skb_gro_header_fast(skb, off);
3299 if (skb_gro_header_hard(skb, hlen)) {
3300 eth = skb_gro_header_slow(skb, hlen, off);
3301 if (unlikely(!eth)) {
3302 napi_reuse_skb(napi, skb);
3303 skb = NULL;
3304 goto out;
3305 }
76620aaf
HX
3306 }
3307
3308 skb_gro_pull(skb, sizeof(*eth));
3309
3310 /*
3311 * This works because the only protocols we care about don't require
3312 * special handling. We'll fix it up properly at the end.
3313 */
3314 skb->protocol = eth->h_proto;
3315
3316out:
3317 return skb;
3318}
3319EXPORT_SYMBOL(napi_frags_skb);
3320
c7c4b3b6 3321gro_result_t napi_gro_frags(struct napi_struct *napi)
5d0d9be8 3322{
76620aaf 3323 struct sk_buff *skb = napi_frags_skb(napi);
5d0d9be8
HX
3324
3325 if (!skb)
c7c4b3b6 3326 return GRO_DROP;
5d0d9be8
HX
3327
3328 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3329}
5d38a079
HX
3330EXPORT_SYMBOL(napi_gro_frags);
3331
e326bed2
ED
3332/*
 3333 * net_rps_action sends any pending IPIs for rps.
3334 * Note: called with local irq disabled, but exits with local irq enabled.
3335 */
3336static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3337{
3338#ifdef CONFIG_RPS
3339 struct softnet_data *remsd = sd->rps_ipi_list;
3340
3341 if (remsd) {
3342 sd->rps_ipi_list = NULL;
3343
3344 local_irq_enable();
3345
3346 /* Send pending IPI's to kick RPS processing on remote cpus. */
3347 while (remsd) {
3348 struct softnet_data *next = remsd->rps_ipi_next;
3349
3350 if (cpu_online(remsd->cpu))
3351 __smp_call_function_single(remsd->cpu,
3352 &remsd->csd, 0);
3353 remsd = next;
3354 }
3355 } else
3356#endif
3357 local_irq_enable();
3358}
3359
bea3348e 3360static int process_backlog(struct napi_struct *napi, int quota)
1da177e4
LT
3361{
3362 int work = 0;
eecfd7c4 3363 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
1da177e4 3364
e326bed2
ED
3365#ifdef CONFIG_RPS
 3366 /* If we have pending IPIs, it is better to send them now
 3367 * rather than waiting for net_rx_action() to end.
3368 */
3369 if (sd->rps_ipi_list) {
3370 local_irq_disable();
3371 net_rps_action_and_irq_enable(sd);
3372 }
3373#endif
bea3348e 3374 napi->weight = weight_p;
6e7676c1
CG
3375 local_irq_disable();
3376 while (work < quota) {
1da177e4 3377 struct sk_buff *skb;
6e7676c1
CG
3378 unsigned int qlen;
3379
3380 while ((skb = __skb_dequeue(&sd->process_queue))) {
3381 local_irq_enable();
3382 __netif_receive_skb(skb);
6e7676c1 3383 local_irq_disable();
76cc8b13
TH
3384 input_queue_head_incr(sd);
3385 if (++work >= quota) {
3386 local_irq_enable();
3387 return work;
3388 }
6e7676c1 3389 }
1da177e4 3390
e36fa2f7 3391 rps_lock(sd);
6e7676c1 3392 qlen = skb_queue_len(&sd->input_pkt_queue);
76cc8b13 3393 if (qlen)
6e7676c1
CG
3394 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3395 &sd->process_queue);
76cc8b13 3396
6e7676c1 3397 if (qlen < quota - work) {
eecfd7c4
ED
3398 /*
3399 * Inline a custom version of __napi_complete().
 3400 * Only the current cpu owns and manipulates this napi,
 3401 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
 3402 * We can use a plain write instead of clear_bit(),
 3403 * and we don't need an smp_mb() memory barrier.
3404 */
3405 list_del(&napi->poll_list);
3406 napi->state = 0;
3407
6e7676c1 3408 quota = work + qlen;
bea3348e 3409 }
e36fa2f7 3410 rps_unlock(sd);
6e7676c1
CG
3411 }
3412 local_irq_enable();
1da177e4 3413
bea3348e
SH
3414 return work;
3415}
1da177e4 3416
bea3348e
SH
3417/**
3418 * __napi_schedule - schedule for receive
c4ea43c5 3419 * @n: entry to schedule
bea3348e
SH
3420 *
3421 * The entry's receive function will be scheduled to run
3422 */
b5606c2d 3423void __napi_schedule(struct napi_struct *n)
bea3348e
SH
3424{
3425 unsigned long flags;
1da177e4 3426
bea3348e 3427 local_irq_save(flags);
eecfd7c4 3428 ____napi_schedule(&__get_cpu_var(softnet_data), n);
bea3348e 3429 local_irq_restore(flags);
1da177e4 3430}
bea3348e
SH
3431EXPORT_SYMBOL(__napi_schedule);
3432
d565b0a1
HX
3433void __napi_complete(struct napi_struct *n)
3434{
3435 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3436 BUG_ON(n->gro_list);
3437
3438 list_del(&n->poll_list);
3439 smp_mb__before_clear_bit();
3440 clear_bit(NAPI_STATE_SCHED, &n->state);
3441}
3442EXPORT_SYMBOL(__napi_complete);
3443
3444void napi_complete(struct napi_struct *n)
3445{
3446 unsigned long flags;
3447
3448 /*
3449 * don't let napi dequeue from the cpu poll list
 3450 * just in case it's running on a different cpu
3451 */
3452 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3453 return;
3454
3455 napi_gro_flush(n);
3456 local_irq_save(flags);
3457 __napi_complete(n);
3458 local_irq_restore(flags);
3459}
3460EXPORT_SYMBOL(napi_complete);
3461
3462void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3463 int (*poll)(struct napi_struct *, int), int weight)
3464{
3465 INIT_LIST_HEAD(&napi->poll_list);
4ae5544f 3466 napi->gro_count = 0;
d565b0a1 3467 napi->gro_list = NULL;
5d38a079 3468 napi->skb = NULL;
d565b0a1
HX
3469 napi->poll = poll;
3470 napi->weight = weight;
3471 list_add(&napi->dev_list, &dev->napi_list);
d565b0a1 3472 napi->dev = dev;
5d38a079 3473#ifdef CONFIG_NETPOLL
d565b0a1
HX
3474 spin_lock_init(&napi->poll_lock);
3475 napi->poll_owner = -1;
3476#endif
3477 set_bit(NAPI_STATE_SCHED, &napi->state);
3478}
3479EXPORT_SYMBOL(netif_napi_add);
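/*
 * Illustrative sketch (not part of the original file): the registration half
 * of a NAPI driver, pairing with the my_poll() routine sketched after
 * napi_gro_receive() above.  struct my_priv, my_disable_rx_irq() and the
 * weight of 64 are hypothetical; the irq handler also needs linux/interrupt.h.
 */
#if 0	/* example only */
static irqreturn_t my_rx_irq(int irq, void *dev_id)
{
	struct my_priv *priv = dev_id;

	my_disable_rx_irq(priv);	/* hypothetical: quiesce the source */
	napi_schedule(&priv->napi);	/* my_poll() will run from softirq */
	return IRQ_HANDLED;
}

static void my_setup_napi(struct my_priv *priv)
{
	netif_napi_add(priv->netdev, &priv->napi, my_poll, 64);
	napi_enable(&priv->napi);	/* typically done from ndo_open */
}
#endif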
3480
3481void netif_napi_del(struct napi_struct *napi)
3482{
3483 struct sk_buff *skb, *next;
3484
d7b06636 3485 list_del_init(&napi->dev_list);
76620aaf 3486 napi_free_frags(napi);
d565b0a1
HX
3487
3488 for (skb = napi->gro_list; skb; skb = next) {
3489 next = skb->next;
3490 skb->next = NULL;
3491 kfree_skb(skb);
3492 }
3493
3494 napi->gro_list = NULL;
4ae5544f 3495 napi->gro_count = 0;
d565b0a1
HX
3496}
3497EXPORT_SYMBOL(netif_napi_del);
3498
1da177e4
LT
3499static void net_rx_action(struct softirq_action *h)
3500{
e326bed2 3501 struct softnet_data *sd = &__get_cpu_var(softnet_data);
24f8b238 3502 unsigned long time_limit = jiffies + 2;
51b0bded 3503 int budget = netdev_budget;
53fb95d3
MM
3504 void *have;
3505
1da177e4
LT
3506 local_irq_disable();
3507
e326bed2 3508 while (!list_empty(&sd->poll_list)) {
bea3348e
SH
3509 struct napi_struct *n;
3510 int work, weight;
1da177e4 3511
bea3348e 3512 /* If the softirq window is exhausted then punt.
24f8b238
SH
 3513 * Allow this to run for 2 jiffies, which allows
 3514 * an average latency of 1.5/HZ.
bea3348e 3515 */
24f8b238 3516 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
1da177e4
LT
3517 goto softnet_break;
3518
3519 local_irq_enable();
3520
bea3348e
SH
3521 /* Even though interrupts have been re-enabled, this
3522 * access is safe because interrupts can only add new
3523 * entries to the tail of this list, and only ->poll()
3524 * calls can remove this head entry from the list.
3525 */
e326bed2 3526 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
1da177e4 3527
bea3348e
SH
3528 have = netpoll_poll_lock(n);
3529
3530 weight = n->weight;
3531
0a7606c1
DM
3532 /* This NAPI_STATE_SCHED test is for avoiding a race
3533 * with netpoll's poll_napi(). Only the entity which
3534 * obtains the lock and sees NAPI_STATE_SCHED set will
3535 * actually make the ->poll() call. Therefore we avoid
 3536 * accidentally calling ->poll() when NAPI is not scheduled.
3537 */
3538 work = 0;
4ea7e386 3539 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
0a7606c1 3540 work = n->poll(n, weight);
4ea7e386
NH
3541 trace_napi_poll(n);
3542 }
bea3348e
SH
3543
3544 WARN_ON_ONCE(work > weight);
3545
3546 budget -= work;
3547
3548 local_irq_disable();
3549
3550 /* Drivers must not modify the NAPI state if they
3551 * consume the entire weight. In such cases this code
3552 * still "owns" the NAPI instance and therefore can
3553 * move the instance around on the list at-will.
3554 */
fed17f30 3555 if (unlikely(work == weight)) {
ff780cd8
HX
3556 if (unlikely(napi_disable_pending(n))) {
3557 local_irq_enable();
3558 napi_complete(n);
3559 local_irq_disable();
3560 } else
e326bed2 3561 list_move_tail(&n->poll_list, &sd->poll_list);
fed17f30 3562 }
bea3348e
SH
3563
3564 netpoll_poll_unlock(have);
1da177e4
LT
3565 }
3566out:
e326bed2 3567 net_rps_action_and_irq_enable(sd);
0a9627f2 3568
db217334
CL
3569#ifdef CONFIG_NET_DMA
3570 /*
3571 * There may not be any more sk_buffs coming right now, so push
3572 * any pending DMA copies to hardware
3573 */
2ba05622 3574 dma_issue_pending_all();
db217334 3575#endif
bea3348e 3576
1da177e4
LT
3577 return;
3578
3579softnet_break:
dee42870 3580 sd->time_squeeze++;
1da177e4
LT
3581 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3582 goto out;
3583}
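
/*
 * Illustrative sketch (not part of dev.c): a minimal driver poll handler
 * obeying the contract net_rx_action() enforces above.  Everything named
 * "foo" (struct foo_dev, foo_rx_one(), foo_enable_rx_irq(), FOO_NAPI_WEIGHT)
 * is hypothetical; netif_napi_add()/napi_complete() are the real APIs
 * shown in this file.
 */
static int foo_poll(struct napi_struct *napi, int budget)
{
	struct foo_dev *fp = container_of(napi, struct foo_dev, napi);
	int work_done = 0;

	/* Never process more than @budget packets in one call. */
	while (work_done < budget && foo_rx_one(fp))
		work_done++;

	/* Complete and re-enable the RX interrupt only when under budget;
	 * returning work_done == budget keeps us on the poll list. */
	if (work_done < budget) {
		napi_complete(napi);
		foo_enable_rx_irq(fp);
	}
	return work_done;
}
/* At probe time the driver would register this handler with
 *	netif_napi_add(netdev, &fp->napi, foo_poll, FOO_NAPI_WEIGHT);
 * and tear it down with netif_napi_del(&fp->napi) before freeing. */
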
3584
d1b19dff 3585static gifconf_func_t *gifconf_list[NPROTO];
1da177e4
LT
3586
3587/**
3588 * register_gifconf - register a SIOCGIF handler
3589 * @family: Address family
3590 * @gifconf: Function handler
3591 *
3592 * Register protocol dependent address dumping routines. The handler
3593 * that is passed must not be freed or reused until it has been replaced
3594 * by another handler.
3595 */
d1b19dff 3596int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
1da177e4
LT
3597{
3598 if (family >= NPROTO)
3599 return -EINVAL;
3600 gifconf_list[family] = gifconf;
3601 return 0;
3602}
d1b19dff 3603EXPORT_SYMBOL(register_gifconf);
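
/*
 * Illustrative sketch (not part of dev.c): how an address family can hook
 * into SIOCGIFCONF via register_gifconf().  AF_INET registers its own
 * handler this way; the one below is a hypothetical stand-in that reports
 * no addresses for any device, and PF_FOO is likewise made up.
 */
static int foo_gifconf(struct net_device *dev, char __user *buf, int len)
{
	/* A real handler copies one struct ifreq per address into buf
	 * (or, when buf is NULL, just returns the space it would need). */
	return 0;
}

static int __init foo_af_init(void)
{
	return register_gifconf(PF_FOO, foo_gifconf);
}
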
1da177e4
LT
3604
3605
3606/*
3607 * Map an interface index to its name (SIOCGIFNAME)
3608 */
3609
3610/*
3611 * We need this ioctl for efficient implementation of the
3612 * if_indextoname() function required by the IPv6 API. Without
3613 * it, we would have to search all the interfaces to find a
3614 * match. --pb
3615 */
3616
881d966b 3617static int dev_ifname(struct net *net, struct ifreq __user *arg)
1da177e4
LT
3618{
3619 struct net_device *dev;
3620 struct ifreq ifr;
3621
3622 /*
3623 * Fetch the caller's info block.
3624 */
3625
3626 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3627 return -EFAULT;
3628
fb699dfd
ED
3629 rcu_read_lock();
3630 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
1da177e4 3631 if (!dev) {
fb699dfd 3632 rcu_read_unlock();
1da177e4
LT
3633 return -ENODEV;
3634 }
3635
3636 strcpy(ifr.ifr_name, dev->name);
fb699dfd 3637 rcu_read_unlock();
1da177e4
LT
3638
3639 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3640 return -EFAULT;
3641 return 0;
3642}
3643
3644/*
3645 * Perform a SIOCGIFCONF call. This structure will change
3646 * size eventually, and there is nothing I can do about it.
3647 * Thus we will need a 'compatibility mode'.
3648 */
3649
881d966b 3650static int dev_ifconf(struct net *net, char __user *arg)
1da177e4
LT
3651{
3652 struct ifconf ifc;
3653 struct net_device *dev;
3654 char __user *pos;
3655 int len;
3656 int total;
3657 int i;
3658
3659 /*
3660 * Fetch the caller's info block.
3661 */
3662
3663 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3664 return -EFAULT;
3665
3666 pos = ifc.ifc_buf;
3667 len = ifc.ifc_len;
3668
3669 /*
3670 * Loop over the interfaces, and write an info block for each.
3671 */
3672
3673 total = 0;
881d966b 3674 for_each_netdev(net, dev) {
1da177e4
LT
3675 for (i = 0; i < NPROTO; i++) {
3676 if (gifconf_list[i]) {
3677 int done;
3678 if (!pos)
3679 done = gifconf_list[i](dev, NULL, 0);
3680 else
3681 done = gifconf_list[i](dev, pos + total,
3682 len - total);
3683 if (done < 0)
3684 return -EFAULT;
3685 total += done;
3686 }
3687 }
4ec93edb 3688 }
1da177e4
LT
3689
3690 /*
3691 * All done. Write the updated control block back to the caller.
3692 */
3693 ifc.ifc_len = total;
3694
3695 /*
3696 * Both BSD and Solaris return 0 here, so we do too.
3697 */
3698 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3699}
3700
3701#ifdef CONFIG_PROC_FS
3702/*
3703 * This is invoked by the /proc filesystem handler to display a device
3704 * in detail.
3705 */
7562f876 3706void *dev_seq_start(struct seq_file *seq, loff_t *pos)
c6d14c84 3707 __acquires(RCU)
1da177e4 3708{
e372c414 3709 struct net *net = seq_file_net(seq);
7562f876 3710 loff_t off;
1da177e4 3711 struct net_device *dev;
1da177e4 3712
c6d14c84 3713 rcu_read_lock();
7562f876
PE
3714 if (!*pos)
3715 return SEQ_START_TOKEN;
1da177e4 3716
7562f876 3717 off = 1;
c6d14c84 3718 for_each_netdev_rcu(net, dev)
7562f876
PE
3719 if (off++ == *pos)
3720 return dev;
1da177e4 3721
7562f876 3722 return NULL;
1da177e4
LT
3723}
3724
3725void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3726{
c6d14c84
ED
3727 struct net_device *dev = (v == SEQ_START_TOKEN) ?
3728 first_net_device(seq_file_net(seq)) :
3729 next_net_device((struct net_device *)v);
3730
1da177e4 3731 ++*pos;
c6d14c84 3732 return rcu_dereference(dev);
1da177e4
LT
3733}
3734
3735void dev_seq_stop(struct seq_file *seq, void *v)
c6d14c84 3736 __releases(RCU)
1da177e4 3737{
c6d14c84 3738 rcu_read_unlock();
1da177e4
LT
3739}
3740
3741static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3742{
28172739
ED
3743 struct rtnl_link_stats64 temp;
3744 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
1da177e4 3745
be1f3c2c
BH
3746 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3747 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
5a1b5898
RR
3748 dev->name, stats->rx_bytes, stats->rx_packets,
3749 stats->rx_errors,
3750 stats->rx_dropped + stats->rx_missed_errors,
3751 stats->rx_fifo_errors,
3752 stats->rx_length_errors + stats->rx_over_errors +
3753 stats->rx_crc_errors + stats->rx_frame_errors,
3754 stats->rx_compressed, stats->multicast,
3755 stats->tx_bytes, stats->tx_packets,
3756 stats->tx_errors, stats->tx_dropped,
3757 stats->tx_fifo_errors, stats->collisions,
3758 stats->tx_carrier_errors +
3759 stats->tx_aborted_errors +
3760 stats->tx_window_errors +
3761 stats->tx_heartbeat_errors,
3762 stats->tx_compressed);
1da177e4
LT
3763}
3764
3765/*
3766 * Called from the PROCfs module. This now uses the new arbitrary sized
3767 * /proc/net interface to create /proc/net/dev
3768 */
3769static int dev_seq_show(struct seq_file *seq, void *v)
3770{
3771 if (v == SEQ_START_TOKEN)
3772 seq_puts(seq, "Inter-| Receive "
3773 " | Transmit\n"
3774 " face |bytes packets errs drop fifo frame "
3775 "compressed multicast|bytes packets errs "
3776 "drop fifo colls carrier compressed\n");
3777 else
3778 dev_seq_printf_stats(seq, v);
3779 return 0;
3780}
3781
dee42870 3782static struct softnet_data *softnet_get_online(loff_t *pos)
1da177e4 3783{
dee42870 3784 struct softnet_data *sd = NULL;
1da177e4 3785
0c0b0aca 3786 while (*pos < nr_cpu_ids)
4ec93edb 3787 if (cpu_online(*pos)) {
dee42870 3788 sd = &per_cpu(softnet_data, *pos);
1da177e4
LT
3789 break;
3790 } else
3791 ++*pos;
dee42870 3792 return sd;
1da177e4
LT
3793}
3794
3795static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3796{
3797 return softnet_get_online(pos);
3798}
3799
3800static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3801{
3802 ++*pos;
3803 return softnet_get_online(pos);
3804}
3805
3806static void softnet_seq_stop(struct seq_file *seq, void *v)
3807{
3808}
3809
3810static int softnet_seq_show(struct seq_file *seq, void *v)
3811{
dee42870 3812 struct softnet_data *sd = v;
1da177e4 3813
0a9627f2 3814 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
dee42870 3815 sd->processed, sd->dropped, sd->time_squeeze, 0,
c1ebcdb8 3816 0, 0, 0, 0, /* was fastroute */
dee42870 3817 sd->cpu_collision, sd->received_rps);
1da177e4
LT
3818 return 0;
3819}
3820
f690808e 3821static const struct seq_operations dev_seq_ops = {
1da177e4
LT
3822 .start = dev_seq_start,
3823 .next = dev_seq_next,
3824 .stop = dev_seq_stop,
3825 .show = dev_seq_show,
3826};
3827
3828static int dev_seq_open(struct inode *inode, struct file *file)
3829{
e372c414
DL
3830 return seq_open_net(inode, file, &dev_seq_ops,
3831 sizeof(struct seq_net_private));
1da177e4
LT
3832}
3833
9a32144e 3834static const struct file_operations dev_seq_fops = {
1da177e4
LT
3835 .owner = THIS_MODULE,
3836 .open = dev_seq_open,
3837 .read = seq_read,
3838 .llseek = seq_lseek,
e372c414 3839 .release = seq_release_net,
1da177e4
LT
3840};
3841
f690808e 3842static const struct seq_operations softnet_seq_ops = {
1da177e4
LT
3843 .start = softnet_seq_start,
3844 .next = softnet_seq_next,
3845 .stop = softnet_seq_stop,
3846 .show = softnet_seq_show,
3847};
3848
3849static int softnet_seq_open(struct inode *inode, struct file *file)
3850{
3851 return seq_open(file, &softnet_seq_ops);
3852}
3853
9a32144e 3854static const struct file_operations softnet_seq_fops = {
1da177e4
LT
3855 .owner = THIS_MODULE,
3856 .open = softnet_seq_open,
3857 .read = seq_read,
3858 .llseek = seq_lseek,
3859 .release = seq_release,
3860};
3861
0e1256ff
SH
3862static void *ptype_get_idx(loff_t pos)
3863{
3864 struct packet_type *pt = NULL;
3865 loff_t i = 0;
3866 int t;
3867
3868 list_for_each_entry_rcu(pt, &ptype_all, list) {
3869 if (i == pos)
3870 return pt;
3871 ++i;
3872 }
3873
82d8a867 3874 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
0e1256ff
SH
3875 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3876 if (i == pos)
3877 return pt;
3878 ++i;
3879 }
3880 }
3881 return NULL;
3882}
3883
3884static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
72348a42 3885 __acquires(RCU)
0e1256ff
SH
3886{
3887 rcu_read_lock();
3888 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3889}
3890
3891static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3892{
3893 struct packet_type *pt;
3894 struct list_head *nxt;
3895 int hash;
3896
3897 ++*pos;
3898 if (v == SEQ_START_TOKEN)
3899 return ptype_get_idx(0);
3900
3901 pt = v;
3902 nxt = pt->list.next;
3903 if (pt->type == htons(ETH_P_ALL)) {
3904 if (nxt != &ptype_all)
3905 goto found;
3906 hash = 0;
3907 nxt = ptype_base[0].next;
3908 } else
82d8a867 3909 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
0e1256ff
SH
3910
3911 while (nxt == &ptype_base[hash]) {
82d8a867 3912 if (++hash >= PTYPE_HASH_SIZE)
0e1256ff
SH
3913 return NULL;
3914 nxt = ptype_base[hash].next;
3915 }
3916found:
3917 return list_entry(nxt, struct packet_type, list);
3918}
3919
3920static void ptype_seq_stop(struct seq_file *seq, void *v)
72348a42 3921 __releases(RCU)
0e1256ff
SH
3922{
3923 rcu_read_unlock();
3924}
3925
0e1256ff
SH
3926static int ptype_seq_show(struct seq_file *seq, void *v)
3927{
3928 struct packet_type *pt = v;
3929
3930 if (v == SEQ_START_TOKEN)
3931 seq_puts(seq, "Type Device Function\n");
c346dca1 3932 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
0e1256ff
SH
3933 if (pt->type == htons(ETH_P_ALL))
3934 seq_puts(seq, "ALL ");
3935 else
3936 seq_printf(seq, "%04x", ntohs(pt->type));
3937
908cd2da
AD
3938 seq_printf(seq, " %-8s %pF\n",
3939 pt->dev ? pt->dev->name : "", pt->func);
0e1256ff
SH
3940 }
3941
3942 return 0;
3943}
3944
3945static const struct seq_operations ptype_seq_ops = {
3946 .start = ptype_seq_start,
3947 .next = ptype_seq_next,
3948 .stop = ptype_seq_stop,
3949 .show = ptype_seq_show,
3950};
3951
3952static int ptype_seq_open(struct inode *inode, struct file *file)
3953{
2feb27db
PE
3954 return seq_open_net(inode, file, &ptype_seq_ops,
3955 sizeof(struct seq_net_private));
0e1256ff
SH
3956}
3957
3958static const struct file_operations ptype_seq_fops = {
3959 .owner = THIS_MODULE,
3960 .open = ptype_seq_open,
3961 .read = seq_read,
3962 .llseek = seq_lseek,
2feb27db 3963 .release = seq_release_net,
0e1256ff
SH
3964};
3965
3966
4665079c 3967static int __net_init dev_proc_net_init(struct net *net)
1da177e4
LT
3968{
3969 int rc = -ENOMEM;
3970
881d966b 3971 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
1da177e4 3972 goto out;
881d966b 3973 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
1da177e4 3974 goto out_dev;
881d966b 3975 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
457c4cbc 3976 goto out_softnet;
0e1256ff 3977
881d966b 3978 if (wext_proc_init(net))
457c4cbc 3979 goto out_ptype;
1da177e4
LT
3980 rc = 0;
3981out:
3982 return rc;
457c4cbc 3983out_ptype:
881d966b 3984 proc_net_remove(net, "ptype");
1da177e4 3985out_softnet:
881d966b 3986 proc_net_remove(net, "softnet_stat");
1da177e4 3987out_dev:
881d966b 3988 proc_net_remove(net, "dev");
1da177e4
LT
3989 goto out;
3990}
881d966b 3991
4665079c 3992static void __net_exit dev_proc_net_exit(struct net *net)
881d966b
EB
3993{
3994 wext_proc_exit(net);
3995
3996 proc_net_remove(net, "ptype");
3997 proc_net_remove(net, "softnet_stat");
3998 proc_net_remove(net, "dev");
3999}
4000
022cbae6 4001static struct pernet_operations __net_initdata dev_proc_ops = {
881d966b
EB
4002 .init = dev_proc_net_init,
4003 .exit = dev_proc_net_exit,
4004};
4005
4006static int __init dev_proc_init(void)
4007{
4008 return register_pernet_subsys(&dev_proc_ops);
4009}
1da177e4
LT
4010#else
4011#define dev_proc_init() 0
4012#endif /* CONFIG_PROC_FS */
4013
4014
4015/**
4016 * netdev_set_master - set up master/slave pair
4017 * @slave: slave device
4018 * @master: new master device
4019 *
4020 * Changes the master device of the slave. Pass %NULL to break the
4021 * bonding. The caller must hold the RTNL semaphore. On a failure
4022 * a negative errno code is returned. On success the reference counts
4023 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4024 * function returns zero.
4025 */
4026int netdev_set_master(struct net_device *slave, struct net_device *master)
4027{
4028 struct net_device *old = slave->master;
4029
4030 ASSERT_RTNL();
4031
4032 if (master) {
4033 if (old)
4034 return -EBUSY;
4035 dev_hold(master);
4036 }
4037
4038 slave->master = master;
4ec93edb 4039
283f2fe8
ED
4040 if (old) {
4041 synchronize_net();
1da177e4 4042 dev_put(old);
283f2fe8 4043 }
1da177e4
LT
4044 if (master)
4045 slave->flags |= IFF_SLAVE;
4046 else
4047 slave->flags &= ~IFF_SLAVE;
4048
4049 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4050 return 0;
4051}
d1b19dff 4052EXPORT_SYMBOL(netdev_set_master);
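
/*
 * Illustrative sketch (not part of dev.c): enslaving a device the way a
 * bonding-style driver might, using netdev_set_master() above.  The
 * function and its callers are hypothetical; the RTNL requirement comes
 * from the kerneldoc above.
 */
static int foo_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
{
	int err;

	ASSERT_RTNL();				/* caller holds rtnl_lock() */

	err = netdev_set_master(slave_dev, bond_dev);
	if (err)
		return err;			/* e.g. -EBUSY: already enslaved */

	/* driver-private slave bookkeeping would follow here */
	return 0;
}
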
1da177e4 4053
b6c40d68
PM
4054static void dev_change_rx_flags(struct net_device *dev, int flags)
4055{
d314774c
SH
4056 const struct net_device_ops *ops = dev->netdev_ops;
4057
4058 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4059 ops->ndo_change_rx_flags(dev, flags);
b6c40d68
PM
4060}
4061
dad9b335 4062static int __dev_set_promiscuity(struct net_device *dev, int inc)
1da177e4
LT
4063{
4064 unsigned short old_flags = dev->flags;
8192b0c4
DH
4065 uid_t uid;
4066 gid_t gid;
1da177e4 4067
24023451
PM
4068 ASSERT_RTNL();
4069
dad9b335
WC
4070 dev->flags |= IFF_PROMISC;
4071 dev->promiscuity += inc;
4072 if (dev->promiscuity == 0) {
4073 /*
4074 * Avoid overflow.
4075 * If inc causes overflow, untouch promisc and return error.
4076 */
4077 if (inc < 0)
4078 dev->flags &= ~IFF_PROMISC;
4079 else {
4080 dev->promiscuity -= inc;
4081 printk(KERN_WARNING "%s: promiscuity touches roof, "
4082 "set promiscuity failed, promiscuity feature "
4083 "of device might be broken.\n", dev->name);
4084 return -EOVERFLOW;
4085 }
4086 }
52609c0b 4087 if (dev->flags != old_flags) {
1da177e4
LT
4088 printk(KERN_INFO "device %s %s promiscuous mode\n",
4089 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4ec93edb 4090 "left");
8192b0c4
DH
4091 if (audit_enabled) {
4092 current_uid_gid(&uid, &gid);
7759db82
KHK
4093 audit_log(current->audit_context, GFP_ATOMIC,
4094 AUDIT_ANOM_PROMISCUOUS,
4095 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4096 dev->name, (dev->flags & IFF_PROMISC),
4097 (old_flags & IFF_PROMISC),
4098 audit_get_loginuid(current),
8192b0c4 4099 uid, gid,
7759db82 4100 audit_get_sessionid(current));
8192b0c4 4101 }
24023451 4102
b6c40d68 4103 dev_change_rx_flags(dev, IFF_PROMISC);
1da177e4 4104 }
dad9b335 4105 return 0;
1da177e4
LT
4106}
4107
4417da66
PM
4108/**
4109 * dev_set_promiscuity - update promiscuity count on a device
4110 * @dev: device
4111 * @inc: modifier
4112 *
4113 * Add or remove promiscuity from a device. While the count in the device
4114 * remains above zero the interface remains promiscuous. Once it hits zero
4115 * the device reverts back to normal filtering operation. A negative inc
4116 * value is used to drop promiscuity on the device.
dad9b335 4117 * Return 0 if successful or a negative errno code on error.
4417da66 4118 */
dad9b335 4119int dev_set_promiscuity(struct net_device *dev, int inc)
4417da66
PM
4120{
4121 unsigned short old_flags = dev->flags;
dad9b335 4122 int err;
4417da66 4123
dad9b335 4124 err = __dev_set_promiscuity(dev, inc);
4b5a698e 4125 if (err < 0)
dad9b335 4126 return err;
4417da66
PM
4127 if (dev->flags != old_flags)
4128 dev_set_rx_mode(dev);
dad9b335 4129 return err;
4417da66 4130}
d1b19dff 4131EXPORT_SYMBOL(dev_set_promiscuity);
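
/*
 * Illustrative sketch (not part of dev.c): a packet-tap style user taking
 * and later dropping one promiscuity reference, relying on the counter
 * semantics documented above.  Both calls must run under rtnl_lock();
 * dev_set_allmulti() below is used in exactly the same way for
 * all-multicast reception.
 */
static int foo_tap_attach(struct net_device *dev)
{
	return dev_set_promiscuity(dev, 1);	/* take one reference */
}

static void foo_tap_detach(struct net_device *dev)
{
	dev_set_promiscuity(dev, -1);		/* and drop it again */
}
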
4417da66 4132
1da177e4
LT
4133/**
4134 * dev_set_allmulti - update allmulti count on a device
4135 * @dev: device
4136 * @inc: modifier
4137 *
4138 * Add or remove reception of all multicast frames to a device. While the
4139 * count in the device remains above zero the interface remains listening
 4140 * to all multicast frames. Once it hits zero the device reverts back to normal
4141 * filtering operation. A negative @inc value is used to drop the counter
4142 * when releasing a resource needing all multicasts.
dad9b335 4143 * Return 0 if successful or a negative errno code on error.
1da177e4
LT
4144 */
4145
dad9b335 4146int dev_set_allmulti(struct net_device *dev, int inc)
1da177e4
LT
4147{
4148 unsigned short old_flags = dev->flags;
4149
24023451
PM
4150 ASSERT_RTNL();
4151
1da177e4 4152 dev->flags |= IFF_ALLMULTI;
dad9b335
WC
4153 dev->allmulti += inc;
4154 if (dev->allmulti == 0) {
4155 /*
4156 * Avoid overflow.
4157 * If inc causes overflow, untouch allmulti and return error.
4158 */
4159 if (inc < 0)
4160 dev->flags &= ~IFF_ALLMULTI;
4161 else {
4162 dev->allmulti -= inc;
4163 printk(KERN_WARNING "%s: allmulti touches roof, "
4164 "set allmulti failed, allmulti feature of "
4165 "device might be broken.\n", dev->name);
4166 return -EOVERFLOW;
4167 }
4168 }
24023451 4169 if (dev->flags ^ old_flags) {
b6c40d68 4170 dev_change_rx_flags(dev, IFF_ALLMULTI);
4417da66 4171 dev_set_rx_mode(dev);
24023451 4172 }
dad9b335 4173 return 0;
4417da66 4174}
d1b19dff 4175EXPORT_SYMBOL(dev_set_allmulti);
4417da66
PM
4176
4177/*
4178 * Upload unicast and multicast address lists to device and
4179 * configure RX filtering. When the device doesn't support unicast
53ccaae1 4180 * filtering it is put in promiscuous mode while unicast addresses
4417da66
PM
4181 * are present.
4182 */
4183void __dev_set_rx_mode(struct net_device *dev)
4184{
d314774c
SH
4185 const struct net_device_ops *ops = dev->netdev_ops;
4186
4417da66
PM
4187 /* dev_open will call this function so the list will stay sane. */
4188 if (!(dev->flags&IFF_UP))
4189 return;
4190
4191 if (!netif_device_present(dev))
40b77c94 4192 return;
4417da66 4193
d314774c
SH
4194 if (ops->ndo_set_rx_mode)
4195 ops->ndo_set_rx_mode(dev);
4417da66
PM
4196 else {
 4197 /* Unicast address changes may only happen under the rtnl,
4198 * therefore calling __dev_set_promiscuity here is safe.
4199 */
32e7bfc4 4200 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4417da66
PM
4201 __dev_set_promiscuity(dev, 1);
4202 dev->uc_promisc = 1;
32e7bfc4 4203 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4417da66
PM
4204 __dev_set_promiscuity(dev, -1);
4205 dev->uc_promisc = 0;
4206 }
4207
d314774c
SH
4208 if (ops->ndo_set_multicast_list)
4209 ops->ndo_set_multicast_list(dev);
4417da66
PM
4210 }
4211}
4212
4213void dev_set_rx_mode(struct net_device *dev)
4214{
b9e40857 4215 netif_addr_lock_bh(dev);
4417da66 4216 __dev_set_rx_mode(dev);
b9e40857 4217 netif_addr_unlock_bh(dev);
1da177e4
LT
4218}
4219
f0db275a
SH
4220/**
4221 * dev_get_flags - get flags reported to userspace
4222 * @dev: device
4223 *
4224 * Get the combination of flag bits exported through APIs to userspace.
4225 */
1da177e4
LT
4226unsigned dev_get_flags(const struct net_device *dev)
4227{
4228 unsigned flags;
4229
4230 flags = (dev->flags & ~(IFF_PROMISC |
4231 IFF_ALLMULTI |
b00055aa
SR
4232 IFF_RUNNING |
4233 IFF_LOWER_UP |
4234 IFF_DORMANT)) |
1da177e4
LT
4235 (dev->gflags & (IFF_PROMISC |
4236 IFF_ALLMULTI));
4237
b00055aa
SR
4238 if (netif_running(dev)) {
4239 if (netif_oper_up(dev))
4240 flags |= IFF_RUNNING;
4241 if (netif_carrier_ok(dev))
4242 flags |= IFF_LOWER_UP;
4243 if (netif_dormant(dev))
4244 flags |= IFF_DORMANT;
4245 }
1da177e4
LT
4246
4247 return flags;
4248}
d1b19dff 4249EXPORT_SYMBOL(dev_get_flags);
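
/*
 * Illustrative sketch (not part of dev.c): using dev_get_flags() to test
 * whether an interface is administratively and operationally up, the same
 * view userspace gets from SIOCGIFFLAGS.  foo_link_is_up() is hypothetical.
 */
static bool foo_link_is_up(const struct net_device *dev)
{
	unsigned flags = dev_get_flags(dev);

	return (flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING);
}
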
1da177e4 4250
bd380811 4251int __dev_change_flags(struct net_device *dev, unsigned int flags)
1da177e4 4252{
1da177e4 4253 int old_flags = dev->flags;
bd380811 4254 int ret;
1da177e4 4255
24023451
PM
4256 ASSERT_RTNL();
4257
1da177e4
LT
4258 /*
4259 * Set the flags on our device.
4260 */
4261
4262 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4263 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4264 IFF_AUTOMEDIA)) |
4265 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4266 IFF_ALLMULTI));
4267
4268 /*
4269 * Load in the correct multicast list now the flags have changed.
4270 */
4271
b6c40d68
PM
4272 if ((old_flags ^ flags) & IFF_MULTICAST)
4273 dev_change_rx_flags(dev, IFF_MULTICAST);
24023451 4274
4417da66 4275 dev_set_rx_mode(dev);
1da177e4
LT
4276
4277 /*
 4278 * Have we downed the interface? We handle IFF_UP ourselves
4279 * according to user attempts to set it, rather than blindly
4280 * setting it.
4281 */
4282
4283 ret = 0;
4284 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
bd380811 4285 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
1da177e4
LT
4286
4287 if (!ret)
4417da66 4288 dev_set_rx_mode(dev);
1da177e4
LT
4289 }
4290
1da177e4 4291 if ((flags ^ dev->gflags) & IFF_PROMISC) {
d1b19dff
ED
4292 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4293
1da177e4
LT
4294 dev->gflags ^= IFF_PROMISC;
4295 dev_set_promiscuity(dev, inc);
4296 }
4297
4298 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
 4299 is important. Some (broken) drivers set IFF_PROMISC when
 4300 IFF_ALLMULTI is requested, without asking us and without reporting it.
4301 */
4302 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
d1b19dff
ED
4303 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4304
1da177e4
LT
4305 dev->gflags ^= IFF_ALLMULTI;
4306 dev_set_allmulti(dev, inc);
4307 }
4308
bd380811
PM
4309 return ret;
4310}
4311
4312void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4313{
4314 unsigned int changes = dev->flags ^ old_flags;
4315
4316 if (changes & IFF_UP) {
4317 if (dev->flags & IFF_UP)
4318 call_netdevice_notifiers(NETDEV_UP, dev);
4319 else
4320 call_netdevice_notifiers(NETDEV_DOWN, dev);
4321 }
4322
4323 if (dev->flags & IFF_UP &&
4324 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4325 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4326}
4327
4328/**
4329 * dev_change_flags - change device settings
4330 * @dev: device
4331 * @flags: device state flags
4332 *
4333 * Change settings on device based state flags. The flags are
4334 * in the userspace exported format.
4335 */
4336int dev_change_flags(struct net_device *dev, unsigned flags)
4337{
4338 int ret, changes;
4339 int old_flags = dev->flags;
4340
4341 ret = __dev_change_flags(dev, flags);
4342 if (ret < 0)
4343 return ret;
4344
4345 changes = old_flags ^ dev->flags;
7c355f53
TG
4346 if (changes)
4347 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
1da177e4 4348
bd380811 4349 __dev_notify_flags(dev, old_flags);
1da177e4
LT
4350 return ret;
4351}
d1b19dff 4352EXPORT_SYMBOL(dev_change_flags);
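
/*
 * Illustrative sketch (not part of dev.c): bringing an interface up from
 * kernel code by setting IFF_UP through dev_change_flags() while holding
 * the RTNL, much as the SIOCSIFFLAGS path below does on behalf of
 * userspace.  foo_bring_up() is hypothetical.
 */
static int foo_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev->flags | IFF_UP);
	rtnl_unlock();
	return err;
}
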
1da177e4 4353
f0db275a
SH
4354/**
4355 * dev_set_mtu - Change maximum transfer unit
4356 * @dev: device
4357 * @new_mtu: new transfer unit
4358 *
4359 * Change the maximum transfer size of the network device.
4360 */
1da177e4
LT
4361int dev_set_mtu(struct net_device *dev, int new_mtu)
4362{
d314774c 4363 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
4364 int err;
4365
4366 if (new_mtu == dev->mtu)
4367 return 0;
4368
4369 /* MTU must be positive. */
4370 if (new_mtu < 0)
4371 return -EINVAL;
4372
4373 if (!netif_device_present(dev))
4374 return -ENODEV;
4375
4376 err = 0;
d314774c
SH
4377 if (ops->ndo_change_mtu)
4378 err = ops->ndo_change_mtu(dev, new_mtu);
1da177e4
LT
4379 else
4380 dev->mtu = new_mtu;
d314774c 4381
1da177e4 4382 if (!err && dev->flags & IFF_UP)
056925ab 4383 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
1da177e4
LT
4384 return err;
4385}
d1b19dff 4386EXPORT_SYMBOL(dev_set_mtu);
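
/*
 * Illustrative sketch (not part of dev.c): changing a device MTU from
 * kernel code.  dev_set_mtu() itself does the range check and notifier
 * call; the caller is expected to serialise against other configuration
 * via the RTNL, as the ioctl path below does.  foo_set_mtu() is hypothetical.
 */
static int foo_set_mtu(struct net_device *dev, int new_mtu)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, new_mtu);	/* 0, or e.g. an ndo_change_mtu error */
	rtnl_unlock();
	return err;
}
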
1da177e4 4387
f0db275a
SH
4388/**
4389 * dev_set_mac_address - Change Media Access Control Address
4390 * @dev: device
4391 * @sa: new address
4392 *
4393 * Change the hardware (MAC) address of the device
4394 */
1da177e4
LT
4395int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4396{
d314774c 4397 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
4398 int err;
4399
d314774c 4400 if (!ops->ndo_set_mac_address)
1da177e4
LT
4401 return -EOPNOTSUPP;
4402 if (sa->sa_family != dev->type)
4403 return -EINVAL;
4404 if (!netif_device_present(dev))
4405 return -ENODEV;
d314774c 4406 err = ops->ndo_set_mac_address(dev, sa);
1da177e4 4407 if (!err)
056925ab 4408 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
1da177e4
LT
4409 return err;
4410}
d1b19dff 4411EXPORT_SYMBOL(dev_set_mac_address);
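
/*
 * Illustrative sketch (not part of dev.c): programming a new hardware
 * address from kernel code.  The caller supplies dev->addr_len bytes;
 * sa_family must match dev->type, as checked above.  foo_set_hw_addr()
 * is hypothetical, and callers would hold the RTNL like the
 * SIOCSIFHWADDR path below.
 */
static int foo_set_hw_addr(struct net_device *dev, const u8 *addr)
{
	struct sockaddr sa;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, addr, dev->addr_len);

	return dev_set_mac_address(dev, &sa);
}
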
1da177e4
LT
4412
4413/*
3710becf 4414 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
1da177e4 4415 */
14e3e079 4416static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
1da177e4
LT
4417{
4418 int err;
3710becf 4419 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
1da177e4
LT
4420
4421 if (!dev)
4422 return -ENODEV;
4423
4424 switch (cmd) {
d1b19dff
ED
4425 case SIOCGIFFLAGS: /* Get interface flags */
4426 ifr->ifr_flags = (short) dev_get_flags(dev);
4427 return 0;
1da177e4 4428
d1b19dff
ED
4429 case SIOCGIFMETRIC: /* Get the metric on the interface
4430 (currently unused) */
4431 ifr->ifr_metric = 0;
4432 return 0;
1da177e4 4433
d1b19dff
ED
4434 case SIOCGIFMTU: /* Get the MTU of a device */
4435 ifr->ifr_mtu = dev->mtu;
4436 return 0;
1da177e4 4437
d1b19dff
ED
4438 case SIOCGIFHWADDR:
4439 if (!dev->addr_len)
4440 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4441 else
4442 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4443 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4444 ifr->ifr_hwaddr.sa_family = dev->type;
4445 return 0;
1da177e4 4446
d1b19dff
ED
4447 case SIOCGIFSLAVE:
4448 err = -EINVAL;
4449 break;
14e3e079 4450
d1b19dff
ED
4451 case SIOCGIFMAP:
4452 ifr->ifr_map.mem_start = dev->mem_start;
4453 ifr->ifr_map.mem_end = dev->mem_end;
4454 ifr->ifr_map.base_addr = dev->base_addr;
4455 ifr->ifr_map.irq = dev->irq;
4456 ifr->ifr_map.dma = dev->dma;
4457 ifr->ifr_map.port = dev->if_port;
4458 return 0;
14e3e079 4459
d1b19dff
ED
4460 case SIOCGIFINDEX:
4461 ifr->ifr_ifindex = dev->ifindex;
4462 return 0;
14e3e079 4463
d1b19dff
ED
4464 case SIOCGIFTXQLEN:
4465 ifr->ifr_qlen = dev->tx_queue_len;
4466 return 0;
14e3e079 4467
d1b19dff
ED
4468 default:
4469 /* dev_ioctl() should ensure this case
4470 * is never reached
4471 */
4472 WARN_ON(1);
4473 err = -EINVAL;
4474 break;
14e3e079
JG
4475
4476 }
4477 return err;
4478}
4479
4480/*
4481 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4482 */
4483static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4484{
4485 int err;
4486 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
5f2f6da7 4487 const struct net_device_ops *ops;
14e3e079
JG
4488
4489 if (!dev)
4490 return -ENODEV;
4491
5f2f6da7
JP
4492 ops = dev->netdev_ops;
4493
14e3e079 4494 switch (cmd) {
d1b19dff
ED
4495 case SIOCSIFFLAGS: /* Set interface flags */
4496 return dev_change_flags(dev, ifr->ifr_flags);
14e3e079 4497
d1b19dff
ED
4498 case SIOCSIFMETRIC: /* Set the metric on the interface
4499 (currently unused) */
4500 return -EOPNOTSUPP;
14e3e079 4501
d1b19dff
ED
4502 case SIOCSIFMTU: /* Set the MTU of a device */
4503 return dev_set_mtu(dev, ifr->ifr_mtu);
1da177e4 4504
d1b19dff
ED
4505 case SIOCSIFHWADDR:
4506 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
1da177e4 4507
d1b19dff
ED
4508 case SIOCSIFHWBROADCAST:
4509 if (ifr->ifr_hwaddr.sa_family != dev->type)
4510 return -EINVAL;
4511 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4512 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4513 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4514 return 0;
1da177e4 4515
d1b19dff
ED
4516 case SIOCSIFMAP:
4517 if (ops->ndo_set_config) {
1da177e4
LT
4518 if (!netif_device_present(dev))
4519 return -ENODEV;
d1b19dff
ED
4520 return ops->ndo_set_config(dev, &ifr->ifr_map);
4521 }
4522 return -EOPNOTSUPP;
1da177e4 4523
d1b19dff
ED
4524 case SIOCADDMULTI:
4525 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4526 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4527 return -EINVAL;
4528 if (!netif_device_present(dev))
4529 return -ENODEV;
22bedad3 4530 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
d1b19dff
ED
4531
4532 case SIOCDELMULTI:
4533 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4534 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4535 return -EINVAL;
4536 if (!netif_device_present(dev))
4537 return -ENODEV;
22bedad3 4538 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
1da177e4 4539
d1b19dff
ED
4540 case SIOCSIFTXQLEN:
4541 if (ifr->ifr_qlen < 0)
4542 return -EINVAL;
4543 dev->tx_queue_len = ifr->ifr_qlen;
4544 return 0;
1da177e4 4545
d1b19dff
ED
4546 case SIOCSIFNAME:
4547 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4548 return dev_change_name(dev, ifr->ifr_newname);
1da177e4 4549
d1b19dff
ED
4550 /*
4551 * Unknown or private ioctl
4552 */
4553 default:
4554 if ((cmd >= SIOCDEVPRIVATE &&
4555 cmd <= SIOCDEVPRIVATE + 15) ||
4556 cmd == SIOCBONDENSLAVE ||
4557 cmd == SIOCBONDRELEASE ||
4558 cmd == SIOCBONDSETHWADDR ||
4559 cmd == SIOCBONDSLAVEINFOQUERY ||
4560 cmd == SIOCBONDINFOQUERY ||
4561 cmd == SIOCBONDCHANGEACTIVE ||
4562 cmd == SIOCGMIIPHY ||
4563 cmd == SIOCGMIIREG ||
4564 cmd == SIOCSMIIREG ||
4565 cmd == SIOCBRADDIF ||
4566 cmd == SIOCBRDELIF ||
4567 cmd == SIOCSHWTSTAMP ||
4568 cmd == SIOCWANDEV) {
4569 err = -EOPNOTSUPP;
4570 if (ops->ndo_do_ioctl) {
4571 if (netif_device_present(dev))
4572 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4573 else
4574 err = -ENODEV;
4575 }
4576 } else
4577 err = -EINVAL;
1da177e4
LT
4578
4579 }
4580 return err;
4581}
4582
4583/*
4584 * This function handles all "interface"-type I/O control requests. The actual
4585 * 'doing' part of this is dev_ifsioc above.
4586 */
4587
4588/**
4589 * dev_ioctl - network device ioctl
c4ea43c5 4590 * @net: the applicable net namespace
1da177e4
LT
4591 * @cmd: command to issue
4592 * @arg: pointer to a struct ifreq in user space
4593 *
4594 * Issue ioctl functions to devices. This is normally called by the
4595 * user space syscall interfaces but can sometimes be useful for
4596 * other purposes. The return value is the return from the syscall if
4597 * positive or a negative errno code on error.
4598 */
4599
881d966b 4600int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1da177e4
LT
4601{
4602 struct ifreq ifr;
4603 int ret;
4604 char *colon;
4605
4606 /* One special case: SIOCGIFCONF takes ifconf argument
4607 and requires shared lock, because it sleeps writing
4608 to user space.
4609 */
4610
4611 if (cmd == SIOCGIFCONF) {
6756ae4b 4612 rtnl_lock();
881d966b 4613 ret = dev_ifconf(net, (char __user *) arg);
6756ae4b 4614 rtnl_unlock();
1da177e4
LT
4615 return ret;
4616 }
4617 if (cmd == SIOCGIFNAME)
881d966b 4618 return dev_ifname(net, (struct ifreq __user *)arg);
1da177e4
LT
4619
4620 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4621 return -EFAULT;
4622
4623 ifr.ifr_name[IFNAMSIZ-1] = 0;
4624
4625 colon = strchr(ifr.ifr_name, ':');
4626 if (colon)
4627 *colon = 0;
4628
4629 /*
4630 * See which interface the caller is talking about.
4631 */
4632
4633 switch (cmd) {
d1b19dff
ED
4634 /*
4635 * These ioctl calls:
4636 * - can be done by all.
4637 * - atomic and do not require locking.
4638 * - return a value
4639 */
4640 case SIOCGIFFLAGS:
4641 case SIOCGIFMETRIC:
4642 case SIOCGIFMTU:
4643 case SIOCGIFHWADDR:
4644 case SIOCGIFSLAVE:
4645 case SIOCGIFMAP:
4646 case SIOCGIFINDEX:
4647 case SIOCGIFTXQLEN:
4648 dev_load(net, ifr.ifr_name);
3710becf 4649 rcu_read_lock();
d1b19dff 4650 ret = dev_ifsioc_locked(net, &ifr, cmd);
3710becf 4651 rcu_read_unlock();
d1b19dff
ED
4652 if (!ret) {
4653 if (colon)
4654 *colon = ':';
4655 if (copy_to_user(arg, &ifr,
4656 sizeof(struct ifreq)))
4657 ret = -EFAULT;
4658 }
4659 return ret;
1da177e4 4660
d1b19dff
ED
4661 case SIOCETHTOOL:
4662 dev_load(net, ifr.ifr_name);
4663 rtnl_lock();
4664 ret = dev_ethtool(net, &ifr);
4665 rtnl_unlock();
4666 if (!ret) {
4667 if (colon)
4668 *colon = ':';
4669 if (copy_to_user(arg, &ifr,
4670 sizeof(struct ifreq)))
4671 ret = -EFAULT;
4672 }
4673 return ret;
1da177e4 4674
d1b19dff
ED
4675 /*
4676 * These ioctl calls:
4677 * - require superuser power.
4678 * - require strict serialization.
4679 * - return a value
4680 */
4681 case SIOCGMIIPHY:
4682 case SIOCGMIIREG:
4683 case SIOCSIFNAME:
4684 if (!capable(CAP_NET_ADMIN))
4685 return -EPERM;
4686 dev_load(net, ifr.ifr_name);
4687 rtnl_lock();
4688 ret = dev_ifsioc(net, &ifr, cmd);
4689 rtnl_unlock();
4690 if (!ret) {
4691 if (colon)
4692 *colon = ':';
4693 if (copy_to_user(arg, &ifr,
4694 sizeof(struct ifreq)))
4695 ret = -EFAULT;
4696 }
4697 return ret;
1da177e4 4698
d1b19dff
ED
4699 /*
4700 * These ioctl calls:
4701 * - require superuser power.
4702 * - require strict serialization.
4703 * - do not return a value
4704 */
4705 case SIOCSIFFLAGS:
4706 case SIOCSIFMETRIC:
4707 case SIOCSIFMTU:
4708 case SIOCSIFMAP:
4709 case SIOCSIFHWADDR:
4710 case SIOCSIFSLAVE:
4711 case SIOCADDMULTI:
4712 case SIOCDELMULTI:
4713 case SIOCSIFHWBROADCAST:
4714 case SIOCSIFTXQLEN:
4715 case SIOCSMIIREG:
4716 case SIOCBONDENSLAVE:
4717 case SIOCBONDRELEASE:
4718 case SIOCBONDSETHWADDR:
4719 case SIOCBONDCHANGEACTIVE:
4720 case SIOCBRADDIF:
4721 case SIOCBRDELIF:
4722 case SIOCSHWTSTAMP:
4723 if (!capable(CAP_NET_ADMIN))
4724 return -EPERM;
4725 /* fall through */
4726 case SIOCBONDSLAVEINFOQUERY:
4727 case SIOCBONDINFOQUERY:
4728 dev_load(net, ifr.ifr_name);
4729 rtnl_lock();
4730 ret = dev_ifsioc(net, &ifr, cmd);
4731 rtnl_unlock();
4732 return ret;
4733
4734 case SIOCGIFMEM:
4735 /* Get the per device memory space. We can add this but
4736 * currently do not support it */
4737 case SIOCSIFMEM:
4738 /* Set the per device memory buffer space.
4739 * Not applicable in our case */
4740 case SIOCSIFLINK:
4741 return -EINVAL;
4742
4743 /*
4744 * Unknown or private ioctl.
4745 */
4746 default:
4747 if (cmd == SIOCWANDEV ||
4748 (cmd >= SIOCDEVPRIVATE &&
4749 cmd <= SIOCDEVPRIVATE + 15)) {
881d966b 4750 dev_load(net, ifr.ifr_name);
1da177e4 4751 rtnl_lock();
881d966b 4752 ret = dev_ifsioc(net, &ifr, cmd);
1da177e4 4753 rtnl_unlock();
d1b19dff
ED
4754 if (!ret && copy_to_user(arg, &ifr,
4755 sizeof(struct ifreq)))
4756 ret = -EFAULT;
1da177e4 4757 return ret;
d1b19dff
ED
4758 }
4759 /* Take care of Wireless Extensions */
4760 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4761 return wext_handle_ioctl(net, &ifr, cmd, arg);
4762 return -EINVAL;
1da177e4
LT
4763 }
4764}
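
/*
 * Illustrative sketch (not part of dev.c): the userspace side of the
 * SIOCGIF* ioctls dispatched above, written against the standard socket
 * API.  "eth0" is only an example interface name.
 *
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <net/if.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		struct ifreq ifr;
 *		int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *		memset(&ifr, 0, sizeof(ifr));
 *		strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
 *		if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *			printf("%s mtu = %d\n", ifr.ifr_name, ifr.ifr_mtu);
 *		close(fd);
 *		return 0;
 *	}
 */
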
4765
4766
4767/**
4768 * dev_new_index - allocate an ifindex
c4ea43c5 4769 * @net: the applicable net namespace
1da177e4
LT
4770 *
4771 * Returns a suitable unique value for a new device interface
4772 * number. The caller must hold the rtnl semaphore or the
4773 * dev_base_lock to be sure it remains unique.
4774 */
881d966b 4775static int dev_new_index(struct net *net)
1da177e4
LT
4776{
4777 static int ifindex;
4778 for (;;) {
4779 if (++ifindex <= 0)
4780 ifindex = 1;
881d966b 4781 if (!__dev_get_by_index(net, ifindex))
1da177e4
LT
4782 return ifindex;
4783 }
4784}
4785
1da177e4 4786/* Delayed registration/unregistration */
3b5b34fd 4787static LIST_HEAD(net_todo_list);
1da177e4 4788
6f05f629 4789static void net_set_todo(struct net_device *dev)
1da177e4 4790{
1da177e4 4791 list_add_tail(&dev->todo_list, &net_todo_list);
1da177e4
LT
4792}
4793
9b5e383c 4794static void rollback_registered_many(struct list_head *head)
93ee31f1 4795{
e93737b0 4796 struct net_device *dev, *tmp;
9b5e383c 4797
93ee31f1
DL
4798 BUG_ON(dev_boot_phase);
4799 ASSERT_RTNL();
4800
e93737b0 4801 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
9b5e383c 4802 /* Some devices call without registering
e93737b0
KK
4803 * for initialization unwind. Remove those
4804 * devices and proceed with the remaining.
9b5e383c
ED
4805 */
4806 if (dev->reg_state == NETREG_UNINITIALIZED) {
4807 pr_debug("unregister_netdevice: device %s/%p never "
4808 "was registered\n", dev->name, dev);
93ee31f1 4809
9b5e383c 4810 WARN_ON(1);
e93737b0
KK
4811 list_del(&dev->unreg_list);
4812 continue;
9b5e383c 4813 }
93ee31f1 4814
9b5e383c 4815 BUG_ON(dev->reg_state != NETREG_REGISTERED);
93ee31f1 4816
9b5e383c
ED
4817 /* If device is running, close it first. */
4818 dev_close(dev);
93ee31f1 4819
9b5e383c
ED
4820 /* And unlink it from device chain. */
4821 unlist_netdevice(dev);
93ee31f1 4822
9b5e383c
ED
4823 dev->reg_state = NETREG_UNREGISTERING;
4824 }
93ee31f1
DL
4825
4826 synchronize_net();
4827
9b5e383c
ED
4828 list_for_each_entry(dev, head, unreg_list) {
4829 /* Shutdown queueing discipline. */
4830 dev_shutdown(dev);
93ee31f1
DL
4831
4832
9b5e383c
ED
4833 /* Notify protocols, that we are about to destroy
4834 this device. They should clean all the things.
4835 */
4836 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
93ee31f1 4837
a2835763
PM
4838 if (!dev->rtnl_link_ops ||
4839 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4840 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4841
9b5e383c
ED
4842 /*
4843 * Flush the unicast and multicast chains
4844 */
a748ee24 4845 dev_uc_flush(dev);
22bedad3 4846 dev_mc_flush(dev);
93ee31f1 4847
9b5e383c
ED
4848 if (dev->netdev_ops->ndo_uninit)
4849 dev->netdev_ops->ndo_uninit(dev);
93ee31f1 4850
9b5e383c
ED
4851 /* Notifier chain MUST detach us from master device. */
4852 WARN_ON(dev->master);
93ee31f1 4853
9b5e383c
ED
4854 /* Remove entries from kobject tree */
4855 netdev_unregister_kobject(dev);
4856 }
93ee31f1 4857
a5ee1551 4858 /* Process any work delayed until the end of the batch */
e5e26d75 4859 dev = list_first_entry(head, struct net_device, unreg_list);
a5ee1551 4860 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
93ee31f1 4861
a5ee1551 4862 synchronize_net();
395264d5 4863
a5ee1551 4864 list_for_each_entry(dev, head, unreg_list)
9b5e383c
ED
4865 dev_put(dev);
4866}
4867
4868static void rollback_registered(struct net_device *dev)
4869{
4870 LIST_HEAD(single);
4871
4872 list_add(&dev->unreg_list, &single);
4873 rollback_registered_many(&single);
93ee31f1
DL
4874}
4875
e8a0464c
DM
4876static void __netdev_init_queue_locks_one(struct net_device *dev,
4877 struct netdev_queue *dev_queue,
4878 void *_unused)
c773e847
DM
4879{
4880 spin_lock_init(&dev_queue->_xmit_lock);
cf508b12 4881 netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
c773e847
DM
4882 dev_queue->xmit_lock_owner = -1;
4883}
4884
4885static void netdev_init_queue_locks(struct net_device *dev)
4886{
e8a0464c
DM
4887 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4888 __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
c773e847
DM
4889}
4890
b63365a2
HX
4891unsigned long netdev_fix_features(unsigned long features, const char *name)
4892{
4893 /* Fix illegal SG+CSUM combinations. */
4894 if ((features & NETIF_F_SG) &&
4895 !(features & NETIF_F_ALL_CSUM)) {
4896 if (name)
4897 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4898 "checksum feature.\n", name);
4899 features &= ~NETIF_F_SG;
4900 }
4901
4902 /* TSO requires that SG is present as well. */
4903 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4904 if (name)
4905 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4906 "SG feature.\n", name);
4907 features &= ~NETIF_F_TSO;
4908 }
4909
4910 if (features & NETIF_F_UFO) {
4911 if (!(features & NETIF_F_GEN_CSUM)) {
4912 if (name)
4913 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4914 "since no NETIF_F_HW_CSUM feature.\n",
4915 name);
4916 features &= ~NETIF_F_UFO;
4917 }
4918
4919 if (!(features & NETIF_F_SG)) {
4920 if (name)
4921 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4922 "since no NETIF_F_SG feature.\n", name);
4923 features &= ~NETIF_F_UFO;
4924 }
4925 }
4926
4927 return features;
4928}
4929EXPORT_SYMBOL(netdev_fix_features);
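
/*
 * Illustrative sketch (not part of dev.c): a driver sanitising a requested
 * feature mask with netdev_fix_features() before storing it, so that e.g.
 * NETIF_F_TSO is dropped (with a log message against dev->name) when
 * NETIF_F_SG is missing.  foo_apply_features() is hypothetical.
 */
static void foo_apply_features(struct net_device *dev, unsigned long wanted)
{
	dev->features = netdev_fix_features(wanted, dev->name);
}
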
4930
fc4a7489
PM
4931/**
4932 * netif_stacked_transfer_operstate - transfer operstate
4933 * @rootdev: the root or lower level device to transfer state from
4934 * @dev: the device to transfer operstate to
4935 *
4936 * Transfer operational state from root to device. This is normally
4937 * called when a stacking relationship exists between the root
 4938 * device and the device (a leaf device).
4939 */
4940void netif_stacked_transfer_operstate(const struct net_device *rootdev,
4941 struct net_device *dev)
4942{
4943 if (rootdev->operstate == IF_OPER_DORMANT)
4944 netif_dormant_on(dev);
4945 else
4946 netif_dormant_off(dev);
4947
4948 if (netif_carrier_ok(rootdev)) {
4949 if (!netif_carrier_ok(dev))
4950 netif_carrier_on(dev);
4951 } else {
4952 if (netif_carrier_ok(dev))
4953 netif_carrier_off(dev);
4954 }
4955}
4956EXPORT_SYMBOL(netif_stacked_transfer_operstate);
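
/*
 * Illustrative sketch (not part of dev.c): a stacked driver (VLAN- or
 * macvlan-like) propagating link state from its lower device inside a
 * netdevice notifier.  foo_get_upper(), the lookup of the device stacked
 * on top of "lower", is hypothetical.
 */
static int foo_netdev_event(struct notifier_block *nb,
			    unsigned long event, void *ptr)
{
	struct net_device *lower = ptr;
	struct net_device *upper = foo_get_upper(lower);

	if (upper && event == NETDEV_CHANGE)
		netif_stacked_transfer_operstate(lower, upper);

	return NOTIFY_DONE;
}
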
4957
1da177e4
LT
4958/**
4959 * register_netdevice - register a network device
4960 * @dev: device to register
4961 *
4962 * Take a completed network device structure and add it to the kernel
4963 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4964 * chain. 0 is returned on success. A negative errno code is returned
4965 * on a failure to set up the device, or if the name is a duplicate.
4966 *
4967 * Callers must hold the rtnl semaphore. You may want
4968 * register_netdev() instead of this.
4969 *
4970 * BUGS:
4971 * The locking appears insufficient to guarantee two parallel registers
4972 * will not get the same name.
4973 */
4974
4975int register_netdevice(struct net_device *dev)
4976{
1da177e4 4977 int ret;
d314774c 4978 struct net *net = dev_net(dev);
1da177e4
LT
4979
4980 BUG_ON(dev_boot_phase);
4981 ASSERT_RTNL();
4982
b17a7c17
SH
4983 might_sleep();
4984
1da177e4
LT
4985 /* When net_device's are persistent, this will be fatal. */
4986 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
d314774c 4987 BUG_ON(!net);
1da177e4 4988
f1f28aa3 4989 spin_lock_init(&dev->addr_list_lock);
cf508b12 4990 netdev_set_addr_lockdep_class(dev);
c773e847 4991 netdev_init_queue_locks(dev);
1da177e4 4992
1da177e4
LT
4993 dev->iflink = -1;
4994
df334545 4995#ifdef CONFIG_RPS
0a9627f2
TH
4996 if (!dev->num_rx_queues) {
4997 /*
4998 * Allocate a single RX queue if driver never called
4999 * alloc_netdev_mq
5000 */
5001
5002 dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
5003 if (!dev->_rx) {
5004 ret = -ENOMEM;
5005 goto out;
5006 }
5007
5008 dev->_rx->first = dev->_rx;
5009 atomic_set(&dev->_rx->count, 1);
5010 dev->num_rx_queues = 1;
5011 }
df334545 5012#endif
1da177e4 5013 /* Init, if this function is available */
d314774c
SH
5014 if (dev->netdev_ops->ndo_init) {
5015 ret = dev->netdev_ops->ndo_init(dev);
1da177e4
LT
5016 if (ret) {
5017 if (ret > 0)
5018 ret = -EIO;
90833aa4 5019 goto out;
1da177e4
LT
5020 }
5021 }
4ec93edb 5022
8ce6cebc 5023 ret = dev_get_valid_name(dev, dev->name, 0);
d9031024 5024 if (ret)
7ce1b0ed 5025 goto err_uninit;
1da177e4 5026
881d966b 5027 dev->ifindex = dev_new_index(net);
1da177e4
LT
5028 if (dev->iflink == -1)
5029 dev->iflink = dev->ifindex;
5030
d212f87b
SH
5031 /* Fix illegal checksum combinations */
5032 if ((dev->features & NETIF_F_HW_CSUM) &&
5033 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5034 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5035 dev->name);
5036 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5037 }
5038
5039 if ((dev->features & NETIF_F_NO_CSUM) &&
5040 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5041 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5042 dev->name);
5043 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5044 }
5045
b63365a2 5046 dev->features = netdev_fix_features(dev->features, dev->name);
1da177e4 5047
e5a4a72d
LB
5048 /* Enable software GSO if SG is supported. */
5049 if (dev->features & NETIF_F_SG)
5050 dev->features |= NETIF_F_GSO;
5051
7ffbe3fd
JB
5052 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5053 ret = notifier_to_errno(ret);
5054 if (ret)
5055 goto err_uninit;
5056
8b41d188 5057 ret = netdev_register_kobject(dev);
b17a7c17 5058 if (ret)
7ce1b0ed 5059 goto err_uninit;
b17a7c17
SH
5060 dev->reg_state = NETREG_REGISTERED;
5061
1da177e4
LT
5062 /*
5063 * Default initial state at registry is that the
5064 * device is present.
5065 */
5066
5067 set_bit(__LINK_STATE_PRESENT, &dev->state);
5068
1da177e4 5069 dev_init_scheduler(dev);
1da177e4 5070 dev_hold(dev);
ce286d32 5071 list_netdevice(dev);
1da177e4
LT
5072
5073 /* Notify protocols, that a new device appeared. */
056925ab 5074 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
fcc5a03a 5075 ret = notifier_to_errno(ret);
93ee31f1
DL
5076 if (ret) {
5077 rollback_registered(dev);
5078 dev->reg_state = NETREG_UNREGISTERED;
5079 }
d90a909e
EB
5080 /*
5081 * Prevent userspace races by waiting until the network
5082 * device is fully setup before sending notifications.
5083 */
a2835763
PM
5084 if (!dev->rtnl_link_ops ||
5085 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5086 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
1da177e4
LT
5087
5088out:
5089 return ret;
7ce1b0ed
HX
5090
5091err_uninit:
d314774c
SH
5092 if (dev->netdev_ops->ndo_uninit)
5093 dev->netdev_ops->ndo_uninit(dev);
7ce1b0ed 5094 goto out;
1da177e4 5095}
d1b19dff 5096EXPORT_SYMBOL(register_netdevice);
1da177e4 5097
937f1ba5
BH
5098/**
5099 * init_dummy_netdev - init a dummy network device for NAPI
5100 * @dev: device to init
5101 *
 5102 * This takes a network device structure and initializes the minimum
 5103 * number of fields so it can be used to schedule NAPI polls without
5104 * registering a full blown interface. This is to be used by drivers
5105 * that need to tie several hardware interfaces to a single NAPI
5106 * poll scheduler due to HW limitations.
5107 */
5108int init_dummy_netdev(struct net_device *dev)
5109{
5110 /* Clear everything. Note we don't initialize spinlocks
 5111 * as they aren't supposed to be taken by any of the
5112 * NAPI code and this dummy netdev is supposed to be
5113 * only ever used for NAPI polls
5114 */
5115 memset(dev, 0, sizeof(struct net_device));
5116
5117 /* make sure we BUG if trying to hit standard
5118 * register/unregister code path
5119 */
5120 dev->reg_state = NETREG_DUMMY;
5121
5122 /* initialize the ref count */
5123 atomic_set(&dev->refcnt, 1);
5124
5125 /* NAPI wants this */
5126 INIT_LIST_HEAD(&dev->napi_list);
5127
5128 /* a dummy interface is started by default */
5129 set_bit(__LINK_STATE_PRESENT, &dev->state);
5130 set_bit(__LINK_STATE_START, &dev->state);
5131
5132 return 0;
5133}
5134EXPORT_SYMBOL_GPL(init_dummy_netdev);
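
/*
 * Illustrative sketch (not part of dev.c): a driver with several hardware
 * channels but a single real netdev hanging an extra NAPI context off a
 * dummy device, exactly the use case the kerneldoc above describes.
 * struct foo_adapter and foo_extra_poll() are hypothetical.
 */
static int foo_setup_extra_napi(struct foo_adapter *adap)
{
	init_dummy_netdev(&adap->dummy_dev);	/* never registered, NAPI only */
	netif_napi_add(&adap->dummy_dev, &adap->extra_napi,
		       foo_extra_poll, 64);
	napi_enable(&adap->extra_napi);
	return 0;
}
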
5135
5136
1da177e4
LT
5137/**
5138 * register_netdev - register a network device
5139 * @dev: device to register
5140 *
5141 * Take a completed network device structure and add it to the kernel
5142 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5143 * chain. 0 is returned on success. A negative errno code is returned
5144 * on a failure to set up the device, or if the name is a duplicate.
5145 *
38b4da38 5146 * This is a wrapper around register_netdevice that takes the rtnl semaphore
1da177e4
LT
5147 * and expands the device name if you passed a format string to
5148 * alloc_netdev.
5149 */
5150int register_netdev(struct net_device *dev)
5151{
5152 int err;
5153
5154 rtnl_lock();
5155
5156 /*
5157 * If the name is a format string the caller wants us to do a
5158 * name allocation.
5159 */
5160 if (strchr(dev->name, '%')) {
5161 err = dev_alloc_name(dev, dev->name);
5162 if (err < 0)
5163 goto out;
5164 }
4ec93edb 5165
1da177e4
LT
5166 err = register_netdevice(dev);
5167out:
5168 rtnl_unlock();
5169 return err;
5170}
5171EXPORT_SYMBOL(register_netdev);
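
/*
 * Illustrative sketch (not part of dev.c): the usual allocate / register /
 * unregister / free lifecycle that register_netdev() above and free_netdev()
 * below are part of.  ether_setup() is the standard Ethernet initialiser;
 * everything named "foo" (including struct foo_priv) is hypothetical and
 * error handling is minimal.
 */
static struct net_device *foo_netdev;

static int __init foo_init(void)
{
	int err;

	/* the "%d" lets dev_alloc_name() pick the first free index */
	foo_netdev = alloc_netdev(sizeof(struct foo_priv), "foo%d", ether_setup);
	if (!foo_netdev)
		return -ENOMEM;

	err = register_netdev(foo_netdev);
	if (err)
		free_netdev(foo_netdev);
	return err;
}

static void __exit foo_exit(void)
{
	unregister_netdev(foo_netdev);
	free_netdev(foo_netdev);
}
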
5172
5173/*
5174 * netdev_wait_allrefs - wait until all references are gone.
5175 *
5176 * This is called when unregistering network devices.
5177 *
5178 * Any protocol or device that holds a reference should register
5179 * for netdevice notification, and cleanup and put back the
5180 * reference if they receive an UNREGISTER event.
5181 * We can get stuck here if buggy protocols don't correctly
4ec93edb 5182 * call dev_put.
1da177e4
LT
5183 */
5184static void netdev_wait_allrefs(struct net_device *dev)
5185{
5186 unsigned long rebroadcast_time, warning_time;
5187
e014debe
ED
5188 linkwatch_forget_dev(dev);
5189
1da177e4
LT
5190 rebroadcast_time = warning_time = jiffies;
5191 while (atomic_read(&dev->refcnt) != 0) {
5192 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6756ae4b 5193 rtnl_lock();
1da177e4
LT
5194
5195 /* Rebroadcast unregister notification */
056925ab 5196 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
a5ee1551 5197 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
395264d5 5198 * should have already handled it the first time */
1da177e4
LT
5199
5200 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5201 &dev->state)) {
5202 /* We must not have linkwatch events
5203 * pending on unregister. If this
5204 * happens, we simply run the queue
5205 * unscheduled, resulting in a noop
5206 * for this device.
5207 */
5208 linkwatch_run_queue();
5209 }
5210
6756ae4b 5211 __rtnl_unlock();
1da177e4
LT
5212
5213 rebroadcast_time = jiffies;
5214 }
5215
5216 msleep(250);
5217
5218 if (time_after(jiffies, warning_time + 10 * HZ)) {
5219 printk(KERN_EMERG "unregister_netdevice: "
5220 "waiting for %s to become free. Usage "
5221 "count = %d\n",
5222 dev->name, atomic_read(&dev->refcnt));
5223 warning_time = jiffies;
5224 }
5225 }
5226}
5227
5228/* The sequence is:
5229 *
5230 * rtnl_lock();
5231 * ...
5232 * register_netdevice(x1);
5233 * register_netdevice(x2);
5234 * ...
5235 * unregister_netdevice(y1);
5236 * unregister_netdevice(y2);
5237 * ...
5238 * rtnl_unlock();
5239 * free_netdev(y1);
5240 * free_netdev(y2);
5241 *
58ec3b4d 5242 * We are invoked by rtnl_unlock().
1da177e4 5243 * This allows us to deal with problems:
b17a7c17 5244 * 1) We can delete sysfs objects which invoke hotplug
1da177e4
LT
5245 * without deadlocking with linkwatch via keventd.
5246 * 2) Since we run with the RTNL semaphore not held, we can sleep
5247 * safely in order to wait for the netdev refcnt to drop to zero.
58ec3b4d
HX
5248 *
5249 * We must not return until all unregister events added during
5250 * the interval the lock was held have been completed.
1da177e4 5251 */
1da177e4
LT
5252void netdev_run_todo(void)
5253{
626ab0e6 5254 struct list_head list;
1da177e4 5255
1da177e4 5256 /* Snapshot list, allow later requests */
626ab0e6 5257 list_replace_init(&net_todo_list, &list);
58ec3b4d
HX
5258
5259 __rtnl_unlock();
626ab0e6 5260
1da177e4
LT
5261 while (!list_empty(&list)) {
5262 struct net_device *dev
e5e26d75 5263 = list_first_entry(&list, struct net_device, todo_list);
1da177e4
LT
5264 list_del(&dev->todo_list);
5265
b17a7c17
SH
5266 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5267 printk(KERN_ERR "network todo '%s' but state %d\n",
5268 dev->name, dev->reg_state);
5269 dump_stack();
5270 continue;
5271 }
1da177e4 5272
b17a7c17 5273 dev->reg_state = NETREG_UNREGISTERED;
1da177e4 5274
152102c7 5275 on_each_cpu(flush_backlog, dev, 1);
6e583ce5 5276
b17a7c17 5277 netdev_wait_allrefs(dev);
1da177e4 5278
b17a7c17
SH
5279 /* paranoia */
5280 BUG_ON(atomic_read(&dev->refcnt));
547b792c
IJ
5281 WARN_ON(dev->ip_ptr);
5282 WARN_ON(dev->ip6_ptr);
5283 WARN_ON(dev->dn_ptr);
1da177e4 5284
b17a7c17
SH
5285 if (dev->destructor)
5286 dev->destructor(dev);
9093bbb2
SH
5287
5288 /* Free network device */
5289 kobject_put(&dev->dev.kobj);
1da177e4 5290 }
1da177e4
LT
5291}
5292
d83345ad
ED
5293/**
5294 * dev_txq_stats_fold - fold tx_queues stats
5295 * @dev: device to get statistics from
3cfde79c 5296 * @stats: struct rtnl_link_stats64 to hold results
d83345ad
ED
5297 */
5298void dev_txq_stats_fold(const struct net_device *dev,
3cfde79c 5299 struct rtnl_link_stats64 *stats)
d83345ad 5300{
bd27290a 5301 u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
d83345ad
ED
5302 unsigned int i;
5303 struct netdev_queue *txq;
5304
5305 for (i = 0; i < dev->num_tx_queues; i++) {
5306 txq = netdev_get_tx_queue(dev, i);
bd27290a 5307 spin_lock_bh(&txq->_xmit_lock);
d83345ad
ED
5308 tx_bytes += txq->tx_bytes;
5309 tx_packets += txq->tx_packets;
5310 tx_dropped += txq->tx_dropped;
bd27290a 5311 spin_unlock_bh(&txq->_xmit_lock);
d83345ad
ED
5312 }
5313 if (tx_bytes || tx_packets || tx_dropped) {
5314 stats->tx_bytes = tx_bytes;
5315 stats->tx_packets = tx_packets;
5316 stats->tx_dropped = tx_dropped;
5317 }
5318}
5319EXPORT_SYMBOL(dev_txq_stats_fold);
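
/*
 * Illustrative sketch (not part of dev.c): a driver ndo_get_stats64()
 * implementation that fills in its own RX counters and lets
 * dev_txq_stats_fold() above supply the per-tx-queue totals.  Note that
 * dev_get_stats() below zeroes @stats before calling us.  struct foo_priv
 * and its fields are hypothetical.
 */
static struct rtnl_link_stats64 *foo_get_stats64(struct net_device *dev,
						 struct rtnl_link_stats64 *stats)
{
	struct foo_priv *fp = netdev_priv(dev);

	stats->rx_packets = fp->rx_packets;
	stats->rx_bytes   = fp->rx_bytes;

	dev_txq_stats_fold(dev, stats);		/* fills the tx_* fields */
	return stats;
}
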
5320
3cfde79c
BH
5321/* Convert net_device_stats to rtnl_link_stats64. They have the same
5322 * fields in the same order, with only the type differing.
5323 */
5324static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5325 const struct net_device_stats *netdev_stats)
5326{
5327#if BITS_PER_LONG == 64
5328 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5329 memcpy(stats64, netdev_stats, sizeof(*stats64));
5330#else
5331 size_t i, n = sizeof(*stats64) / sizeof(u64);
5332 const unsigned long *src = (const unsigned long *)netdev_stats;
5333 u64 *dst = (u64 *)stats64;
5334
5335 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5336 sizeof(*stats64) / sizeof(u64));
5337 for (i = 0; i < n; i++)
5338 dst[i] = src[i];
5339#endif
5340}
5341
eeda3fd6
SH
5342/**
5343 * dev_get_stats - get network device statistics
5344 * @dev: device to get statistics from
28172739 5345 * @storage: place to store stats
eeda3fd6 5346 *
d7753516
BH
5347 * Get network statistics from device. Return @storage.
5348 * The device driver may provide its own method by setting
 5349 * dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
5350 * otherwise the internal statistics structure is used.
eeda3fd6 5351 */
d7753516
BH
5352struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5353 struct rtnl_link_stats64 *storage)
7004bf25 5354{
eeda3fd6
SH
5355 const struct net_device_ops *ops = dev->netdev_ops;
5356
28172739
ED
5357 if (ops->ndo_get_stats64) {
5358 memset(storage, 0, sizeof(*storage));
5359 return ops->ndo_get_stats64(dev, storage);
5360 }
5361 if (ops->ndo_get_stats) {
3cfde79c 5362 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
28172739
ED
5363 return storage;
5364 }
3cfde79c
BH
5365 netdev_stats_to_stats64(storage, &dev->stats);
5366 dev_txq_stats_fold(dev, storage);
28172739 5367 return storage;
c45d286e 5368}
eeda3fd6 5369EXPORT_SYMBOL(dev_get_stats);
c45d286e 5370
dc2b4847 5371static void netdev_init_one_queue(struct net_device *dev,
e8a0464c
DM
5372 struct netdev_queue *queue,
5373 void *_unused)
dc2b4847 5374{
dc2b4847
DM
5375 queue->dev = dev;
5376}
5377
bb949fbd
DM
5378static void netdev_init_queues(struct net_device *dev)
5379{
e8a0464c
DM
5380 netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5381 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
c3f26a26 5382 spin_lock_init(&dev->tx_global_lock);
bb949fbd
DM
5383}
5384
1da177e4 5385/**
f25f4e44 5386 * alloc_netdev_mq - allocate network device
1da177e4
LT
5387 * @sizeof_priv: size of private data to allocate space for
5388 * @name: device name format string
5389 * @setup: callback to initialize device
f25f4e44 5390 * @queue_count: the number of subqueues to allocate
1da177e4
LT
5391 *
5392 * Allocates a struct net_device with private data area for driver use
f25f4e44
PWJ
 5393 * and performs basic initialization. Also allocates subqueue structs
5394 * for each queue on the device at the end of the netdevice.
1da177e4 5395 */
f25f4e44
PWJ
5396struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5397 void (*setup)(struct net_device *), unsigned int queue_count)
1da177e4 5398{
e8a0464c 5399 struct netdev_queue *tx;
1da177e4 5400 struct net_device *dev;
7943986c 5401 size_t alloc_size;
1ce8e7b5 5402 struct net_device *p;
df334545
ED
5403#ifdef CONFIG_RPS
5404 struct netdev_rx_queue *rx;
0a9627f2 5405 int i;
df334545 5406#endif
1da177e4 5407
b6fe17d6
SH
5408 BUG_ON(strlen(name) >= sizeof(dev->name));
5409
fd2ea0a7 5410 alloc_size = sizeof(struct net_device);
d1643d24
AD
5411 if (sizeof_priv) {
5412 /* ensure 32-byte alignment of private area */
1ce8e7b5 5413 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
d1643d24
AD
5414 alloc_size += sizeof_priv;
5415 }
5416 /* ensure 32-byte alignment of whole construct */
1ce8e7b5 5417 alloc_size += NETDEV_ALIGN - 1;
1da177e4 5418
31380de9 5419 p = kzalloc(alloc_size, GFP_KERNEL);
1da177e4 5420 if (!p) {
b6fe17d6 5421 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
1da177e4
LT
5422 return NULL;
5423 }
1da177e4 5424
7943986c 5425 tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
e8a0464c
DM
5426 if (!tx) {
5427 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5428 "tx qdiscs.\n");
ab9c73cc 5429 goto free_p;
e8a0464c
DM
5430 }
5431
df334545 5432#ifdef CONFIG_RPS
0a9627f2
TH
5433 rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5434 if (!rx) {
5435 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5436 "rx queues.\n");
5437 goto free_tx;
5438 }
5439
5440 atomic_set(&rx->count, queue_count);
5441
5442 /*
5443 * Set a pointer to first element in the array which holds the
5444 * reference count.
5445 */
5446 for (i = 0; i < queue_count; i++)
5447 rx[i].first = rx;
df334545 5448#endif
0a9627f2 5449
1ce8e7b5 5450 dev = PTR_ALIGN(p, NETDEV_ALIGN);
1da177e4 5451 dev->padded = (char *)dev - (char *)p;
ab9c73cc
JP
5452
5453 if (dev_addr_init(dev))
0a9627f2 5454 goto free_rx;
ab9c73cc 5455
22bedad3 5456 dev_mc_init(dev);
a748ee24 5457 dev_uc_init(dev);
ccffad25 5458
c346dca1 5459 dev_net_set(dev, &init_net);
1da177e4 5460
e8a0464c
DM
5461 dev->_tx = tx;
5462 dev->num_tx_queues = queue_count;
fd2ea0a7 5463 dev->real_num_tx_queues = queue_count;
e8a0464c 5464
df334545 5465#ifdef CONFIG_RPS
0a9627f2
TH
5466 dev->_rx = rx;
5467 dev->num_rx_queues = queue_count;
df334545 5468#endif
0a9627f2 5469
82cc1a7a 5470 dev->gso_max_size = GSO_MAX_SIZE;
1da177e4 5471
bb949fbd
DM
5472 netdev_init_queues(dev);
5473
15682bc4
PWJ
5474 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5475 dev->ethtool_ntuple_list.count = 0;
d565b0a1 5476 INIT_LIST_HEAD(&dev->napi_list);
9fdce099 5477 INIT_LIST_HEAD(&dev->unreg_list);
e014debe 5478 INIT_LIST_HEAD(&dev->link_watch_list);
93f154b5 5479 dev->priv_flags = IFF_XMIT_DST_RELEASE;
1da177e4
LT
5480 setup(dev);
5481 strcpy(dev->name, name);
5482 return dev;
ab9c73cc 5483
0a9627f2 5484free_rx:
df334545 5485#ifdef CONFIG_RPS
0a9627f2 5486 kfree(rx);
ab9c73cc 5487free_tx:
df334545 5488#endif
ab9c73cc 5489 kfree(tx);
ab9c73cc
JP
5490free_p:
5491 kfree(p);
5492 return NULL;
1da177e4 5493}
f25f4e44 5494EXPORT_SYMBOL(alloc_netdev_mq);
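/*
 * Illustrative sketch (not part of dev.c): allocating a four-queue Ethernet
 * device with a driver-private area. struct my_priv, my_alloc() and the
 * "myeth%d" name are hypothetical; ether_setup() is the standard Ethernet
 * setup callback and netdev_priv() returns the private area allocated above.
 */
struct my_priv {
	spinlock_t lock;
};

static struct net_device *my_alloc(void)
{
	struct net_device *dev;
	struct my_priv *priv;

	dev = alloc_netdev_mq(sizeof(struct my_priv), "myeth%d",
			      ether_setup, 4);
	if (!dev)
		return NULL;

	priv = netdev_priv(dev);
	spin_lock_init(&priv->lock);
	return dev;
}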
1da177e4
LT
5495
5496/**
5497 * free_netdev - free network device
5498 * @dev: device
5499 *
4ec93edb
YH
5500 * This function does the last stage of destroying an allocated device
5501 * interface. The reference to the device object is released.
1da177e4
LT
5502 * If this is the last reference then it will be freed.
5503 */
5504void free_netdev(struct net_device *dev)
5505{
d565b0a1
HX
5506 struct napi_struct *p, *n;
5507
f3005d7f
DL
5508 release_net(dev_net(dev));
5509
e8a0464c
DM
5510 kfree(dev->_tx);
5511
f001fde5
JP
5512 /* Flush device addresses */
5513 dev_addr_flush(dev);
5514
15682bc4
PWJ
5515 /* Clear ethtool n-tuple list */
5516 ethtool_ntuple_flush(dev);
5517
d565b0a1
HX
5518 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5519 netif_napi_del(p);
5520
3041a069 5521 /* Compatibility with error handling in drivers */
1da177e4
LT
5522 if (dev->reg_state == NETREG_UNINITIALIZED) {
5523 kfree((char *)dev - dev->padded);
5524 return;
5525 }
5526
5527 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5528 dev->reg_state = NETREG_RELEASED;
5529
43cb76d9
GKH
5530 /* will free via device release */
5531 put_device(&dev->dev);
1da177e4 5532}
d1b19dff 5533EXPORT_SYMBOL(free_netdev);
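/*
 * Illustrative sketch (not part of dev.c): the usual teardown order. A
 * registered device is unregistered first and freed afterwards; a device
 * whose registration failed or was never attempted may be passed straight
 * to free_netdev(), which the NETREG_UNINITIALIZED check above handles.
 * my_remove() is hypothetical.
 */
static void my_remove(struct net_device *dev)
{
	unregister_netdev(dev);		/* waits for outstanding users */
	free_netdev(dev);		/* drops the final reference */
}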
4ec93edb 5534
f0db275a
SH
5535/**
5536 * synchronize_net - Synchronize with packet receive processing
5537 *
5538 * Wait for packets currently being received to be done.
5539 * Does not block later packets from starting.
5540 */
4ec93edb 5541void synchronize_net(void)
1da177e4
LT
5542{
5543 might_sleep();
fbd568a3 5544 synchronize_rcu();
1da177e4 5545}
d1b19dff 5546EXPORT_SYMBOL(synchronize_net);
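/*
 * Illustrative sketch (not part of dev.c): a typical use of synchronize_net()
 * when tearing down an RCU-protected receive handler. struct my_handler,
 * my_handler_lock and my_handler_remove() are hypothetical.
 */
struct my_handler {
	struct list_head list;
	/* handler state */
};

static DEFINE_SPINLOCK(my_handler_lock);

static void my_handler_remove(struct my_handler *h)
{
	spin_lock(&my_handler_lock);
	list_del_rcu(&h->list);
	spin_unlock(&my_handler_lock);

	synchronize_net();	/* no receive path can still see h after this */
	kfree(h);
}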
1da177e4
LT
5547
5548/**
44a0873d 5549 * unregister_netdevice_queue - remove device from the kernel
1da177e4 5550 * @dev: device
44a0873d 5551 * @head: list
6ebfbc06 5552 *
1da177e4 5553 * This function shuts down a device interface and removes it
d59b54b1 5554 * from the kernel tables.
44a0873d 5555 * If head is not NULL, the device is queued to be unregistered later.
1da177e4
LT
5556 *
5557 * Callers must hold the rtnl semaphore. You may want
5558 * unregister_netdev() instead of this.
5559 */
5560
44a0873d 5561void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
1da177e4 5562{
a6620712
HX
5563 ASSERT_RTNL();
5564
44a0873d 5565 if (head) {
9fdce099 5566 list_move_tail(&dev->unreg_list, head);
44a0873d
ED
5567 } else {
5568 rollback_registered(dev);
5569 /* Finish processing unregister after unlock */
5570 net_set_todo(dev);
5571 }
1da177e4 5572}
44a0873d 5573EXPORT_SYMBOL(unregister_netdevice_queue);
1da177e4 5574
9b5e383c
ED
5575/**
5576 * unregister_netdevice_many - unregister many devices
5577 * @head: list of devices
9b5e383c
ED
5578 */
5579void unregister_netdevice_many(struct list_head *head)
5580{
5581 struct net_device *dev;
5582
5583 if (!list_empty(head)) {
5584 rollback_registered_many(head);
5585 list_for_each_entry(dev, head, unreg_list)
5586 net_set_todo(dev);
5587 }
5588}
63c8099d 5589EXPORT_SYMBOL(unregister_netdevice_many);
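/*
 * Illustrative sketch (not part of dev.c): batching unregistration of several
 * devices in one rtnl section, the pattern rtnl_link_ops ->dellink()
 * implementations rely on. my_destroy_all() is hypothetical.
 */
static void my_destroy_all(struct net_device **devs, int count)
{
	LIST_HEAD(kill_list);
	int i;

	rtnl_lock();
	for (i = 0; i < count; i++)
		unregister_netdevice_queue(devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);	/* one rollback for the batch */
	rtnl_unlock();
}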
9b5e383c 5590
1da177e4
LT
5591/**
5592 * unregister_netdev - remove device from the kernel
5593 * @dev: device
5594 *
5595 * This function shuts down a device interface and removes it
d59b54b1 5596 * from the kernel tables.
1da177e4
LT
5597 *
5598 * This is just a wrapper for unregister_netdevice that takes
5599 * the rtnl semaphore. In general you want to use this and not
5600 * unregister_netdevice.
5601 */
5602void unregister_netdev(struct net_device *dev)
5603{
5604 rtnl_lock();
5605 unregister_netdevice(dev);
5606 rtnl_unlock();
5607}
1da177e4
LT
5608EXPORT_SYMBOL(unregister_netdev);
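/*
 * Illustrative sketch (not part of dev.c): when the caller already holds the
 * rtnl lock, the unlocked unregister_netdevice() is used instead of this
 * wrapper, as the comment above advises. my_remove_locked() is hypothetical.
 */
static void my_remove_locked(struct net_device *dev)
{
	ASSERT_RTNL();			/* rtnl must already be held here */
	unregister_netdevice(dev);	/* same work, no extra locking */
}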
5609
ce286d32
EB
5610/**
5611 * dev_change_net_namespace - move device to a different network namespace
5612 * @dev: device
5613 * @net: network namespace
5614 * @pat: if not NULL, name pattern to try if the current device name
5615 * is already taken in the destination network namespace.
5616 *
5617 * This function shuts down a device interface and moves it
5618 * to a new network namespace. On success 0 is returned, on
5619 * failure a negative errno code is returned.
5620 *
5621 * Callers must hold the rtnl semaphore.
5622 */
5623
5624int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5625{
ce286d32
EB
5626 int err;
5627
5628 ASSERT_RTNL();
5629
5630 /* Don't allow namespace local devices to be moved. */
5631 err = -EINVAL;
5632 if (dev->features & NETIF_F_NETNS_LOCAL)
5633 goto out;
5634
5635 /* Ensure the device has been registered */
5636 err = -EINVAL;
5637 if (dev->reg_state != NETREG_REGISTERED)
5638 goto out;
5639
5640 /* Get out if there is nothing to do */
5641 err = 0;
878628fb 5642 if (net_eq(dev_net(dev), net))
ce286d32
EB
5643 goto out;
5644
5645 /* Pick the destination device name, and ensure
5646 * we can use it in the destination network namespace.
5647 */
5648 err = -EEXIST;
d9031024 5649 if (__dev_get_by_name(net, dev->name)) {
ce286d32
EB
5650 /* We get here if we can't use the current device name */
5651 if (!pat)
5652 goto out;
8ce6cebc 5653 if (dev_get_valid_name(dev, pat, 1))
ce286d32
EB
5654 goto out;
5655 }
5656
5657 /*
5658 * And now a mini version of register_netdevice and unregister_netdevice.
5659 */
5660
5661 /* If device is running close it first. */
9b772652 5662 dev_close(dev);
ce286d32
EB
5663
5664 /* And unlink it from device chain */
5665 err = -ENODEV;
5666 unlist_netdevice(dev);
5667
5668 synchronize_net();
5669
5670 /* Shutdown queueing discipline. */
5671 dev_shutdown(dev);
5672
5673 /* Notify protocols that we are about to destroy
5674 this device. They should clean up all of their state.
5675 */
5676 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
a5ee1551 5677 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
ce286d32
EB
5678
5679 /*
5680 * Flush the unicast and multicast chains
5681 */
a748ee24 5682 dev_uc_flush(dev);
22bedad3 5683 dev_mc_flush(dev);
ce286d32
EB
5684
5685 /* Actually switch the network namespace */
c346dca1 5686 dev_net_set(dev, net);
ce286d32 5687
ce286d32
EB
5688 /* If there is an ifindex conflict assign a new one */
5689 if (__dev_get_by_index(net, dev->ifindex)) {
5690 int iflink = (dev->iflink == dev->ifindex);
5691 dev->ifindex = dev_new_index(net);
5692 if (iflink)
5693 dev->iflink = dev->ifindex;
5694 }
5695
8b41d188 5696 /* Fixup kobjects */
a1b3f594 5697 err = device_rename(&dev->dev, dev->name);
8b41d188 5698 WARN_ON(err);
ce286d32
EB
5699
5700 /* Add the device back in the hashes */
5701 list_netdevice(dev);
5702
5703 /* Notify protocols that a new device appeared. */
5704 call_netdevice_notifiers(NETDEV_REGISTER, dev);
5705
d90a909e
EB
5706 /*
5707 * Prevent userspace races by waiting until the network
5708 * device is fully setup before sending notifications.
5709 */
5710 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5711
ce286d32
EB
5712 synchronize_net();
5713 err = 0;
5714out:
5715 return err;
5716}
463d0183 5717EXPORT_SYMBOL_GPL(dev_change_net_namespace);
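/*
 * Illustrative sketch (not part of dev.c): moving a device into another
 * network namespace under rtnl, falling back to a "dev%d" name if the
 * current name is already taken there. my_move() is hypothetical.
 */
static int my_move(struct net_device *dev, struct net *target)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, target, "dev%d");
	rtnl_unlock();

	return err;
}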
ce286d32 5718
1da177e4
LT
5719static int dev_cpu_callback(struct notifier_block *nfb,
5720 unsigned long action,
5721 void *ocpu)
5722{
5723 struct sk_buff **list_skb;
1da177e4
LT
5724 struct sk_buff *skb;
5725 unsigned int cpu, oldcpu = (unsigned long)ocpu;
5726 struct softnet_data *sd, *oldsd;
5727
8bb78442 5728 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1da177e4
LT
5729 return NOTIFY_OK;
5730
5731 local_irq_disable();
5732 cpu = smp_processor_id();
5733 sd = &per_cpu(softnet_data, cpu);
5734 oldsd = &per_cpu(softnet_data, oldcpu);
5735
5736 /* Find end of our completion_queue. */
5737 list_skb = &sd->completion_queue;
5738 while (*list_skb)
5739 list_skb = &(*list_skb)->next;
5740 /* Append completion queue from offline CPU. */
5741 *list_skb = oldsd->completion_queue;
5742 oldsd->completion_queue = NULL;
5743
1da177e4 5744 /* Append output queue from offline CPU. */
a9cbd588
CG
5745 if (oldsd->output_queue) {
5746 *sd->output_queue_tailp = oldsd->output_queue;
5747 sd->output_queue_tailp = oldsd->output_queue_tailp;
5748 oldsd->output_queue = NULL;
5749 oldsd->output_queue_tailp = &oldsd->output_queue;
5750 }
1da177e4
LT
5751
5752 raise_softirq_irqoff(NET_TX_SOFTIRQ);
5753 local_irq_enable();
5754
5755 /* Process offline CPU's input_pkt_queue */
76cc8b13 5756 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
1da177e4 5757 netif_rx(skb);
76cc8b13 5758 input_queue_head_incr(oldsd);
fec5e652 5759 }
76cc8b13 5760 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6e7676c1 5761 netif_rx(skb);
76cc8b13
TH
5762 input_queue_head_incr(oldsd);
5763 }
1da177e4
LT
5764
5765 return NOTIFY_OK;
5766}
1da177e4
LT
5767
5768
7f353bf2 5769/**
b63365a2
HX
5770 * netdev_increment_features - increment feature set by one
5771 * @all: current feature set
5772 * @one: new feature set
5773 * @mask: mask feature set
7f353bf2
HX
5774 *
5775 * Computes a new feature set after adding a device with feature set
b63365a2
HX
5776 * @one to the master device with current feature set @all. Will not
5777 * enable anything that is off in @mask. Returns the new feature set.
7f353bf2 5778 */
b63365a2
HX
5779unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5780 unsigned long mask)
5781{
5782 /* If device needs checksumming, downgrade to it. */
d1b19dff 5783 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
b63365a2
HX
5784 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5785 else if (mask & NETIF_F_ALL_CSUM) {
5786 /* If one device supports v4/v6 checksumming, set for all. */
5787 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5788 !(all & NETIF_F_GEN_CSUM)) {
5789 all &= ~NETIF_F_ALL_CSUM;
5790 all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5791 }
e2a6b852 5792
b63365a2
HX
5793 /* If one device supports hw checksumming, set for all. */
5794 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5795 all &= ~NETIF_F_ALL_CSUM;
5796 all |= NETIF_F_HW_CSUM;
5797 }
5798 }
7f353bf2 5799
b63365a2 5800 one |= NETIF_F_ALL_CSUM;
7f353bf2 5801
b63365a2 5802 one |= all & NETIF_F_ONE_FOR_ALL;
d9f5950f 5803 all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
b63365a2 5804 all |= one & mask & NETIF_F_ONE_FOR_ALL;
7f353bf2
HX
5805
5806 return all;
5807}
b63365a2 5808EXPORT_SYMBOL(netdev_increment_features);
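/*
 * Illustrative sketch (not part of dev.c): how a master device (bridge- or
 * bond-like) can recompute its feature set from its ports, folding in each
 * port's features. struct my_port and my_recompute_features() are
 * hypothetical; NETIF_F_ONE_FOR_ALL is the mask used above.
 */
struct my_port {
	struct list_head list;
	struct net_device *dev;
};

static unsigned long my_recompute_features(struct net_device *master,
					   struct list_head *ports)
{
	struct my_port *p;
	unsigned long features = master->features;

	list_for_each_entry(p, ports, list)
		features = netdev_increment_features(features,
						     p->dev->features,
						     NETIF_F_ONE_FOR_ALL);
	return features;
}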
7f353bf2 5809
30d97d35
PE
5810static struct hlist_head *netdev_create_hash(void)
5811{
5812 int i;
5813 struct hlist_head *hash;
5814
5815 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5816 if (hash != NULL)
5817 for (i = 0; i < NETDEV_HASHENTRIES; i++)
5818 INIT_HLIST_HEAD(&hash[i]);
5819
5820 return hash;
5821}
5822
881d966b 5823/* Initialize per network namespace state */
4665079c 5824static int __net_init netdev_init(struct net *net)
881d966b 5825{
881d966b 5826 INIT_LIST_HEAD(&net->dev_base_head);
881d966b 5827
30d97d35
PE
5828 net->dev_name_head = netdev_create_hash();
5829 if (net->dev_name_head == NULL)
5830 goto err_name;
881d966b 5831
30d97d35
PE
5832 net->dev_index_head = netdev_create_hash();
5833 if (net->dev_index_head == NULL)
5834 goto err_idx;
881d966b
EB
5835
5836 return 0;
30d97d35
PE
5837
5838err_idx:
5839 kfree(net->dev_name_head);
5840err_name:
5841 return -ENOMEM;
881d966b
EB
5842}
5843
f0db275a
SH
5844/**
5845 * netdev_drivername - network driver for the device
5846 * @dev: network device
5847 * @buffer: buffer for resulting name
5848 * @len: size of buffer
5849 *
5850 * Determine network driver for device.
5851 */
cf04a4c7 5852char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6579e57b 5853{
cf04a4c7
SH
5854 const struct device_driver *driver;
5855 const struct device *parent;
6579e57b
AV
5856
5857 if (len <= 0 || !buffer)
5858 return buffer;
5859 buffer[0] = 0;
5860
5861 parent = dev->dev.parent;
5862
5863 if (!parent)
5864 return buffer;
5865
5866 driver = parent->driver;
5867 if (driver && driver->name)
5868 strlcpy(buffer, driver->name, len);
5869 return buffer;
5870}
5871
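/*
 * Illustrative sketch (not part of dev.c): resolving the driver name for a
 * diagnostic message, in the spirit of the transmit-watchdog warning.
 * my_report_timeout() is hypothetical.
 */
static void my_report_timeout(struct net_device *dev)
{
	char drivername[64];

	netdev_drivername(dev, drivername, sizeof(drivername));
	printk(KERN_WARNING "%s (%s): transmit appears stuck\n",
	       dev->name, drivername);
}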
256df2f3
JP
5872static int __netdev_printk(const char *level, const struct net_device *dev,
5873 struct va_format *vaf)
5874{
5875 int r;
5876
5877 if (dev && dev->dev.parent)
5878 r = dev_printk(level, dev->dev.parent, "%s: %pV",
5879 netdev_name(dev), vaf);
5880 else if (dev)
5881 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
5882 else
5883 r = printk("%s(NULL net_device): %pV", level, vaf);
5884
5885 return r;
5886}
5887
5888int netdev_printk(const char *level, const struct net_device *dev,
5889 const char *format, ...)
5890{
5891 struct va_format vaf;
5892 va_list args;
5893 int r;
5894
5895 va_start(args, format);
5896
5897 vaf.fmt = format;
5898 vaf.va = &args;
5899
5900 r = __netdev_printk(level, dev, &vaf);
5901 va_end(args);
5902
5903 return r;
5904}
5905EXPORT_SYMBOL(netdev_printk);
5906
5907#define define_netdev_printk_level(func, level) \
5908int func(const struct net_device *dev, const char *fmt, ...) \
5909{ \
5910 int r; \
5911 struct va_format vaf; \
5912 va_list args; \
5913 \
5914 va_start(args, fmt); \
5915 \
5916 vaf.fmt = fmt; \
5917 vaf.va = &args; \
5918 \
5919 r = __netdev_printk(level, dev, &vaf); \
5920 va_end(args); \
5921 \
5922 return r; \
5923} \
5924EXPORT_SYMBOL(func);
5925
5926define_netdev_printk_level(netdev_emerg, KERN_EMERG);
5927define_netdev_printk_level(netdev_alert, KERN_ALERT);
5928define_netdev_printk_level(netdev_crit, KERN_CRIT);
5929define_netdev_printk_level(netdev_err, KERN_ERR);
5930define_netdev_printk_level(netdev_warn, KERN_WARNING);
5931define_netdev_printk_level(netdev_notice, KERN_NOTICE);
5932define_netdev_printk_level(netdev_info, KERN_INFO);
5933
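/*
 * Illustrative sketch (not part of dev.c): the per-level helpers generated
 * above are used like printk() but prefix the message with driver and device
 * information. my_link_change() is hypothetical.
 */
static void my_link_change(struct net_device *dev, bool up, int speed)
{
	if (up)
		netdev_info(dev, "link up, %d Mbps\n", speed);
	else
		netdev_warn(dev, "link down\n");
}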
4665079c 5934static void __net_exit netdev_exit(struct net *net)
881d966b
EB
5935{
5936 kfree(net->dev_name_head);
5937 kfree(net->dev_index_head);
5938}
5939
022cbae6 5940static struct pernet_operations __net_initdata netdev_net_ops = {
881d966b
EB
5941 .init = netdev_init,
5942 .exit = netdev_exit,
5943};
5944
4665079c 5945static void __net_exit default_device_exit(struct net *net)
ce286d32 5946{
e008b5fc 5947 struct net_device *dev, *aux;
ce286d32 5948 /*
e008b5fc 5949 * Push all migratable network devices back to the
ce286d32
EB
5950 * initial network namespace
5951 */
5952 rtnl_lock();
e008b5fc 5953 for_each_netdev_safe(net, dev, aux) {
ce286d32 5954 int err;
aca51397 5955 char fb_name[IFNAMSIZ];
ce286d32
EB
5956
5957 /* Ignore unmovable devices (e.g. loopback) */
5958 if (dev->features & NETIF_F_NETNS_LOCAL)
5959 continue;
5960
e008b5fc
EB
5961 /* Leave virtual devices for the generic cleanup */
5962 if (dev->rtnl_link_ops)
5963 continue;
d0c082ce 5964
ce286d32 5965 /* Push remaining network devices to init_net */
aca51397
PE
5966 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5967 err = dev_change_net_namespace(dev, &init_net, fb_name);
ce286d32 5968 if (err) {
aca51397 5969 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
ce286d32 5970 __func__, dev->name, err);
aca51397 5971 BUG();
ce286d32
EB
5972 }
5973 }
5974 rtnl_unlock();
5975}
5976
04dc7f6b
EB
5977static void __net_exit default_device_exit_batch(struct list_head *net_list)
5978{
5979 /* At exit all network devices must be removed from a network
5980 * namespace. Do this in the reverse order of registration.
5981 * Do this across as many network namespaces as possible to
5982 * improve batching efficiency.
5983 */
5984 struct net_device *dev;
5985 struct net *net;
5986 LIST_HEAD(dev_kill_list);
5987
5988 rtnl_lock();
5989 list_for_each_entry(net, net_list, exit_list) {
5990 for_each_netdev_reverse(net, dev) {
5991 if (dev->rtnl_link_ops)
5992 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
5993 else
5994 unregister_netdevice_queue(dev, &dev_kill_list);
5995 }
5996 }
5997 unregister_netdevice_many(&dev_kill_list);
5998 rtnl_unlock();
5999}
6000
022cbae6 6001static struct pernet_operations __net_initdata default_device_ops = {
ce286d32 6002 .exit = default_device_exit,
04dc7f6b 6003 .exit_batch = default_device_exit_batch,
ce286d32
EB
6004};
6005
1da177e4
LT
6006/*
6007 * Initialize the DEV module. At boot time this walks the device list and
6008 * unhooks any devices that fail to initialise (normally hardware not
6009 * present) and leaves us with a valid list of present and active devices.
6010 *
6011 */
6012
6013/*
6014 * This is called single threaded during boot, so no need
6015 * to take the rtnl semaphore.
6016 */
6017static int __init net_dev_init(void)
6018{
6019 int i, rc = -ENOMEM;
6020
6021 BUG_ON(!dev_boot_phase);
6022
1da177e4
LT
6023 if (dev_proc_init())
6024 goto out;
6025
8b41d188 6026 if (netdev_kobject_init())
1da177e4
LT
6027 goto out;
6028
6029 INIT_LIST_HEAD(&ptype_all);
82d8a867 6030 for (i = 0; i < PTYPE_HASH_SIZE; i++)
1da177e4
LT
6031 INIT_LIST_HEAD(&ptype_base[i]);
6032
881d966b
EB
6033 if (register_pernet_subsys(&netdev_net_ops))
6034 goto out;
1da177e4
LT
6035
6036 /*
6037 * Initialise the packet receive queues.
6038 */
6039
6f912042 6040 for_each_possible_cpu(i) {
e36fa2f7 6041 struct softnet_data *sd = &per_cpu(softnet_data, i);
1da177e4 6042
dee42870 6043 memset(sd, 0, sizeof(*sd));
e36fa2f7 6044 skb_queue_head_init(&sd->input_pkt_queue);
6e7676c1 6045 skb_queue_head_init(&sd->process_queue);
e36fa2f7
ED
6046 sd->completion_queue = NULL;
6047 INIT_LIST_HEAD(&sd->poll_list);
a9cbd588
CG
6048 sd->output_queue = NULL;
6049 sd->output_queue_tailp = &sd->output_queue;
df334545 6050#ifdef CONFIG_RPS
e36fa2f7
ED
6051 sd->csd.func = rps_trigger_softirq;
6052 sd->csd.info = sd;
6053 sd->csd.flags = 0;
6054 sd->cpu = i;
1e94d72f 6055#endif
0a9627f2 6056
e36fa2f7
ED
6057 sd->backlog.poll = process_backlog;
6058 sd->backlog.weight = weight_p;
6059 sd->backlog.gro_list = NULL;
6060 sd->backlog.gro_count = 0;
1da177e4
LT
6061 }
6062
1da177e4
LT
6063 dev_boot_phase = 0;
6064
505d4f73
EB
6065 /* The loopback device is special: if any other network device
6066 * is present in a network namespace, the loopback device must
6067 * be present as well. Since we now dynamically allocate and
6068 * free the loopback device, ensure this invariant is maintained
6069 * by keeping the loopback device as the first device on the
6070 * list of network devices: it must be the first device that
6071 * appears in a namespace and the last network device that
6072 * disappears from it.
6073 */
6074 if (register_pernet_device(&loopback_net_ops))
6075 goto out;
6076
6077 if (register_pernet_device(&default_device_ops))
6078 goto out;
6079
962cf36c
CM
6080 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6081 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
1da177e4
LT
6082
6083 hotcpu_notifier(dev_cpu_callback, 0);
6084 dst_init();
6085 dev_mcast_init();
6086 rc = 0;
6087out:
6088 return rc;
6089}
6090
6091subsys_initcall(net_dev_init);
6092
e88721f8
KK
6093static int __init initialize_hashrnd(void)
6094{
0a9627f2 6095 get_random_bytes(&hashrnd, sizeof(hashrnd));
e88721f8
KK
6096 return 0;
6097}
6098
6099late_initcall_sync(initialize_hashrnd);
6100