]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/core/dev.c
rps: static functions
[net-next-2.6.git] / net / core / dev.c
CommitLineData
1da177e4
LT
1/*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
02c30a84 10 * Authors: Ross Biro
1da177e4
LT
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75#include <asm/uaccess.h>
76#include <asm/system.h>
77#include <linux/bitops.h>
4fc268d2 78#include <linux/capability.h>
1da177e4
LT
79#include <linux/cpu.h>
80#include <linux/types.h>
81#include <linux/kernel.h>
08e9897d 82#include <linux/hash.h>
5a0e3ad6 83#include <linux/slab.h>
1da177e4 84#include <linux/sched.h>
4a3e2f71 85#include <linux/mutex.h>
1da177e4
LT
86#include <linux/string.h>
87#include <linux/mm.h>
88#include <linux/socket.h>
89#include <linux/sockios.h>
90#include <linux/errno.h>
91#include <linux/interrupt.h>
92#include <linux/if_ether.h>
93#include <linux/netdevice.h>
94#include <linux/etherdevice.h>
0187bdfb 95#include <linux/ethtool.h>
1da177e4
LT
96#include <linux/notifier.h>
97#include <linux/skbuff.h>
457c4cbc 98#include <net/net_namespace.h>
1da177e4
LT
99#include <net/sock.h>
100#include <linux/rtnetlink.h>
101#include <linux/proc_fs.h>
102#include <linux/seq_file.h>
103#include <linux/stat.h>
104#include <linux/if_bridge.h>
b863ceb7 105#include <linux/if_macvlan.h>
1da177e4
LT
106#include <net/dst.h>
107#include <net/pkt_sched.h>
108#include <net/checksum.h>
44540960 109#include <net/xfrm.h>
1da177e4
LT
110#include <linux/highmem.h>
111#include <linux/init.h>
112#include <linux/kmod.h>
113#include <linux/module.h>
1da177e4
LT
114#include <linux/netpoll.h>
115#include <linux/rcupdate.h>
116#include <linux/delay.h>
295f4a1f 117#include <net/wext.h>
1da177e4 118#include <net/iw_handler.h>
1da177e4 119#include <asm/current.h>
5bdb9886 120#include <linux/audit.h>
db217334 121#include <linux/dmaengine.h>
f6a78bfc 122#include <linux/err.h>
c7fa9d18 123#include <linux/ctype.h>
723e98b7 124#include <linux/if_arp.h>
6de329e2 125#include <linux/if_vlan.h>
8f0f2223 126#include <linux/ip.h>
ad55dcaf 127#include <net/ip.h>
8f0f2223
DM
128#include <linux/ipv6.h>
129#include <linux/in.h>
b6b2fed1
DM
130#include <linux/jhash.h>
131#include <linux/random.h>
9cbc1cb8 132#include <trace/events/napi.h>
5acbbd42 133#include <linux/pci.h>
1da177e4 134
342709ef
PE
135#include "net-sysfs.h"
136
d565b0a1
HX
137/* Instead of increasing this, you should create a hash table. */
138#define MAX_GRO_SKBS 8
139
5d38a079
HX
140/* This should be increased if a protocol with a bigger head is added. */
141#define GRO_MAX_HEAD (MAX_HEADER + 128)
142
1da177e4
LT
143/*
144 * The list of packet types we will receive (as opposed to discard)
145 * and the routines to invoke.
146 *
147 * Why 16. Because with 16 the only overlap we get on a hash of the
148 * low nibble of the protocol value is RARP/SNAP/X.25.
149 *
150 * NOTE: That is no longer true with the addition of VLAN tags. Not
151 * sure which should go first, but I bet it won't make much
152 * difference if we are running VLANs. The good news is that
153 * this protocol won't be in the list unless compiled in, so
3041a069 154 * the average user (w/out VLANs) will not be adversely affected.
1da177e4
LT
155 * --BLG
156 *
157 * 0800 IP
158 * 8100 802.1Q VLAN
159 * 0001 802.3
160 * 0002 AX.25
161 * 0004 802.2
162 * 8035 RARP
163 * 0005 SNAP
164 * 0805 X.25
165 * 0806 ARP
166 * 8137 IPX
167 * 0009 Localtalk
168 * 86DD IPv6
169 */
170
82d8a867
PE
171#define PTYPE_HASH_SIZE (16)
172#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
173
1da177e4 174static DEFINE_SPINLOCK(ptype_lock);
82d8a867 175static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
6b2bedc3 176static struct list_head ptype_all __read_mostly; /* Taps */
1da177e4 177
1da177e4 178/*
7562f876 179 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
1da177e4
LT
180 * semaphore.
181 *
c6d14c84 182 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
1da177e4
LT
183 *
184 * Writers must hold the rtnl semaphore while they loop through the
7562f876 185 * dev_base_head list, and hold dev_base_lock for writing when they do the
1da177e4
LT
186 * actual updates. This allows pure readers to access the list even
187 * while a writer is preparing to update it.
188 *
189 * To put it another way, dev_base_lock is held for writing only to
190 * protect against pure readers; the rtnl semaphore provides the
191 * protection against other writers.
192 *
193 * See, for example usages, register_netdevice() and
194 * unregister_netdevice(), which must be called with the rtnl
195 * semaphore held.
196 */
1da177e4 197DEFINE_RWLOCK(dev_base_lock);
1da177e4
LT
198EXPORT_SYMBOL(dev_base_lock);
199
881d966b 200static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
1da177e4
LT
201{
202 unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
08e9897d 203 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
1da177e4
LT
204}
205
881d966b 206static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
1da177e4 207{
7c28bd0b 208 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
1da177e4
LT
209}
210
152102c7
CG
211static inline void rps_lock(struct softnet_data *queue)
212{
213#ifdef CONFIG_RPS
214 spin_lock(&queue->input_pkt_queue.lock);
215#endif
216}
217
218static inline void rps_unlock(struct softnet_data *queue)
219{
220#ifdef CONFIG_RPS
221 spin_unlock(&queue->input_pkt_queue.lock);
222#endif
223}
224
ce286d32
EB
225/* Device list insertion */
226static int list_netdevice(struct net_device *dev)
227{
c346dca1 228 struct net *net = dev_net(dev);
ce286d32
EB
229
230 ASSERT_RTNL();
231
232 write_lock_bh(&dev_base_lock);
c6d14c84 233 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
72c9528b 234 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
fb699dfd
ED
235 hlist_add_head_rcu(&dev->index_hlist,
236 dev_index_hash(net, dev->ifindex));
ce286d32
EB
237 write_unlock_bh(&dev_base_lock);
238 return 0;
239}
240
fb699dfd
ED
241/* Device list removal
242 * caller must respect a RCU grace period before freeing/reusing dev
243 */
ce286d32
EB
244static void unlist_netdevice(struct net_device *dev)
245{
246 ASSERT_RTNL();
247
248 /* Unlink dev from the device chain */
249 write_lock_bh(&dev_base_lock);
c6d14c84 250 list_del_rcu(&dev->dev_list);
72c9528b 251 hlist_del_rcu(&dev->name_hlist);
fb699dfd 252 hlist_del_rcu(&dev->index_hlist);
ce286d32
EB
253 write_unlock_bh(&dev_base_lock);
254}
255
1da177e4
LT
256/*
257 * Our notifier list
258 */
259
f07d5b94 260static RAW_NOTIFIER_HEAD(netdev_chain);
1da177e4
LT
261
262/*
263 * Device drivers call our routines to queue packets here. We empty the
264 * queue in the local softnet handler.
265 */
bea3348e 266
9958da05 267DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
d1b19dff 268EXPORT_PER_CPU_SYMBOL(softnet_data);
1da177e4 269
cf508b12 270#ifdef CONFIG_LOCKDEP
723e98b7 271/*
c773e847 272 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
723e98b7
JP
273 * according to dev->type
274 */
275static const unsigned short netdev_lock_type[] =
276 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
277 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
278 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
279 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
280 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
281 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
282 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
283 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
284 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
285 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
286 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
287 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
288 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
2d91d78b 289 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
929122cd 290 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
fcb94e42 291 ARPHRD_VOID, ARPHRD_NONE};
723e98b7 292
36cbd3dc 293static const char *const netdev_lock_name[] =
723e98b7
JP
294 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
295 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
296 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
297 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
298 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
299 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
300 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
301 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
302 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
303 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
304 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
305 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
306 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
2d91d78b 307 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
929122cd 308 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
fcb94e42 309 "_xmit_VOID", "_xmit_NONE"};
723e98b7
JP
310
311static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
cf508b12 312static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
723e98b7
JP
313
314static inline unsigned short netdev_lock_pos(unsigned short dev_type)
315{
316 int i;
317
318 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
319 if (netdev_lock_type[i] == dev_type)
320 return i;
321 /* the last key is used by default */
322 return ARRAY_SIZE(netdev_lock_type) - 1;
323}
324
cf508b12
DM
325static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
326 unsigned short dev_type)
723e98b7
JP
327{
328 int i;
329
330 i = netdev_lock_pos(dev_type);
331 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
332 netdev_lock_name[i]);
333}
cf508b12
DM
334
335static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
336{
337 int i;
338
339 i = netdev_lock_pos(dev->type);
340 lockdep_set_class_and_name(&dev->addr_list_lock,
341 &netdev_addr_lock_key[i],
342 netdev_lock_name[i]);
343}
723e98b7 344#else
cf508b12
DM
345static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
346 unsigned short dev_type)
347{
348}
349static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
723e98b7
JP
350{
351}
352#endif
1da177e4
LT
353
354/*******************************************************************************
355
356 Protocol management and registration routines
357
358*******************************************************************************/
359
1da177e4
LT
360/*
361 * Add a protocol ID to the list. Now that the input handler is
362 * smarter we can dispense with all the messy stuff that used to be
363 * here.
364 *
365 * BEWARE!!! Protocol handlers, mangling input packets,
366 * MUST BE last in hash buckets and checking protocol handlers
367 * MUST start from promiscuous ptype_all chain in net_bh.
368 * It is true now, do not change it.
369 * Explanation follows: if protocol handler, mangling packet, will
370 * be the first on list, it is not able to sense, that packet
371 * is cloned and should be copied-on-write, so that it will
372 * change it and subsequent readers will get broken packet.
373 * --ANK (980803)
374 */
375
376/**
377 * dev_add_pack - add packet handler
378 * @pt: packet type declaration
379 *
380 * Add a protocol handler to the networking stack. The passed &packet_type
381 * is linked into kernel lists and may not be freed until it has been
382 * removed from the kernel lists.
383 *
4ec93edb 384 * This call does not sleep therefore it can not
1da177e4
LT
385 * guarantee all CPU's that are in middle of receiving packets
386 * will see the new packet type (until the next received packet).
387 */
388
389void dev_add_pack(struct packet_type *pt)
390{
391 int hash;
392
393 spin_lock_bh(&ptype_lock);
9be9a6b9 394 if (pt->type == htons(ETH_P_ALL))
1da177e4 395 list_add_rcu(&pt->list, &ptype_all);
9be9a6b9 396 else {
82d8a867 397 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
1da177e4
LT
398 list_add_rcu(&pt->list, &ptype_base[hash]);
399 }
400 spin_unlock_bh(&ptype_lock);
401}
d1b19dff 402EXPORT_SYMBOL(dev_add_pack);
1da177e4 403
1da177e4
LT
404/**
405 * __dev_remove_pack - remove packet handler
406 * @pt: packet type declaration
407 *
408 * Remove a protocol handler that was previously added to the kernel
409 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
410 * from the kernel lists and can be freed or reused once this function
4ec93edb 411 * returns.
1da177e4
LT
412 *
413 * The packet type might still be in use by receivers
414 * and must not be freed until after all the CPU's have gone
415 * through a quiescent state.
416 */
417void __dev_remove_pack(struct packet_type *pt)
418{
419 struct list_head *head;
420 struct packet_type *pt1;
421
422 spin_lock_bh(&ptype_lock);
423
9be9a6b9 424 if (pt->type == htons(ETH_P_ALL))
1da177e4 425 head = &ptype_all;
9be9a6b9 426 else
82d8a867 427 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
1da177e4
LT
428
429 list_for_each_entry(pt1, head, list) {
430 if (pt == pt1) {
431 list_del_rcu(&pt->list);
432 goto out;
433 }
434 }
435
436 printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
437out:
438 spin_unlock_bh(&ptype_lock);
439}
d1b19dff
ED
440EXPORT_SYMBOL(__dev_remove_pack);
441
1da177e4
LT
442/**
443 * dev_remove_pack - remove packet handler
444 * @pt: packet type declaration
445 *
446 * Remove a protocol handler that was previously added to the kernel
447 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
448 * from the kernel lists and can be freed or reused once this function
449 * returns.
450 *
451 * This call sleeps to guarantee that no CPU is looking at the packet
452 * type after return.
453 */
454void dev_remove_pack(struct packet_type *pt)
455{
456 __dev_remove_pack(pt);
4ec93edb 457
1da177e4
LT
458 synchronize_net();
459}
d1b19dff 460EXPORT_SYMBOL(dev_remove_pack);
1da177e4
LT
461
462/******************************************************************************
463
464 Device Boot-time Settings Routines
465
466*******************************************************************************/
467
468/* Boot time configuration table */
469static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
470
471/**
472 * netdev_boot_setup_add - add new setup entry
473 * @name: name of the device
474 * @map: configured settings for the device
475 *
476 * Adds new setup entry to the dev_boot_setup list. The function
477 * returns 0 on error and 1 on success. This is a generic routine to
478 * all netdevices.
479 */
480static int netdev_boot_setup_add(char *name, struct ifmap *map)
481{
482 struct netdev_boot_setup *s;
483 int i;
484
485 s = dev_boot_setup;
486 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
487 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
488 memset(s[i].name, 0, sizeof(s[i].name));
93b3cff9 489 strlcpy(s[i].name, name, IFNAMSIZ);
1da177e4
LT
490 memcpy(&s[i].map, map, sizeof(s[i].map));
491 break;
492 }
493 }
494
495 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
496}
497
498/**
499 * netdev_boot_setup_check - check boot time settings
500 * @dev: the netdevice
501 *
502 * Check boot time settings for the device.
503 * The found settings are set for the device to be used
504 * later in the device probing.
505 * Returns 0 if no settings found, 1 if they are.
506 */
507int netdev_boot_setup_check(struct net_device *dev)
508{
509 struct netdev_boot_setup *s = dev_boot_setup;
510 int i;
511
512 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
513 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
93b3cff9 514 !strcmp(dev->name, s[i].name)) {
1da177e4
LT
515 dev->irq = s[i].map.irq;
516 dev->base_addr = s[i].map.base_addr;
517 dev->mem_start = s[i].map.mem_start;
518 dev->mem_end = s[i].map.mem_end;
519 return 1;
520 }
521 }
522 return 0;
523}
d1b19dff 524EXPORT_SYMBOL(netdev_boot_setup_check);
1da177e4
LT
525
526
527/**
528 * netdev_boot_base - get address from boot time settings
529 * @prefix: prefix for network device
530 * @unit: id for network device
531 *
532 * Check boot time settings for the base address of device.
533 * The found settings are set for the device to be used
534 * later in the device probing.
535 * Returns 0 if no settings found.
536 */
537unsigned long netdev_boot_base(const char *prefix, int unit)
538{
539 const struct netdev_boot_setup *s = dev_boot_setup;
540 char name[IFNAMSIZ];
541 int i;
542
543 sprintf(name, "%s%d", prefix, unit);
544
545 /*
546 * If device already registered then return base of 1
547 * to indicate not to probe for this interface
548 */
881d966b 549 if (__dev_get_by_name(&init_net, name))
1da177e4
LT
550 return 1;
551
552 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
553 if (!strcmp(name, s[i].name))
554 return s[i].map.base_addr;
555 return 0;
556}
557
558/*
559 * Saves at boot time configured settings for any netdevice.
560 */
561int __init netdev_boot_setup(char *str)
562{
563 int ints[5];
564 struct ifmap map;
565
566 str = get_options(str, ARRAY_SIZE(ints), ints);
567 if (!str || !*str)
568 return 0;
569
570 /* Save settings */
571 memset(&map, 0, sizeof(map));
572 if (ints[0] > 0)
573 map.irq = ints[1];
574 if (ints[0] > 1)
575 map.base_addr = ints[2];
576 if (ints[0] > 2)
577 map.mem_start = ints[3];
578 if (ints[0] > 3)
579 map.mem_end = ints[4];
580
581 /* Add new entry to the list */
582 return netdev_boot_setup_add(str, &map);
583}
584
585__setup("netdev=", netdev_boot_setup);
586
587/*******************************************************************************
588
589 Device Interface Subroutines
590
591*******************************************************************************/
592
593/**
594 * __dev_get_by_name - find a device by its name
c4ea43c5 595 * @net: the applicable net namespace
1da177e4
LT
596 * @name: name to find
597 *
598 * Find an interface by name. Must be called under RTNL semaphore
599 * or @dev_base_lock. If the name is found a pointer to the device
600 * is returned. If the name is not found then %NULL is returned. The
601 * reference counters are not incremented so the caller must be
602 * careful with locks.
603 */
604
881d966b 605struct net_device *__dev_get_by_name(struct net *net, const char *name)
1da177e4
LT
606{
607 struct hlist_node *p;
0bd8d536
ED
608 struct net_device *dev;
609 struct hlist_head *head = dev_name_hash(net, name);
1da177e4 610
0bd8d536 611 hlist_for_each_entry(dev, p, head, name_hlist)
1da177e4
LT
612 if (!strncmp(dev->name, name, IFNAMSIZ))
613 return dev;
0bd8d536 614
1da177e4
LT
615 return NULL;
616}
d1b19dff 617EXPORT_SYMBOL(__dev_get_by_name);
1da177e4 618
72c9528b
ED
619/**
620 * dev_get_by_name_rcu - find a device by its name
621 * @net: the applicable net namespace
622 * @name: name to find
623 *
624 * Find an interface by name.
625 * If the name is found a pointer to the device is returned.
626 * If the name is not found then %NULL is returned.
627 * The reference counters are not incremented so the caller must be
628 * careful with locks. The caller must hold RCU lock.
629 */
630
631struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
632{
633 struct hlist_node *p;
634 struct net_device *dev;
635 struct hlist_head *head = dev_name_hash(net, name);
636
637 hlist_for_each_entry_rcu(dev, p, head, name_hlist)
638 if (!strncmp(dev->name, name, IFNAMSIZ))
639 return dev;
640
641 return NULL;
642}
643EXPORT_SYMBOL(dev_get_by_name_rcu);
644
1da177e4
LT
645/**
646 * dev_get_by_name - find a device by its name
c4ea43c5 647 * @net: the applicable net namespace
1da177e4
LT
648 * @name: name to find
649 *
650 * Find an interface by name. This can be called from any
651 * context and does its own locking. The returned handle has
652 * the usage count incremented and the caller must use dev_put() to
653 * release it when it is no longer needed. %NULL is returned if no
654 * matching device is found.
655 */
656
881d966b 657struct net_device *dev_get_by_name(struct net *net, const char *name)
1da177e4
LT
658{
659 struct net_device *dev;
660
72c9528b
ED
661 rcu_read_lock();
662 dev = dev_get_by_name_rcu(net, name);
1da177e4
LT
663 if (dev)
664 dev_hold(dev);
72c9528b 665 rcu_read_unlock();
1da177e4
LT
666 return dev;
667}
d1b19dff 668EXPORT_SYMBOL(dev_get_by_name);
1da177e4
LT
669
670/**
671 * __dev_get_by_index - find a device by its ifindex
c4ea43c5 672 * @net: the applicable net namespace
1da177e4
LT
673 * @ifindex: index of device
674 *
675 * Search for an interface by index. Returns %NULL if the device
676 * is not found or a pointer to the device. The device has not
677 * had its reference counter increased so the caller must be careful
678 * about locking. The caller must hold either the RTNL semaphore
679 * or @dev_base_lock.
680 */
681
881d966b 682struct net_device *__dev_get_by_index(struct net *net, int ifindex)
1da177e4
LT
683{
684 struct hlist_node *p;
0bd8d536
ED
685 struct net_device *dev;
686 struct hlist_head *head = dev_index_hash(net, ifindex);
1da177e4 687
0bd8d536 688 hlist_for_each_entry(dev, p, head, index_hlist)
1da177e4
LT
689 if (dev->ifindex == ifindex)
690 return dev;
0bd8d536 691
1da177e4
LT
692 return NULL;
693}
d1b19dff 694EXPORT_SYMBOL(__dev_get_by_index);
1da177e4 695
fb699dfd
ED
696/**
697 * dev_get_by_index_rcu - find a device by its ifindex
698 * @net: the applicable net namespace
699 * @ifindex: index of device
700 *
701 * Search for an interface by index. Returns %NULL if the device
702 * is not found or a pointer to the device. The device has not
703 * had its reference counter increased so the caller must be careful
704 * about locking. The caller must hold RCU lock.
705 */
706
707struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
708{
709 struct hlist_node *p;
710 struct net_device *dev;
711 struct hlist_head *head = dev_index_hash(net, ifindex);
712
713 hlist_for_each_entry_rcu(dev, p, head, index_hlist)
714 if (dev->ifindex == ifindex)
715 return dev;
716
717 return NULL;
718}
719EXPORT_SYMBOL(dev_get_by_index_rcu);
720
1da177e4
LT
721
722/**
723 * dev_get_by_index - find a device by its ifindex
c4ea43c5 724 * @net: the applicable net namespace
1da177e4
LT
725 * @ifindex: index of device
726 *
727 * Search for an interface by index. Returns NULL if the device
728 * is not found or a pointer to the device. The device returned has
729 * had a reference added and the pointer is safe until the user calls
730 * dev_put to indicate they have finished with it.
731 */
732
881d966b 733struct net_device *dev_get_by_index(struct net *net, int ifindex)
1da177e4
LT
734{
735 struct net_device *dev;
736
fb699dfd
ED
737 rcu_read_lock();
738 dev = dev_get_by_index_rcu(net, ifindex);
1da177e4
LT
739 if (dev)
740 dev_hold(dev);
fb699dfd 741 rcu_read_unlock();
1da177e4
LT
742 return dev;
743}
d1b19dff 744EXPORT_SYMBOL(dev_get_by_index);
1da177e4
LT
745
746/**
747 * dev_getbyhwaddr - find a device by its hardware address
c4ea43c5 748 * @net: the applicable net namespace
1da177e4
LT
749 * @type: media type of device
750 * @ha: hardware address
751 *
752 * Search for an interface by MAC address. Returns NULL if the device
753 * is not found or a pointer to the device. The caller must hold the
754 * rtnl semaphore. The returned device has not had its ref count increased
755 * and the caller must therefore be careful about locking
756 *
757 * BUGS:
758 * If the API was consistent this would be __dev_get_by_hwaddr
759 */
760
881d966b 761struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
1da177e4
LT
762{
763 struct net_device *dev;
764
765 ASSERT_RTNL();
766
81103a52 767 for_each_netdev(net, dev)
1da177e4
LT
768 if (dev->type == type &&
769 !memcmp(dev->dev_addr, ha, dev->addr_len))
7562f876
PE
770 return dev;
771
772 return NULL;
1da177e4 773}
cf309e3f
JF
774EXPORT_SYMBOL(dev_getbyhwaddr);
775
881d966b 776struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
1da177e4
LT
777{
778 struct net_device *dev;
779
4e9cac2b 780 ASSERT_RTNL();
881d966b 781 for_each_netdev(net, dev)
4e9cac2b 782 if (dev->type == type)
7562f876
PE
783 return dev;
784
785 return NULL;
4e9cac2b 786}
4e9cac2b
PM
787EXPORT_SYMBOL(__dev_getfirstbyhwtype);
788
881d966b 789struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
4e9cac2b 790{
99fe3c39 791 struct net_device *dev, *ret = NULL;
4e9cac2b 792
99fe3c39
ED
793 rcu_read_lock();
794 for_each_netdev_rcu(net, dev)
795 if (dev->type == type) {
796 dev_hold(dev);
797 ret = dev;
798 break;
799 }
800 rcu_read_unlock();
801 return ret;
1da177e4 802}
1da177e4
LT
803EXPORT_SYMBOL(dev_getfirstbyhwtype);
804
805/**
806 * dev_get_by_flags - find any device with given flags
c4ea43c5 807 * @net: the applicable net namespace
1da177e4
LT
808 * @if_flags: IFF_* values
809 * @mask: bitmask of bits in if_flags to check
810 *
811 * Search for any interface with the given flags. Returns NULL if a device
4ec93edb 812 * is not found or a pointer to the device. The device returned has
1da177e4
LT
813 * had a reference added and the pointer is safe until the user calls
814 * dev_put to indicate they have finished with it.
815 */
816
d1b19dff
ED
817struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
818 unsigned short mask)
1da177e4 819{
7562f876 820 struct net_device *dev, *ret;
1da177e4 821
7562f876 822 ret = NULL;
c6d14c84
ED
823 rcu_read_lock();
824 for_each_netdev_rcu(net, dev) {
1da177e4
LT
825 if (((dev->flags ^ if_flags) & mask) == 0) {
826 dev_hold(dev);
7562f876 827 ret = dev;
1da177e4
LT
828 break;
829 }
830 }
c6d14c84 831 rcu_read_unlock();
7562f876 832 return ret;
1da177e4 833}
d1b19dff 834EXPORT_SYMBOL(dev_get_by_flags);
1da177e4
LT
835
836/**
837 * dev_valid_name - check if name is okay for network device
838 * @name: name string
839 *
840 * Network device names need to be valid file names to
c7fa9d18
DM
841 * to allow sysfs to work. We also disallow any kind of
842 * whitespace.
1da177e4 843 */
c2373ee9 844int dev_valid_name(const char *name)
1da177e4 845{
c7fa9d18
DM
846 if (*name == '\0')
847 return 0;
b6fe17d6
SH
848 if (strlen(name) >= IFNAMSIZ)
849 return 0;
c7fa9d18
DM
850 if (!strcmp(name, ".") || !strcmp(name, ".."))
851 return 0;
852
853 while (*name) {
854 if (*name == '/' || isspace(*name))
855 return 0;
856 name++;
857 }
858 return 1;
1da177e4 859}
d1b19dff 860EXPORT_SYMBOL(dev_valid_name);
1da177e4
LT
861
862/**
b267b179
EB
863 * __dev_alloc_name - allocate a name for a device
864 * @net: network namespace to allocate the device name in
1da177e4 865 * @name: name format string
b267b179 866 * @buf: scratch buffer and result name string
1da177e4
LT
867 *
868 * Passed a format string - eg "lt%d" it will try and find a suitable
3041a069
SH
869 * id. It scans list of devices to build up a free map, then chooses
870 * the first empty slot. The caller must hold the dev_base or rtnl lock
871 * while allocating the name and adding the device in order to avoid
872 * duplicates.
873 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
874 * Returns the number of the unit assigned or a negative errno code.
1da177e4
LT
875 */
876
b267b179 877static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1da177e4
LT
878{
879 int i = 0;
1da177e4
LT
880 const char *p;
881 const int max_netdevices = 8*PAGE_SIZE;
cfcabdcc 882 unsigned long *inuse;
1da177e4
LT
883 struct net_device *d;
884
885 p = strnchr(name, IFNAMSIZ-1, '%');
886 if (p) {
887 /*
888 * Verify the string as this thing may have come from
889 * the user. There must be either one "%d" and no other "%"
890 * characters.
891 */
892 if (p[1] != 'd' || strchr(p + 2, '%'))
893 return -EINVAL;
894
895 /* Use one page as a bit array of possible slots */
cfcabdcc 896 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1da177e4
LT
897 if (!inuse)
898 return -ENOMEM;
899
881d966b 900 for_each_netdev(net, d) {
1da177e4
LT
901 if (!sscanf(d->name, name, &i))
902 continue;
903 if (i < 0 || i >= max_netdevices)
904 continue;
905
906 /* avoid cases where sscanf is not exact inverse of printf */
b267b179 907 snprintf(buf, IFNAMSIZ, name, i);
1da177e4
LT
908 if (!strncmp(buf, d->name, IFNAMSIZ))
909 set_bit(i, inuse);
910 }
911
912 i = find_first_zero_bit(inuse, max_netdevices);
913 free_page((unsigned long) inuse);
914 }
915
d9031024
OP
916 if (buf != name)
917 snprintf(buf, IFNAMSIZ, name, i);
b267b179 918 if (!__dev_get_by_name(net, buf))
1da177e4 919 return i;
1da177e4
LT
920
921 /* It is possible to run out of possible slots
922 * when the name is long and there isn't enough space left
923 * for the digits, or if all bits are used.
924 */
925 return -ENFILE;
926}
927
b267b179
EB
928/**
929 * dev_alloc_name - allocate a name for a device
930 * @dev: device
931 * @name: name format string
932 *
933 * Passed a format string - eg "lt%d" it will try and find a suitable
934 * id. It scans list of devices to build up a free map, then chooses
935 * the first empty slot. The caller must hold the dev_base or rtnl lock
936 * while allocating the name and adding the device in order to avoid
937 * duplicates.
938 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
939 * Returns the number of the unit assigned or a negative errno code.
940 */
941
942int dev_alloc_name(struct net_device *dev, const char *name)
943{
944 char buf[IFNAMSIZ];
945 struct net *net;
946 int ret;
947
c346dca1
YH
948 BUG_ON(!dev_net(dev));
949 net = dev_net(dev);
b267b179
EB
950 ret = __dev_alloc_name(net, name, buf);
951 if (ret >= 0)
952 strlcpy(dev->name, buf, IFNAMSIZ);
953 return ret;
954}
d1b19dff 955EXPORT_SYMBOL(dev_alloc_name);
b267b179 956
d9031024
OP
957static int dev_get_valid_name(struct net *net, const char *name, char *buf,
958 bool fmt)
959{
960 if (!dev_valid_name(name))
961 return -EINVAL;
962
963 if (fmt && strchr(name, '%'))
964 return __dev_alloc_name(net, name, buf);
965 else if (__dev_get_by_name(net, name))
966 return -EEXIST;
967 else if (buf != name)
968 strlcpy(buf, name, IFNAMSIZ);
969
970 return 0;
971}
1da177e4
LT
972
973/**
974 * dev_change_name - change name of a device
975 * @dev: device
976 * @newname: name (or format string) must be at least IFNAMSIZ
977 *
978 * Change name of a device, can pass format strings "eth%d".
979 * for wildcarding.
980 */
cf04a4c7 981int dev_change_name(struct net_device *dev, const char *newname)
1da177e4 982{
fcc5a03a 983 char oldname[IFNAMSIZ];
1da177e4 984 int err = 0;
fcc5a03a 985 int ret;
881d966b 986 struct net *net;
1da177e4
LT
987
988 ASSERT_RTNL();
c346dca1 989 BUG_ON(!dev_net(dev));
1da177e4 990
c346dca1 991 net = dev_net(dev);
1da177e4
LT
992 if (dev->flags & IFF_UP)
993 return -EBUSY;
994
c8d90dca
SH
995 if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
996 return 0;
997
fcc5a03a
HX
998 memcpy(oldname, dev->name, IFNAMSIZ);
999
d9031024
OP
1000 err = dev_get_valid_name(net, newname, dev->name, 1);
1001 if (err < 0)
1002 return err;
1da177e4 1003
fcc5a03a 1004rollback:
3891845e
EB
1005 /* For now only devices in the initial network namespace
1006 * are in sysfs.
1007 */
09ad9bc7 1008 if (net_eq(net, &init_net)) {
3891845e
EB
1009 ret = device_rename(&dev->dev, dev->name);
1010 if (ret) {
1011 memcpy(dev->name, oldname, IFNAMSIZ);
1012 return ret;
1013 }
dcc99773 1014 }
7f988eab
HX
1015
1016 write_lock_bh(&dev_base_lock);
92749821 1017 hlist_del(&dev->name_hlist);
72c9528b
ED
1018 write_unlock_bh(&dev_base_lock);
1019
1020 synchronize_rcu();
1021
1022 write_lock_bh(&dev_base_lock);
1023 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
7f988eab
HX
1024 write_unlock_bh(&dev_base_lock);
1025
056925ab 1026 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
fcc5a03a
HX
1027 ret = notifier_to_errno(ret);
1028
1029 if (ret) {
91e9c07b
ED
1030 /* err >= 0 after dev_alloc_name() or stores the first errno */
1031 if (err >= 0) {
fcc5a03a
HX
1032 err = ret;
1033 memcpy(dev->name, oldname, IFNAMSIZ);
1034 goto rollback;
91e9c07b
ED
1035 } else {
1036 printk(KERN_ERR
1037 "%s: name change rollback failed: %d.\n",
1038 dev->name, ret);
fcc5a03a
HX
1039 }
1040 }
1da177e4
LT
1041
1042 return err;
1043}
1044
0b815a1a
SH
1045/**
1046 * dev_set_alias - change ifalias of a device
1047 * @dev: device
1048 * @alias: name up to IFALIASZ
f0db275a 1049 * @len: limit of bytes to copy from info
0b815a1a
SH
1050 *
1051 * Set ifalias for a device,
1052 */
1053int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1054{
1055 ASSERT_RTNL();
1056
1057 if (len >= IFALIASZ)
1058 return -EINVAL;
1059
96ca4a2c
OH
1060 if (!len) {
1061 if (dev->ifalias) {
1062 kfree(dev->ifalias);
1063 dev->ifalias = NULL;
1064 }
1065 return 0;
1066 }
1067
d1b19dff 1068 dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
0b815a1a
SH
1069 if (!dev->ifalias)
1070 return -ENOMEM;
1071
1072 strlcpy(dev->ifalias, alias, len+1);
1073 return len;
1074}
1075
1076
d8a33ac4 1077/**
3041a069 1078 * netdev_features_change - device changes features
d8a33ac4
SH
1079 * @dev: device to cause notification
1080 *
1081 * Called to indicate a device has changed features.
1082 */
1083void netdev_features_change(struct net_device *dev)
1084{
056925ab 1085 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
d8a33ac4
SH
1086}
1087EXPORT_SYMBOL(netdev_features_change);
1088
1da177e4
LT
1089/**
1090 * netdev_state_change - device changes state
1091 * @dev: device to cause notification
1092 *
1093 * Called to indicate a device has changed state. This function calls
1094 * the notifier chains for netdev_chain and sends a NEWLINK message
1095 * to the routing socket.
1096 */
1097void netdev_state_change(struct net_device *dev)
1098{
1099 if (dev->flags & IFF_UP) {
056925ab 1100 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1da177e4
LT
1101 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1102 }
1103}
d1b19dff 1104EXPORT_SYMBOL(netdev_state_change);
1da177e4 1105
3ca5b404 1106int netdev_bonding_change(struct net_device *dev, unsigned long event)
c1da4ac7 1107{
3ca5b404 1108 return call_netdevice_notifiers(event, dev);
c1da4ac7
OG
1109}
1110EXPORT_SYMBOL(netdev_bonding_change);
1111
1da177e4
LT
1112/**
1113 * dev_load - load a network module
c4ea43c5 1114 * @net: the applicable net namespace
1da177e4
LT
1115 * @name: name of interface
1116 *
1117 * If a network interface is not present and the process has suitable
1118 * privileges this function loads the module. If module loading is not
1119 * available in this kernel then it becomes a nop.
1120 */
1121
881d966b 1122void dev_load(struct net *net, const char *name)
1da177e4 1123{
4ec93edb 1124 struct net_device *dev;
1da177e4 1125
72c9528b
ED
1126 rcu_read_lock();
1127 dev = dev_get_by_name_rcu(net, name);
1128 rcu_read_unlock();
1da177e4 1129
a8f80e8f 1130 if (!dev && capable(CAP_NET_ADMIN))
1da177e4
LT
1131 request_module("%s", name);
1132}
d1b19dff 1133EXPORT_SYMBOL(dev_load);
1da177e4 1134
bd380811 1135static int __dev_open(struct net_device *dev)
1da177e4 1136{
d314774c 1137 const struct net_device_ops *ops = dev->netdev_ops;
3b8bcfd5 1138 int ret;
1da177e4 1139
e46b66bc
BH
1140 ASSERT_RTNL();
1141
1da177e4
LT
1142 /*
1143 * Is it even present?
1144 */
1145 if (!netif_device_present(dev))
1146 return -ENODEV;
1147
3b8bcfd5
JB
1148 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1149 ret = notifier_to_errno(ret);
1150 if (ret)
1151 return ret;
1152
1da177e4
LT
1153 /*
1154 * Call device private open method
1155 */
1156 set_bit(__LINK_STATE_START, &dev->state);
bada339b 1157
d314774c
SH
1158 if (ops->ndo_validate_addr)
1159 ret = ops->ndo_validate_addr(dev);
bada339b 1160
d314774c
SH
1161 if (!ret && ops->ndo_open)
1162 ret = ops->ndo_open(dev);
1da177e4 1163
4ec93edb 1164 /*
1da177e4
LT
1165 * If it went open OK then:
1166 */
1167
bada339b
JG
1168 if (ret)
1169 clear_bit(__LINK_STATE_START, &dev->state);
1170 else {
1da177e4
LT
1171 /*
1172 * Set the flags.
1173 */
1174 dev->flags |= IFF_UP;
1175
649274d9
DW
1176 /*
1177 * Enable NET_DMA
1178 */
b4bd07c2 1179 net_dmaengine_get();
649274d9 1180
1da177e4
LT
1181 /*
1182 * Initialize multicasting status
1183 */
4417da66 1184 dev_set_rx_mode(dev);
1da177e4
LT
1185
1186 /*
1187 * Wakeup transmit queue engine
1188 */
1189 dev_activate(dev);
1da177e4 1190 }
bada339b 1191
1da177e4
LT
1192 return ret;
1193}
1194
1195/**
bd380811
PM
1196 * dev_open - prepare an interface for use.
1197 * @dev: device to open
1da177e4 1198 *
bd380811
PM
1199 * Takes a device from down to up state. The device's private open
1200 * function is invoked and then the multicast lists are loaded. Finally
1201 * the device is moved into the up state and a %NETDEV_UP message is
1202 * sent to the netdev notifier chain.
1203 *
1204 * Calling this function on an active interface is a nop. On a failure
1205 * a negative errno code is returned.
1da177e4 1206 */
bd380811
PM
1207int dev_open(struct net_device *dev)
1208{
1209 int ret;
1210
1211 /*
1212 * Is it already up?
1213 */
1214 if (dev->flags & IFF_UP)
1215 return 0;
1216
1217 /*
1218 * Open device
1219 */
1220 ret = __dev_open(dev);
1221 if (ret < 0)
1222 return ret;
1223
1224 /*
1225 * ... and announce new interface.
1226 */
1227 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1228 call_netdevice_notifiers(NETDEV_UP, dev);
1229
1230 return ret;
1231}
1232EXPORT_SYMBOL(dev_open);
1233
1234static int __dev_close(struct net_device *dev)
1da177e4 1235{
d314774c 1236 const struct net_device_ops *ops = dev->netdev_ops;
e46b66bc 1237
bd380811 1238 ASSERT_RTNL();
9d5010db
DM
1239 might_sleep();
1240
1da177e4
LT
1241 /*
1242 * Tell people we are going down, so that they can
1243 * prepare to death, when device is still operating.
1244 */
056925ab 1245 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1da177e4 1246
1da177e4
LT
1247 clear_bit(__LINK_STATE_START, &dev->state);
1248
1249 /* Synchronize to scheduled poll. We cannot touch poll list,
bea3348e
SH
1250 * it can be even on different cpu. So just clear netif_running().
1251 *
1252 * dev->stop() will invoke napi_disable() on all of it's
1253 * napi_struct instances on this device.
1254 */
1da177e4 1255 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1da177e4 1256
d8b2a4d2
ML
1257 dev_deactivate(dev);
1258
1da177e4
LT
1259 /*
1260 * Call the device specific close. This cannot fail.
1261 * Only if device is UP
1262 *
1263 * We allow it to be called even after a DETACH hot-plug
1264 * event.
1265 */
d314774c
SH
1266 if (ops->ndo_stop)
1267 ops->ndo_stop(dev);
1da177e4
LT
1268
1269 /*
1270 * Device is now down.
1271 */
1272
1273 dev->flags &= ~IFF_UP;
1274
1275 /*
bd380811 1276 * Shutdown NET_DMA
1da177e4 1277 */
bd380811
PM
1278 net_dmaengine_put();
1279
1280 return 0;
1281}
1282
1283/**
1284 * dev_close - shutdown an interface.
1285 * @dev: device to shutdown
1286 *
1287 * This function moves an active device into down state. A
1288 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1289 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1290 * chain.
1291 */
1292int dev_close(struct net_device *dev)
1293{
1294 if (!(dev->flags & IFF_UP))
1295 return 0;
1296
1297 __dev_close(dev);
1da177e4 1298
649274d9 1299 /*
bd380811 1300 * Tell people we are down
649274d9 1301 */
bd380811
PM
1302 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1303 call_netdevice_notifiers(NETDEV_DOWN, dev);
649274d9 1304
1da177e4
LT
1305 return 0;
1306}
d1b19dff 1307EXPORT_SYMBOL(dev_close);
1da177e4
LT
1308
1309
0187bdfb
BH
1310/**
1311 * dev_disable_lro - disable Large Receive Offload on a device
1312 * @dev: device
1313 *
1314 * Disable Large Receive Offload (LRO) on a net device. Must be
1315 * called under RTNL. This is needed if received packets may be
1316 * forwarded to another interface.
1317 */
1318void dev_disable_lro(struct net_device *dev)
1319{
1320 if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1321 dev->ethtool_ops->set_flags) {
1322 u32 flags = dev->ethtool_ops->get_flags(dev);
1323 if (flags & ETH_FLAG_LRO) {
1324 flags &= ~ETH_FLAG_LRO;
1325 dev->ethtool_ops->set_flags(dev, flags);
1326 }
1327 }
1328 WARN_ON(dev->features & NETIF_F_LRO);
1329}
1330EXPORT_SYMBOL(dev_disable_lro);
1331
1332
881d966b
EB
1333static int dev_boot_phase = 1;
1334
1da177e4
LT
1335/*
1336 * Device change register/unregister. These are not inline or static
1337 * as we export them to the world.
1338 */
1339
1340/**
1341 * register_netdevice_notifier - register a network notifier block
1342 * @nb: notifier
1343 *
1344 * Register a notifier to be called when network device events occur.
1345 * The notifier passed is linked into the kernel structures and must
1346 * not be reused until it has been unregistered. A negative errno code
1347 * is returned on a failure.
1348 *
1349 * When registered all registration and up events are replayed
4ec93edb 1350 * to the new notifier to allow device to have a race free
1da177e4
LT
1351 * view of the network device list.
1352 */
1353
1354int register_netdevice_notifier(struct notifier_block *nb)
1355{
1356 struct net_device *dev;
fcc5a03a 1357 struct net_device *last;
881d966b 1358 struct net *net;
1da177e4
LT
1359 int err;
1360
1361 rtnl_lock();
f07d5b94 1362 err = raw_notifier_chain_register(&netdev_chain, nb);
fcc5a03a
HX
1363 if (err)
1364 goto unlock;
881d966b
EB
1365 if (dev_boot_phase)
1366 goto unlock;
1367 for_each_net(net) {
1368 for_each_netdev(net, dev) {
1369 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1370 err = notifier_to_errno(err);
1371 if (err)
1372 goto rollback;
1373
1374 if (!(dev->flags & IFF_UP))
1375 continue;
1da177e4 1376
881d966b
EB
1377 nb->notifier_call(nb, NETDEV_UP, dev);
1378 }
1da177e4 1379 }
fcc5a03a
HX
1380
1381unlock:
1da177e4
LT
1382 rtnl_unlock();
1383 return err;
fcc5a03a
HX
1384
1385rollback:
1386 last = dev;
881d966b
EB
1387 for_each_net(net) {
1388 for_each_netdev(net, dev) {
1389 if (dev == last)
1390 break;
fcc5a03a 1391
881d966b
EB
1392 if (dev->flags & IFF_UP) {
1393 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1394 nb->notifier_call(nb, NETDEV_DOWN, dev);
1395 }
1396 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
a5ee1551 1397 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
fcc5a03a 1398 }
fcc5a03a 1399 }
c67625a1
PE
1400
1401 raw_notifier_chain_unregister(&netdev_chain, nb);
fcc5a03a 1402 goto unlock;
1da177e4 1403}
d1b19dff 1404EXPORT_SYMBOL(register_netdevice_notifier);
1da177e4
LT
1405
1406/**
1407 * unregister_netdevice_notifier - unregister a network notifier block
1408 * @nb: notifier
1409 *
1410 * Unregister a notifier previously registered by
1411 * register_netdevice_notifier(). The notifier is unlinked into the
1412 * kernel structures and may then be reused. A negative errno code
1413 * is returned on a failure.
1414 */
1415
1416int unregister_netdevice_notifier(struct notifier_block *nb)
1417{
9f514950
HX
1418 int err;
1419
1420 rtnl_lock();
f07d5b94 1421 err = raw_notifier_chain_unregister(&netdev_chain, nb);
9f514950
HX
1422 rtnl_unlock();
1423 return err;
1da177e4 1424}
d1b19dff 1425EXPORT_SYMBOL(unregister_netdevice_notifier);
1da177e4
LT
1426
1427/**
1428 * call_netdevice_notifiers - call all network notifier blocks
1429 * @val: value passed unmodified to notifier function
c4ea43c5 1430 * @dev: net_device pointer passed unmodified to notifier function
1da177e4
LT
1431 *
1432 * Call all network notifier blocks. Parameters and return value
f07d5b94 1433 * are as for raw_notifier_call_chain().
1da177e4
LT
1434 */
1435
ad7379d4 1436int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1da177e4 1437{
ad7379d4 1438 return raw_notifier_call_chain(&netdev_chain, val, dev);
1da177e4
LT
1439}
1440
1441/* When > 0 there are consumers of rx skb time stamps */
1442static atomic_t netstamp_needed = ATOMIC_INIT(0);
1443
1444void net_enable_timestamp(void)
1445{
1446 atomic_inc(&netstamp_needed);
1447}
d1b19dff 1448EXPORT_SYMBOL(net_enable_timestamp);
1da177e4
LT
1449
1450void net_disable_timestamp(void)
1451{
1452 atomic_dec(&netstamp_needed);
1453}
d1b19dff 1454EXPORT_SYMBOL(net_disable_timestamp);
1da177e4 1455
a61bbcf2 1456static inline void net_timestamp(struct sk_buff *skb)
1da177e4
LT
1457{
1458 if (atomic_read(&netstamp_needed))
a61bbcf2 1459 __net_timestamp(skb);
b7aa0bf7
ED
1460 else
1461 skb->tstamp.tv64 = 0;
1da177e4
LT
1462}
1463
44540960
AB
1464/**
1465 * dev_forward_skb - loopback an skb to another netif
1466 *
1467 * @dev: destination network device
1468 * @skb: buffer to forward
1469 *
1470 * return values:
1471 * NET_RX_SUCCESS (no congestion)
1472 * NET_RX_DROP (packet was dropped)
1473 *
1474 * dev_forward_skb can be used for injecting an skb from the
1475 * start_xmit function of one device into the receive queue
1476 * of another device.
1477 *
1478 * The receiving device may be in another namespace, so
1479 * we have to clear all information in the skb that could
1480 * impact namespace isolation.
1481 */
1482int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1483{
1484 skb_orphan(skb);
1485
1486 if (!(dev->flags & IFF_UP))
1487 return NET_RX_DROP;
1488
1489 if (skb->len > (dev->mtu + dev->hard_header_len))
1490 return NET_RX_DROP;
1491
8a83a00b 1492 skb_set_dev(skb, dev);
44540960
AB
1493 skb->tstamp.tv64 = 0;
1494 skb->pkt_type = PACKET_HOST;
1495 skb->protocol = eth_type_trans(skb, dev);
44540960
AB
1496 return netif_rx(skb);
1497}
1498EXPORT_SYMBOL_GPL(dev_forward_skb);
1499
1da177e4
LT
1500/*
1501 * Support routine. Sends outgoing frames to any network
1502 * taps currently in use.
1503 */
1504
f6a78bfc 1505static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1da177e4
LT
1506{
1507 struct packet_type *ptype;
a61bbcf2 1508
8caf1539
JP
1509#ifdef CONFIG_NET_CLS_ACT
1510 if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1511 net_timestamp(skb);
1512#else
a61bbcf2 1513 net_timestamp(skb);
8caf1539 1514#endif
1da177e4
LT
1515
1516 rcu_read_lock();
1517 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1518 /* Never send packets back to the socket
1519 * they originated from - MvS (miquels@drinkel.ow.org)
1520 */
1521 if ((ptype->dev == dev || !ptype->dev) &&
1522 (ptype->af_packet_priv == NULL ||
1523 (struct sock *)ptype->af_packet_priv != skb->sk)) {
d1b19dff 1524 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1da177e4
LT
1525 if (!skb2)
1526 break;
1527
1528 /* skb->nh should be correctly
1529 set by sender, so that the second statement is
1530 just protection against buggy protocols.
1531 */
459a98ed 1532 skb_reset_mac_header(skb2);
1da177e4 1533
d56f90a7 1534 if (skb_network_header(skb2) < skb2->data ||
27a884dc 1535 skb2->network_header > skb2->tail) {
1da177e4
LT
1536 if (net_ratelimit())
1537 printk(KERN_CRIT "protocol %04x is "
1538 "buggy, dev %s\n",
1539 skb2->protocol, dev->name);
c1d2bbe1 1540 skb_reset_network_header(skb2);
1da177e4
LT
1541 }
1542
b0e380b1 1543 skb2->transport_header = skb2->network_header;
1da177e4 1544 skb2->pkt_type = PACKET_OUTGOING;
f2ccd8fa 1545 ptype->func(skb2, skb->dev, ptype, skb->dev);
1da177e4
LT
1546 }
1547 }
1548 rcu_read_unlock();
1549}
1550
56079431 1551
def82a1d 1552static inline void __netif_reschedule(struct Qdisc *q)
56079431 1553{
def82a1d
JP
1554 struct softnet_data *sd;
1555 unsigned long flags;
56079431 1556
def82a1d
JP
1557 local_irq_save(flags);
1558 sd = &__get_cpu_var(softnet_data);
1559 q->next_sched = sd->output_queue;
1560 sd->output_queue = q;
1561 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1562 local_irq_restore(flags);
1563}
1564
1565void __netif_schedule(struct Qdisc *q)
1566{
1567 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1568 __netif_reschedule(q);
56079431
DV
1569}
1570EXPORT_SYMBOL(__netif_schedule);
1571
bea3348e 1572void dev_kfree_skb_irq(struct sk_buff *skb)
56079431 1573{
bea3348e
SH
1574 if (atomic_dec_and_test(&skb->users)) {
1575 struct softnet_data *sd;
1576 unsigned long flags;
56079431 1577
bea3348e
SH
1578 local_irq_save(flags);
1579 sd = &__get_cpu_var(softnet_data);
1580 skb->next = sd->completion_queue;
1581 sd->completion_queue = skb;
1582 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1583 local_irq_restore(flags);
1584 }
56079431 1585}
bea3348e 1586EXPORT_SYMBOL(dev_kfree_skb_irq);
56079431
DV
1587
1588void dev_kfree_skb_any(struct sk_buff *skb)
1589{
1590 if (in_irq() || irqs_disabled())
1591 dev_kfree_skb_irq(skb);
1592 else
1593 dev_kfree_skb(skb);
1594}
1595EXPORT_SYMBOL(dev_kfree_skb_any);
1596
1597
bea3348e
SH
1598/**
1599 * netif_device_detach - mark device as removed
1600 * @dev: network device
1601 *
1602 * Mark device as removed from system and therefore no longer available.
1603 */
56079431
DV
1604void netif_device_detach(struct net_device *dev)
1605{
1606 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1607 netif_running(dev)) {
d543103a 1608 netif_tx_stop_all_queues(dev);
56079431
DV
1609 }
1610}
1611EXPORT_SYMBOL(netif_device_detach);
1612
bea3348e
SH
1613/**
1614 * netif_device_attach - mark device as attached
1615 * @dev: network device
1616 *
1617 * Mark device as attached from system and restart if needed.
1618 */
56079431
DV
1619void netif_device_attach(struct net_device *dev)
1620{
1621 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1622 netif_running(dev)) {
d543103a 1623 netif_tx_wake_all_queues(dev);
4ec93edb 1624 __netdev_watchdog_up(dev);
56079431
DV
1625 }
1626}
1627EXPORT_SYMBOL(netif_device_attach);
1628
6de329e2
BH
1629static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1630{
1631 return ((features & NETIF_F_GEN_CSUM) ||
1632 ((features & NETIF_F_IP_CSUM) &&
1633 protocol == htons(ETH_P_IP)) ||
1634 ((features & NETIF_F_IPV6_CSUM) &&
1c8dbcf6
YZ
1635 protocol == htons(ETH_P_IPV6)) ||
1636 ((features & NETIF_F_FCOE_CRC) &&
1637 protocol == htons(ETH_P_FCOE)));
6de329e2
BH
1638}
1639
1640static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1641{
1642 if (can_checksum_protocol(dev->features, skb->protocol))
1643 return true;
1644
1645 if (skb->protocol == htons(ETH_P_8021Q)) {
1646 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1647 if (can_checksum_protocol(dev->features & dev->vlan_features,
1648 veh->h_vlan_encapsulated_proto))
1649 return true;
1650 }
1651
1652 return false;
1653}
56079431 1654
8a83a00b
AB
1655/**
1656 * skb_dev_set -- assign a new device to a buffer
1657 * @skb: buffer for the new device
1658 * @dev: network device
1659 *
1660 * If an skb is owned by a device already, we have to reset
1661 * all data private to the namespace a device belongs to
1662 * before assigning it a new device.
1663 */
1664#ifdef CONFIG_NET_NS
1665void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1666{
1667 skb_dst_drop(skb);
1668 if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1669 secpath_reset(skb);
1670 nf_reset(skb);
1671 skb_init_secmark(skb);
1672 skb->mark = 0;
1673 skb->priority = 0;
1674 skb->nf_trace = 0;
1675 skb->ipvs_property = 0;
1676#ifdef CONFIG_NET_SCHED
1677 skb->tc_index = 0;
1678#endif
1679 }
1680 skb->dev = dev;
1681}
1682EXPORT_SYMBOL(skb_set_dev);
1683#endif /* CONFIG_NET_NS */
1684
1da177e4
LT
1685/*
1686 * Invalidate hardware checksum when packet is to be mangled, and
1687 * complete checksum manually on outgoing path.
1688 */
84fa7933 1689int skb_checksum_help(struct sk_buff *skb)
1da177e4 1690{
d3bc23e7 1691 __wsum csum;
663ead3b 1692 int ret = 0, offset;
1da177e4 1693
84fa7933 1694 if (skb->ip_summed == CHECKSUM_COMPLETE)
a430a43d
HX
1695 goto out_set_summed;
1696
1697 if (unlikely(skb_shinfo(skb)->gso_size)) {
a430a43d
HX
1698 /* Let GSO fix up the checksum. */
1699 goto out_set_summed;
1da177e4
LT
1700 }
1701
a030847e
HX
1702 offset = skb->csum_start - skb_headroom(skb);
1703 BUG_ON(offset >= skb_headlen(skb));
1704 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1705
1706 offset += skb->csum_offset;
1707 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1708
1709 if (skb_cloned(skb) &&
1710 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1da177e4
LT
1711 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1712 if (ret)
1713 goto out;
1714 }
1715
a030847e 1716 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
a430a43d 1717out_set_summed:
1da177e4 1718 skb->ip_summed = CHECKSUM_NONE;
4ec93edb 1719out:
1da177e4
LT
1720 return ret;
1721}
d1b19dff 1722EXPORT_SYMBOL(skb_checksum_help);
1da177e4 1723
f6a78bfc
HX
1724/**
1725 * skb_gso_segment - Perform segmentation on skb.
1726 * @skb: buffer to segment
576a30eb 1727 * @features: features for the output path (see dev->features)
f6a78bfc
HX
1728 *
1729 * This function segments the given skb and returns a list of segments.
576a30eb
HX
1730 *
1731 * It may return NULL if the skb requires no segmentation. This is
1732 * only possible when GSO is used for verifying header integrity.
f6a78bfc 1733 */
576a30eb 1734struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
f6a78bfc
HX
1735{
1736 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1737 struct packet_type *ptype;
252e3346 1738 __be16 type = skb->protocol;
a430a43d 1739 int err;
f6a78bfc 1740
459a98ed 1741 skb_reset_mac_header(skb);
b0e380b1 1742 skb->mac_len = skb->network_header - skb->mac_header;
f6a78bfc
HX
1743 __skb_pull(skb, skb->mac_len);
1744
67fd1a73
HX
1745 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1746 struct net_device *dev = skb->dev;
1747 struct ethtool_drvinfo info = {};
1748
1749 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1750 dev->ethtool_ops->get_drvinfo(dev, &info);
1751
1752 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1753 "ip_summed=%d",
1754 info.driver, dev ? dev->features : 0L,
1755 skb->sk ? skb->sk->sk_route_caps : 0L,
1756 skb->len, skb->data_len, skb->ip_summed);
1757
a430a43d
HX
1758 if (skb_header_cloned(skb) &&
1759 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1760 return ERR_PTR(err);
1761 }
1762
f6a78bfc 1763 rcu_read_lock();
82d8a867
PE
1764 list_for_each_entry_rcu(ptype,
1765 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
f6a78bfc 1766 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
84fa7933 1767 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
a430a43d
HX
1768 err = ptype->gso_send_check(skb);
1769 segs = ERR_PTR(err);
1770 if (err || skb_gso_ok(skb, features))
1771 break;
d56f90a7
ACM
1772 __skb_push(skb, (skb->data -
1773 skb_network_header(skb)));
a430a43d 1774 }
576a30eb 1775 segs = ptype->gso_segment(skb, features);
f6a78bfc
HX
1776 break;
1777 }
1778 }
1779 rcu_read_unlock();
1780
98e399f8 1781 __skb_push(skb, skb->data - skb_mac_header(skb));
576a30eb 1782
f6a78bfc
HX
1783 return segs;
1784}
f6a78bfc
HX
1785EXPORT_SYMBOL(skb_gso_segment);
1786
fb286bb2
HX
1787/* Take action when hardware reception checksum errors are detected. */
1788#ifdef CONFIG_BUG
1789void netdev_rx_csum_fault(struct net_device *dev)
1790{
1791 if (net_ratelimit()) {
4ec93edb 1792 printk(KERN_ERR "%s: hw csum failure.\n",
246a4212 1793 dev ? dev->name : "<unknown>");
fb286bb2
HX
1794 dump_stack();
1795 }
1796}
1797EXPORT_SYMBOL(netdev_rx_csum_fault);
1798#endif
1799
1da177e4
LT
 1800/* Actually, we should eliminate this check as soon as we know that:
 1801 * 1. An IOMMU is present and allows mapping all the memory.
1802 * 2. No high memory really exists on this machine.
1803 */
1804
9092c658 1805static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1da177e4 1806{
3d3a8533 1807#ifdef CONFIG_HIGHMEM
1da177e4 1808 int i;
5acbbd42
FT
1809 if (!(dev->features & NETIF_F_HIGHDMA)) {
1810 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1811 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1812 return 1;
1813 }
1da177e4 1814
5acbbd42
FT
1815 if (PCI_DMA_BUS_IS_PHYS) {
1816 struct device *pdev = dev->dev.parent;
1da177e4 1817
9092c658
ED
1818 if (!pdev)
1819 return 0;
5acbbd42
FT
1820 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1821 dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1822 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1823 return 1;
1824 }
1825 }
3d3a8533 1826#endif
1da177e4
LT
1827 return 0;
1828}
1da177e4 1829
f6a78bfc
HX
1830struct dev_gso_cb {
1831 void (*destructor)(struct sk_buff *skb);
1832};
1833
1834#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1835
1836static void dev_gso_skb_destructor(struct sk_buff *skb)
1837{
1838 struct dev_gso_cb *cb;
1839
1840 do {
1841 struct sk_buff *nskb = skb->next;
1842
1843 skb->next = nskb->next;
1844 nskb->next = NULL;
1845 kfree_skb(nskb);
1846 } while (skb->next);
1847
1848 cb = DEV_GSO_CB(skb);
1849 if (cb->destructor)
1850 cb->destructor(skb);
1851}
1852
1853/**
1854 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1855 * @skb: buffer to segment
1856 *
1857 * This function segments the given skb and stores the list of segments
1858 * in skb->next.
1859 */
1860static int dev_gso_segment(struct sk_buff *skb)
1861{
1862 struct net_device *dev = skb->dev;
1863 struct sk_buff *segs;
576a30eb
HX
1864 int features = dev->features & ~(illegal_highdma(dev, skb) ?
1865 NETIF_F_SG : 0);
1866
1867 segs = skb_gso_segment(skb, features);
1868
1869 /* Verifying header integrity only. */
1870 if (!segs)
1871 return 0;
f6a78bfc 1872
801678c5 1873 if (IS_ERR(segs))
f6a78bfc
HX
1874 return PTR_ERR(segs);
1875
1876 skb->next = segs;
1877 DEV_GSO_CB(skb)->destructor = skb->destructor;
1878 skb->destructor = dev_gso_skb_destructor;
1879
1880 return 0;
1881}
1882
fc6055a5
ED
1883/*
1884 * Try to orphan skb early, right before transmission by the device.
1885 * We cannot orphan skb if tx timestamp is requested, since
1886 * drivers need to call skb_tstamp_tx() to send the timestamp.
1887 */
1888static inline void skb_orphan_try(struct sk_buff *skb)
1889{
1890 if (!skb_tx(skb)->flags)
1891 skb_orphan(skb);
1892}
1893
fd2ea0a7
DM
1894int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1895 struct netdev_queue *txq)
f6a78bfc 1896{
00829823 1897 const struct net_device_ops *ops = dev->netdev_ops;
572a9d7b 1898 int rc = NETDEV_TX_OK;
00829823 1899
f6a78bfc 1900 if (likely(!skb->next)) {
9be9a6b9 1901 if (!list_empty(&ptype_all))
f6a78bfc
HX
1902 dev_queue_xmit_nit(skb, dev);
1903
576a30eb
HX
1904 if (netif_needs_gso(dev, skb)) {
1905 if (unlikely(dev_gso_segment(skb)))
1906 goto out_kfree_skb;
1907 if (skb->next)
1908 goto gso;
1909 }
f6a78bfc 1910
93f154b5
ED
1911 /*
 1912 * If the device doesn't need skb->dst, release it right now while
 1913 * it's hot in this CPU's cache.
1914 */
adf30907
ED
1915 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1916 skb_dst_drop(skb);
1917
fc6055a5 1918 skb_orphan_try(skb);
ac45f602 1919 rc = ops->ndo_start_xmit(skb, dev);
ec634fe3 1920 if (rc == NETDEV_TX_OK)
08baf561 1921 txq_trans_update(txq);
ac45f602 1922 return rc;
f6a78bfc
HX
1923 }
1924
576a30eb 1925gso:
f6a78bfc
HX
1926 do {
1927 struct sk_buff *nskb = skb->next;
f6a78bfc
HX
1928
1929 skb->next = nskb->next;
1930 nskb->next = NULL;
068a2de5
KK
1931
1932 /*
 1933 * If the device doesn't need nskb->dst, release it right now while
 1934 * it's hot in this CPU's cache.
1935 */
1936 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1937 skb_dst_drop(nskb);
1938
fc6055a5 1939 skb_orphan_try(nskb);
00829823 1940 rc = ops->ndo_start_xmit(nskb, dev);
ec634fe3 1941 if (unlikely(rc != NETDEV_TX_OK)) {
572a9d7b
PM
1942 if (rc & ~NETDEV_TX_MASK)
1943 goto out_kfree_gso_skb;
f54d9e8d 1944 nskb->next = skb->next;
f6a78bfc
HX
1945 skb->next = nskb;
1946 return rc;
1947 }
08baf561 1948 txq_trans_update(txq);
fd2ea0a7 1949 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
f54d9e8d 1950 return NETDEV_TX_BUSY;
f6a78bfc 1951 } while (skb->next);
4ec93edb 1952
572a9d7b
PM
1953out_kfree_gso_skb:
1954 if (likely(skb->next == NULL))
1955 skb->destructor = DEV_GSO_CB(skb)->destructor;
f6a78bfc
HX
1956out_kfree_skb:
1957 kfree_skb(skb);
572a9d7b 1958 return rc;
f6a78bfc
HX
1959}
1960
0a9627f2 1961static u32 hashrnd __read_mostly;
b6b2fed1 1962
9247744e 1963u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
8f0f2223 1964{
7019298a 1965 u32 hash;
b6b2fed1 1966
513de11b
DM
1967 if (skb_rx_queue_recorded(skb)) {
1968 hash = skb_get_rx_queue(skb);
d1b19dff 1969 while (unlikely(hash >= dev->real_num_tx_queues))
513de11b
DM
1970 hash -= dev->real_num_tx_queues;
1971 return hash;
1972 }
ec581f6a
ED
1973
1974 if (skb->sk && skb->sk->sk_hash)
7019298a 1975 hash = skb->sk->sk_hash;
ec581f6a 1976 else
7019298a 1977 hash = skb->protocol;
d5a9e24a 1978
0a9627f2 1979 hash = jhash_1word(hash, hashrnd);
b6b2fed1
DM
1980
1981 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
8f0f2223 1982}
9247744e 1983EXPORT_SYMBOL(skb_tx_hash);
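/*
 * Editorial worked example (hypothetical, not in dev.c): the final
 * multiply-shift scales a 32-bit hash onto [0, real_num_tx_queues)
 * without a modulo.  With 4 TX queues and hash = 0xC0000000,
 * ((u64)0xC0000000 * 4) >> 32 == 3, so the top quarter of the hash
 * space maps to queue 3.
 */
static inline u16 example_hash_to_queue(u32 hash, u16 nr_queues)
{
	return (u16)(((u64)hash * nr_queues) >> 32);
}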
8f0f2223 1984
ed04642f
ED
1985static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
1986{
1987 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
1988 if (net_ratelimit()) {
7a161ea9
ED
1989 pr_warning("%s selects TX queue %d, but "
1990 "real number of TX queues is %d\n",
1991 dev->name, queue_index, dev->real_num_tx_queues);
ed04642f
ED
1992 }
1993 return 0;
1994 }
1995 return queue_index;
1996}
1997
e8a0464c
DM
1998static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1999 struct sk_buff *skb)
2000{
a4ee3ce3
KK
2001 u16 queue_index;
2002 struct sock *sk = skb->sk;
2003
2004 if (sk_tx_queue_recorded(sk)) {
2005 queue_index = sk_tx_queue_get(sk);
2006 } else {
2007 const struct net_device_ops *ops = dev->netdev_ops;
2008
2009 if (ops->ndo_select_queue) {
2010 queue_index = ops->ndo_select_queue(dev, skb);
ed04642f 2011 queue_index = dev_cap_txqueue(dev, queue_index);
a4ee3ce3
KK
2012 } else {
2013 queue_index = 0;
2014 if (dev->real_num_tx_queues > 1)
2015 queue_index = skb_tx_hash(dev, skb);
fd2ea0a7 2016
b6c6712a 2017 if (sk && rcu_dereference_check(sk->sk_dst_cache, 1))
a4ee3ce3
KK
2018 sk_tx_queue_set(sk, queue_index);
2019 }
2020 }
eae792b7 2021
fd2ea0a7
DM
2022 skb_set_queue_mapping(skb, queue_index);
2023 return netdev_get_tx_queue(dev, queue_index);
e8a0464c
DM
2024}
2025
bbd8a0d3
KK
2026static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2027 struct net_device *dev,
2028 struct netdev_queue *txq)
2029{
2030 spinlock_t *root_lock = qdisc_lock(q);
2031 int rc;
2032
2033 spin_lock(root_lock);
2034 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2035 kfree_skb(skb);
2036 rc = NET_XMIT_DROP;
2037 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2038 !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
2039 /*
2040 * This is a work-conserving queue; there are no old skbs
2041 * waiting to be sent out; and the qdisc is not running -
2042 * xmit the skb directly.
2043 */
2044 __qdisc_update_bstats(q, skb->len);
2045 if (sch_direct_xmit(skb, q, dev, txq, root_lock))
2046 __qdisc_run(q);
2047 else
2048 clear_bit(__QDISC_STATE_RUNNING, &q->state);
2049
2050 rc = NET_XMIT_SUCCESS;
2051 } else {
2052 rc = qdisc_enqueue_root(skb, q);
2053 qdisc_run(q);
2054 }
2055 spin_unlock(root_lock);
2056
2057 return rc;
2058}
2059
4b258461
KK
2060/*
2061 * Returns true if either:
2062 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2063 * 2. skb is fragmented and the device does not support SG, or if
 2064 * at least one of the fragments is in highmem and the device does not
2065 * support DMA from it.
2066 */
2067static inline int skb_needs_linearize(struct sk_buff *skb,
2068 struct net_device *dev)
2069{
2070 return (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
2071 (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
2072 illegal_highdma(dev, skb)));
2073}
2074
d29f749e
DJ
2075/**
2076 * dev_queue_xmit - transmit a buffer
2077 * @skb: buffer to transmit
2078 *
2079 * Queue a buffer for transmission to a network device. The caller must
2080 * have set the device and priority and built the buffer before calling
2081 * this function. The function can be called from an interrupt.
2082 *
2083 * A negative errno code is returned on a failure. A success does not
2084 * guarantee the frame will be transmitted as it may be dropped due
2085 * to congestion or traffic shaping.
2086 *
2087 * -----------------------------------------------------------------------------------
2088 * I notice this method can also return errors from the queue disciplines,
2089 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2090 * be positive.
2091 *
2092 * Regardless of the return value, the skb is consumed, so it is currently
2093 * difficult to retry a send to this method. (You can bump the ref count
2094 * before sending to hold a reference for retry if you are careful.)
2095 *
2096 * When calling this method, interrupts MUST be enabled. This is because
2097 * the BH enable code must have IRQs enabled so that it will not deadlock.
2098 * --BLG
2099 */
1da177e4
LT
2100int dev_queue_xmit(struct sk_buff *skb)
2101{
2102 struct net_device *dev = skb->dev;
dc2b4847 2103 struct netdev_queue *txq;
1da177e4
LT
2104 struct Qdisc *q;
2105 int rc = -ENOMEM;
2106
f6a78bfc
HX
2107 /* GSO will handle the following emulations directly. */
2108 if (netif_needs_gso(dev, skb))
2109 goto gso;
2110
4b258461
KK
2111 /* Convert a paged skb to linear, if required */
2112 if (skb_needs_linearize(skb, dev) && __skb_linearize(skb))
1da177e4
LT
2113 goto out_kfree_skb;
2114
2115 /* If packet is not checksummed and device does not support
2116 * checksumming for this protocol, complete checksumming here.
2117 */
663ead3b
HX
2118 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2119 skb_set_transport_header(skb, skb->csum_start -
2120 skb_headroom(skb));
6de329e2
BH
2121 if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
2122 goto out_kfree_skb;
663ead3b 2123 }
1da177e4 2124
f6a78bfc 2125gso:
4ec93edb
YH
2126 /* Disable soft irqs for various locks below. Also
2127 * stops preemption for RCU.
1da177e4 2128 */
4ec93edb 2129 rcu_read_lock_bh();
1da177e4 2130
eae792b7 2131 txq = dev_pick_tx(dev, skb);
a898def2 2132 q = rcu_dereference_bh(txq->qdisc);
37437bb2 2133
1da177e4 2134#ifdef CONFIG_NET_CLS_ACT
d1b19dff 2135 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1da177e4
LT
2136#endif
2137 if (q->enqueue) {
bbd8a0d3 2138 rc = __dev_xmit_skb(skb, q, dev, txq);
37437bb2 2139 goto out;
1da177e4
LT
2140 }
2141
2142 /* The device has no queue. Common case for software devices:
2143 loopback, all the sorts of tunnels...
2144
932ff279
HX
2145 Really, it is unlikely that netif_tx_lock protection is necessary
 2146 here. (e.g. loopback and IP tunnels are clean, ignoring statistics
1da177e4
LT
2147 counters.)
 2148 However, it is possible that they rely on the protection
 2149 we provide here.
2150
 2151 Check this and shoot the lock. It is not prone to deadlocks.
 2152 Either shoot the noqueue qdisc; that is even simpler 8)
2153 */
2154 if (dev->flags & IFF_UP) {
2155 int cpu = smp_processor_id(); /* ok because BHs are off */
2156
c773e847 2157 if (txq->xmit_lock_owner != cpu) {
1da177e4 2158
c773e847 2159 HARD_TX_LOCK(dev, txq, cpu);
1da177e4 2160
fd2ea0a7 2161 if (!netif_tx_queue_stopped(txq)) {
572a9d7b
PM
2162 rc = dev_hard_start_xmit(skb, dev, txq);
2163 if (dev_xmit_complete(rc)) {
c773e847 2164 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
2165 goto out;
2166 }
2167 }
c773e847 2168 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
2169 if (net_ratelimit())
2170 printk(KERN_CRIT "Virtual device %s asks to "
2171 "queue packet!\n", dev->name);
2172 } else {
2173 /* Recursion is detected! It is possible,
2174 * unfortunately */
2175 if (net_ratelimit())
2176 printk(KERN_CRIT "Dead loop on virtual device "
2177 "%s, fix it urgently!\n", dev->name);
2178 }
2179 }
2180
2181 rc = -ENETDOWN;
d4828d85 2182 rcu_read_unlock_bh();
1da177e4
LT
2183
2184out_kfree_skb:
2185 kfree_skb(skb);
2186 return rc;
2187out:
d4828d85 2188 rcu_read_unlock_bh();
1da177e4
LT
2189 return rc;
2190}
d1b19dff 2191EXPORT_SYMBOL(dev_queue_xmit);
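/*
 * Illustrative sketch (hypothetical, not in dev.c): a minimal caller of
 * dev_queue_xmit().  The copy-based frame layout and the ETH_P_802_3
 * protocol value are assumptions for illustration; the points being
 * shown are that skb->dev must be set, the buffer must be fully built,
 * and the skb is consumed whatever the return value is.
 */
static int example_send_frame(struct net_device *dev,
			      const void *frame, unsigned int len)
{
	struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC);

	if (!skb)
		return -ENOMEM;
	memcpy(skb_put(skb, len), frame, len);
	skb->dev = dev;
	skb->protocol = htons(ETH_P_802_3);
	return dev_queue_xmit(skb);
}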
1da177e4
LT
2192
2193
2194/*=======================================================================
2195 Receiver routines
2196 =======================================================================*/
2197
6b2bedc3
SH
2198int netdev_max_backlog __read_mostly = 1000;
2199int netdev_budget __read_mostly = 300;
2200int weight_p __read_mostly = 64; /* old backlog weight */
1da177e4
LT
2201
2202DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
2203
df334545 2204#ifdef CONFIG_RPS
fec5e652
TH
2205
2206/* One global table that all flow-based protocols share. */
8770acf0 2207struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
fec5e652
TH
2208EXPORT_SYMBOL(rps_sock_flow_table);
2209
0a9627f2
TH
2210/*
2211 * get_rps_cpu is called from netif_receive_skb and returns the target
2212 * CPU from the RPS map of the receiving queue for a given skb.
b0e28f1e 2213 * rcu_read_lock must be held on entry.
0a9627f2 2214 */
fec5e652
TH
2215static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2216 struct rps_dev_flow **rflowp)
0a9627f2
TH
2217{
2218 struct ipv6hdr *ip6;
2219 struct iphdr *ip;
2220 struct netdev_rx_queue *rxqueue;
2221 struct rps_map *map;
fec5e652
TH
2222 struct rps_dev_flow_table *flow_table;
2223 struct rps_sock_flow_table *sock_flow_table;
0a9627f2
TH
2224 int cpu = -1;
2225 u8 ip_proto;
fec5e652 2226 u16 tcpu;
0a9627f2
TH
2227 u32 addr1, addr2, ports, ihl;
2228
0a9627f2
TH
2229 if (skb_rx_queue_recorded(skb)) {
2230 u16 index = skb_get_rx_queue(skb);
2231 if (unlikely(index >= dev->num_rx_queues)) {
2232 if (net_ratelimit()) {
7a161ea9
ED
2233 pr_warning("%s received packet on queue "
2234 "%u, but number of RX queues is %u\n",
2235 dev->name, index, dev->num_rx_queues);
0a9627f2
TH
2236 }
2237 goto done;
2238 }
2239 rxqueue = dev->_rx + index;
2240 } else
2241 rxqueue = dev->_rx;
2242
fec5e652 2243 if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
0a9627f2
TH
2244 goto done;
2245
2246 if (skb->rxhash)
2247 goto got_hash; /* Skip hash computation on packet header */
2248
2249 switch (skb->protocol) {
2250 case __constant_htons(ETH_P_IP):
2251 if (!pskb_may_pull(skb, sizeof(*ip)))
2252 goto done;
2253
2254 ip = (struct iphdr *) skb->data;
2255 ip_proto = ip->protocol;
2256 addr1 = ip->saddr;
2257 addr2 = ip->daddr;
2258 ihl = ip->ihl;
2259 break;
2260 case __constant_htons(ETH_P_IPV6):
2261 if (!pskb_may_pull(skb, sizeof(*ip6)))
2262 goto done;
2263
2264 ip6 = (struct ipv6hdr *) skb->data;
2265 ip_proto = ip6->nexthdr;
2266 addr1 = ip6->saddr.s6_addr32[3];
2267 addr2 = ip6->daddr.s6_addr32[3];
2268 ihl = (40 >> 2);
2269 break;
2270 default:
2271 goto done;
2272 }
2273 ports = 0;
2274 switch (ip_proto) {
2275 case IPPROTO_TCP:
2276 case IPPROTO_UDP:
2277 case IPPROTO_DCCP:
2278 case IPPROTO_ESP:
2279 case IPPROTO_AH:
2280 case IPPROTO_SCTP:
2281 case IPPROTO_UDPLITE:
2282 if (pskb_may_pull(skb, (ihl * 4) + 4))
2283 ports = *((u32 *) (skb->data + (ihl * 4)));
2284 break;
2285
2286 default:
2287 break;
2288 }
2289
2290 skb->rxhash = jhash_3words(addr1, addr2, ports, hashrnd);
2291 if (!skb->rxhash)
2292 skb->rxhash = 1;
2293
2294got_hash:
fec5e652
TH
2295 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2296 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2297 if (flow_table && sock_flow_table) {
2298 u16 next_cpu;
2299 struct rps_dev_flow *rflow;
2300
2301 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2302 tcpu = rflow->cpu;
2303
2304 next_cpu = sock_flow_table->ents[skb->rxhash &
2305 sock_flow_table->mask];
2306
2307 /*
2308 * If the desired CPU (where last recvmsg was done) is
2309 * different from current CPU (one in the rx-queue flow
2310 * table entry), switch if one of the following holds:
2311 * - Current CPU is unset (equal to RPS_NO_CPU).
2312 * - Current CPU is offline.
2313 * - The current CPU's queue tail has advanced beyond the
2314 * last packet that was enqueued using this table entry.
2315 * This guarantees that all previous packets for the flow
2316 * have been dequeued, thus preserving in order delivery.
2317 */
2318 if (unlikely(tcpu != next_cpu) &&
2319 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2320 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2321 rflow->last_qtail)) >= 0)) {
2322 tcpu = rflow->cpu = next_cpu;
2323 if (tcpu != RPS_NO_CPU)
2324 rflow->last_qtail = per_cpu(softnet_data,
2325 tcpu).input_queue_head;
2326 }
2327 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2328 *rflowp = rflow;
2329 cpu = tcpu;
2330 goto done;
2331 }
2332 }
2333
0a9627f2
TH
2334 map = rcu_dereference(rxqueue->rps_map);
2335 if (map) {
fec5e652 2336 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
0a9627f2
TH
2337
2338 if (cpu_online(tcpu)) {
2339 cpu = tcpu;
2340 goto done;
2341 }
2342 }
2343
2344done:
0a9627f2
TH
2345 return cpu;
2346}
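/*
 * Editorial sketch (hypothetical, not in dev.c): how the rxhash computed
 * above selects a CPU from an RPS map.  It is the same multiply-shift
 * scaling used by skb_tx_hash(): with map->len == 2, any hash with the
 * top bit set picks map->cpus[1].
 */
static inline u16 example_rps_pick_cpu(const struct rps_map *map, u32 rxhash)
{
	return map->cpus[((u64)rxhash * map->len) >> 32];
}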
2347
0a9627f2
TH
2348/* Called from hardirq (IPI) context */
2349static void trigger_softirq(void *data)
2350{
2351 struct softnet_data *queue = data;
2352 __napi_schedule(&queue->backlog);
2353 __get_cpu_var(netdev_rx_stat).received_rps++;
2354}
fec5e652 2355#endif /* CONFIG_RPS */
0a9627f2
TH
2356
2357/*
2358 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2359 * queue (may be a remote CPU queue).
2360 */
fec5e652
TH
2361static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2362 unsigned int *qtail)
0a9627f2
TH
2363{
2364 struct softnet_data *queue;
2365 unsigned long flags;
2366
2367 queue = &per_cpu(softnet_data, cpu);
2368
2369 local_irq_save(flags);
2370 __get_cpu_var(netdev_rx_stat).total++;
2371
152102c7 2372 rps_lock(queue);
0a9627f2
TH
2373 if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
2374 if (queue->input_pkt_queue.qlen) {
2375enqueue:
2376 __skb_queue_tail(&queue->input_pkt_queue, skb);
fec5e652
TH
2377#ifdef CONFIG_RPS
2378 *qtail = queue->input_queue_head +
2379 queue->input_pkt_queue.qlen;
2380#endif
152102c7
CG
2381 rps_unlock(queue);
2382 local_irq_restore(flags);
0a9627f2
TH
2383 return NET_RX_SUCCESS;
2384 }
2385
2386 /* Schedule NAPI for backlog device */
2387 if (napi_schedule_prep(&queue->backlog)) {
df334545 2388#ifdef CONFIG_RPS
0a9627f2 2389 if (cpu != smp_processor_id()) {
88751275
ED
2390 struct softnet_data *myqueue;
2391
2392 myqueue = &__get_cpu_var(softnet_data);
2393 queue->rps_ipi_next = myqueue->rps_ipi_list;
2394 myqueue->rps_ipi_list = queue;
0a9627f2 2395
0a9627f2 2396 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
fec5e652
TH
2397 goto enqueue;
2398 }
1e94d72f 2399#endif
fec5e652 2400 __napi_schedule(&queue->backlog);
0a9627f2
TH
2401 }
2402 goto enqueue;
2403 }
2404
152102c7 2405 rps_unlock(queue);
0a9627f2
TH
2406
2407 __get_cpu_var(netdev_rx_stat).dropped++;
2408 local_irq_restore(flags);
2409
2410 kfree_skb(skb);
2411 return NET_RX_DROP;
2412}
1da177e4 2413
1da177e4
LT
2414/**
2415 * netif_rx - post buffer to the network code
2416 * @skb: buffer to post
2417 *
2418 * This function receives a packet from a device driver and queues it for
2419 * the upper (protocol) levels to process. It always succeeds. The buffer
2420 * may be dropped during processing for congestion control or by the
2421 * protocol layers.
2422 *
2423 * return values:
2424 * NET_RX_SUCCESS (no congestion)
1da177e4
LT
2425 * NET_RX_DROP (packet was dropped)
2426 *
2427 */
2428
2429int netif_rx(struct sk_buff *skb)
2430{
b0e28f1e 2431 int ret;
1da177e4
LT
2432
2433 /* if netpoll wants it, pretend we never saw it */
2434 if (netpoll_rx(skb))
2435 return NET_RX_DROP;
2436
b7aa0bf7 2437 if (!skb->tstamp.tv64)
a61bbcf2 2438 net_timestamp(skb);
1da177e4 2439
df334545 2440#ifdef CONFIG_RPS
b0e28f1e 2441 {
fec5e652 2442 struct rps_dev_flow voidflow, *rflow = &voidflow;
b0e28f1e
ED
2443 int cpu;
2444
2445 rcu_read_lock();
fec5e652
TH
2446
2447 cpu = get_rps_cpu(skb->dev, skb, &rflow);
b0e28f1e
ED
2448 if (cpu < 0)
2449 cpu = smp_processor_id();
fec5e652
TH
2450
2451 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2452
b0e28f1e
ED
2453 rcu_read_unlock();
2454 }
1e94d72f 2455#else
fec5e652
TH
2456 {
2457 unsigned int qtail;
2458 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2459 put_cpu();
2460 }
1e94d72f 2461#endif
b0e28f1e 2462 return ret;
1da177e4 2463}
d1b19dff 2464EXPORT_SYMBOL(netif_rx);
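/*
 * Illustrative sketch (hypothetical, not in dev.c): the classic non-NAPI
 * receive path.  A driver builds an skb in its interrupt handler, sets
 * the protocol with eth_type_trans() and hands the packet to netif_rx(),
 * which only queues it on a per-CPU backlog for later softirq work.
 * The copy-based RX scheme is an assumption for illustration.
 */
static void example_isr_rx(struct net_device *dev,
			   const void *data, unsigned int len)
{
	struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);

	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}
	memcpy(skb_put(skb, len), data, len);
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);
}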
1da177e4
LT
2465
2466int netif_rx_ni(struct sk_buff *skb)
2467{
2468 int err;
2469
2470 preempt_disable();
2471 err = netif_rx(skb);
2472 if (local_softirq_pending())
2473 do_softirq();
2474 preempt_enable();
2475
2476 return err;
2477}
1da177e4
LT
2478EXPORT_SYMBOL(netif_rx_ni);
2479
1da177e4
LT
2480static void net_tx_action(struct softirq_action *h)
2481{
2482 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2483
2484 if (sd->completion_queue) {
2485 struct sk_buff *clist;
2486
2487 local_irq_disable();
2488 clist = sd->completion_queue;
2489 sd->completion_queue = NULL;
2490 local_irq_enable();
2491
2492 while (clist) {
2493 struct sk_buff *skb = clist;
2494 clist = clist->next;
2495
547b792c 2496 WARN_ON(atomic_read(&skb->users));
1da177e4
LT
2497 __kfree_skb(skb);
2498 }
2499 }
2500
2501 if (sd->output_queue) {
37437bb2 2502 struct Qdisc *head;
1da177e4
LT
2503
2504 local_irq_disable();
2505 head = sd->output_queue;
2506 sd->output_queue = NULL;
2507 local_irq_enable();
2508
2509 while (head) {
37437bb2
DM
2510 struct Qdisc *q = head;
2511 spinlock_t *root_lock;
2512
1da177e4
LT
2513 head = head->next_sched;
2514
5fb66229 2515 root_lock = qdisc_lock(q);
37437bb2 2516 if (spin_trylock(root_lock)) {
def82a1d
JP
2517 smp_mb__before_clear_bit();
2518 clear_bit(__QDISC_STATE_SCHED,
2519 &q->state);
37437bb2
DM
2520 qdisc_run(q);
2521 spin_unlock(root_lock);
1da177e4 2522 } else {
195648bb 2523 if (!test_bit(__QDISC_STATE_DEACTIVATED,
e8a83e10 2524 &q->state)) {
195648bb 2525 __netif_reschedule(q);
e8a83e10
JP
2526 } else {
2527 smp_mb__before_clear_bit();
2528 clear_bit(__QDISC_STATE_SCHED,
2529 &q->state);
2530 }
1da177e4
LT
2531 }
2532 }
2533 }
2534}
2535
6f05f629
SH
2536static inline int deliver_skb(struct sk_buff *skb,
2537 struct packet_type *pt_prev,
2538 struct net_device *orig_dev)
1da177e4
LT
2539{
2540 atomic_inc(&skb->users);
f2ccd8fa 2541 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1da177e4
LT
2542}
2543
2544#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
da678292
MM
2545
2546#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
2547/* This hook is defined here for ATM LANE */
2548int (*br_fdb_test_addr_hook)(struct net_device *dev,
2549 unsigned char *addr) __read_mostly;
4fb019a0 2550EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
da678292 2551#endif
1da177e4 2552
6229e362
SH
2553/*
2554 * If bridge module is loaded call bridging hook.
2555 * returns NULL if packet was consumed.
2556 */
2557struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2558 struct sk_buff *skb) __read_mostly;
4fb019a0 2559EXPORT_SYMBOL_GPL(br_handle_frame_hook);
da678292 2560
6229e362
SH
2561static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2562 struct packet_type **pt_prev, int *ret,
2563 struct net_device *orig_dev)
1da177e4
LT
2564{
2565 struct net_bridge_port *port;
2566
6229e362
SH
2567 if (skb->pkt_type == PACKET_LOOPBACK ||
2568 (port = rcu_dereference(skb->dev->br_port)) == NULL)
2569 return skb;
1da177e4
LT
2570
2571 if (*pt_prev) {
6229e362 2572 *ret = deliver_skb(skb, *pt_prev, orig_dev);
1da177e4 2573 *pt_prev = NULL;
4ec93edb
YH
2574 }
2575
6229e362 2576 return br_handle_frame_hook(port, skb);
1da177e4
LT
2577}
2578#else
6229e362 2579#define handle_bridge(skb, pt_prev, ret, orig_dev) (skb)
1da177e4
LT
2580#endif
2581
b863ceb7
PM
2582#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2583struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
2584EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2585
2586static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2587 struct packet_type **pt_prev,
2588 int *ret,
2589 struct net_device *orig_dev)
2590{
2591 if (skb->dev->macvlan_port == NULL)
2592 return skb;
2593
2594 if (*pt_prev) {
2595 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2596 *pt_prev = NULL;
2597 }
2598 return macvlan_handle_frame_hook(skb);
2599}
2600#else
2601#define handle_macvlan(skb, pt_prev, ret, orig_dev) (skb)
2602#endif
2603
1da177e4
LT
2604#ifdef CONFIG_NET_CLS_ACT
 2605/* TODO: Maybe we should just force sch_ingress to be compiled in
 2606 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
 2607 * instructions (a compare and 2 extra stores) right now if we don't
 2608 * have it on but do have CONFIG_NET_CLS_ACT.
4ec93edb 2609 * NOTE: This doesn't stop any functionality; if you don't have
1da177e4
LT
 2610 * the ingress scheduler, you just can't add policies on ingress.
2611 *
2612 */
4ec93edb 2613static int ing_filter(struct sk_buff *skb)
1da177e4 2614{
1da177e4 2615 struct net_device *dev = skb->dev;
f697c3e8 2616 u32 ttl = G_TC_RTTL(skb->tc_verd);
555353cf
DM
2617 struct netdev_queue *rxq;
2618 int result = TC_ACT_OK;
2619 struct Qdisc *q;
4ec93edb 2620
f697c3e8
HX
2621 if (MAX_RED_LOOP < ttl++) {
2622 printk(KERN_WARNING
 2623 "Redir loop detected, dropping packet (%d->%d)\n",
8964be4a 2624 skb->skb_iif, dev->ifindex);
f697c3e8
HX
2625 return TC_ACT_SHOT;
2626 }
1da177e4 2627
f697c3e8
HX
2628 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2629 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
1da177e4 2630
555353cf
DM
2631 rxq = &dev->rx_queue;
2632
83874000 2633 q = rxq->qdisc;
8d50b53d 2634 if (q != &noop_qdisc) {
83874000 2635 spin_lock(qdisc_lock(q));
a9312ae8
DM
2636 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2637 result = qdisc_enqueue_root(skb, q);
83874000
DM
2638 spin_unlock(qdisc_lock(q));
2639 }
f697c3e8
HX
2640
2641 return result;
2642}
86e65da9 2643
f697c3e8
HX
2644static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2645 struct packet_type **pt_prev,
2646 int *ret, struct net_device *orig_dev)
2647{
8d50b53d 2648 if (skb->dev->rx_queue.qdisc == &noop_qdisc)
f697c3e8 2649 goto out;
1da177e4 2650
f697c3e8
HX
2651 if (*pt_prev) {
2652 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2653 *pt_prev = NULL;
2654 } else {
2655 /* Huh? Why does turning on AF_PACKET affect this? */
2656 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
1da177e4
LT
2657 }
2658
f697c3e8
HX
2659 switch (ing_filter(skb)) {
2660 case TC_ACT_SHOT:
2661 case TC_ACT_STOLEN:
2662 kfree_skb(skb);
2663 return NULL;
2664 }
2665
2666out:
2667 skb->tc_verd = 0;
2668 return skb;
1da177e4
LT
2669}
2670#endif
2671
bc1d0411
PM
2672/*
2673 * netif_nit_deliver - deliver received packets to network taps
2674 * @skb: buffer
2675 *
2676 * This function is used to deliver incoming packets to network
2677 * taps. It should be used when the normal netif_receive_skb path
2678 * is bypassed, for example because of VLAN acceleration.
2679 */
2680void netif_nit_deliver(struct sk_buff *skb)
2681{
2682 struct packet_type *ptype;
2683
2684 if (list_empty(&ptype_all))
2685 return;
2686
2687 skb_reset_network_header(skb);
2688 skb_reset_transport_header(skb);
2689 skb->mac_len = skb->network_header - skb->mac_header;
2690
2691 rcu_read_lock();
2692 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2693 if (!ptype->dev || ptype->dev == skb->dev)
2694 deliver_skb(skb, ptype, skb->dev);
2695 }
2696 rcu_read_unlock();
2697}
2698
acbbc071
ED
2699static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2700 struct net_device *master)
2701{
2702 if (skb->pkt_type == PACKET_HOST) {
2703 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2704
2705 memcpy(dest, master->dev_addr, ETH_ALEN);
2706 }
2707}
2708
2709/* On bonding slaves other than the currently active slave, suppress
2710 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2711 * ARP on active-backup slaves with arp_validate enabled.
2712 */
2713int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2714{
2715 struct net_device *dev = skb->dev;
2716
2717 if (master->priv_flags & IFF_MASTER_ARPMON)
2718 dev->last_rx = jiffies;
2719
2720 if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
 2721 /* Undo address mangling. The local destination address
 2722 * will always be the one the master has. Provides the right
2723 * functionality in a bridge.
2724 */
2725 skb_bond_set_mac_by_master(skb, master);
2726 }
2727
2728 if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2729 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2730 skb->protocol == __cpu_to_be16(ETH_P_ARP))
2731 return 0;
2732
2733 if (master->priv_flags & IFF_MASTER_ALB) {
2734 if (skb->pkt_type != PACKET_BROADCAST &&
2735 skb->pkt_type != PACKET_MULTICAST)
2736 return 0;
2737 }
2738 if (master->priv_flags & IFF_MASTER_8023AD &&
2739 skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2740 return 0;
2741
2742 return 1;
2743 }
2744 return 0;
2745}
2746EXPORT_SYMBOL(__skb_bond_should_drop);
2747
10f744d2 2748static int __netif_receive_skb(struct sk_buff *skb)
1da177e4
LT
2749{
2750 struct packet_type *ptype, *pt_prev;
f2ccd8fa 2751 struct net_device *orig_dev;
0641e4fb 2752 struct net_device *master;
0d7a3681 2753 struct net_device *null_or_orig;
ca8d9ea3 2754 struct net_device *null_or_bond;
1da177e4 2755 int ret = NET_RX_DROP;
252e3346 2756 __be16 type;
1da177e4 2757
81bbb3d4
ED
2758 if (!skb->tstamp.tv64)
2759 net_timestamp(skb);
2760
05423b24 2761 if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
9b22ea56
PM
2762 return NET_RX_SUCCESS;
2763
1da177e4 2764 /* if we've gotten here through NAPI, check netpoll */
bea3348e 2765 if (netpoll_receive_skb(skb))
1da177e4
LT
2766 return NET_RX_DROP;
2767
8964be4a
ED
2768 if (!skb->skb_iif)
2769 skb->skb_iif = skb->dev->ifindex;
86e65da9 2770
0d7a3681 2771 null_or_orig = NULL;
cc9bd5ce 2772 orig_dev = skb->dev;
0641e4fb
ED
2773 master = ACCESS_ONCE(orig_dev->master);
2774 if (master) {
2775 if (skb_bond_should_drop(skb, master))
0d7a3681
JE
2776 null_or_orig = orig_dev; /* deliver only exact match */
2777 else
0641e4fb 2778 skb->dev = master;
cc9bd5ce 2779 }
8f903c70 2780
1da177e4
LT
2781 __get_cpu_var(netdev_rx_stat).total++;
2782
c1d2bbe1 2783 skb_reset_network_header(skb);
badff6d0 2784 skb_reset_transport_header(skb);
b0e380b1 2785 skb->mac_len = skb->network_header - skb->mac_header;
1da177e4
LT
2786
2787 pt_prev = NULL;
2788
2789 rcu_read_lock();
2790
2791#ifdef CONFIG_NET_CLS_ACT
2792 if (skb->tc_verd & TC_NCLS) {
2793 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2794 goto ncls;
2795 }
2796#endif
2797
2798 list_for_each_entry_rcu(ptype, &ptype_all, list) {
f982307f
JE
2799 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2800 ptype->dev == orig_dev) {
4ec93edb 2801 if (pt_prev)
f2ccd8fa 2802 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
2803 pt_prev = ptype;
2804 }
2805 }
2806
2807#ifdef CONFIG_NET_CLS_ACT
f697c3e8
HX
2808 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2809 if (!skb)
1da177e4 2810 goto out;
1da177e4
LT
2811ncls:
2812#endif
2813
6229e362 2814 skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
b863ceb7
PM
2815 if (!skb)
2816 goto out;
2817 skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
6229e362 2818 if (!skb)
1da177e4
LT
2819 goto out;
2820
1f3c8804
AG
2821 /*
2822 * Make sure frames received on VLAN interfaces stacked on
2823 * bonding interfaces still make their way to any base bonding
2824 * device that may have registered for a specific ptype. The
2825 * handler may have to adjust skb->dev and orig_dev.
1f3c8804 2826 */
ca8d9ea3 2827 null_or_bond = NULL;
1f3c8804
AG
2828 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
2829 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
ca8d9ea3 2830 null_or_bond = vlan_dev_real_dev(skb->dev);
1f3c8804
AG
2831 }
2832
1da177e4 2833 type = skb->protocol;
82d8a867
PE
2834 list_for_each_entry_rcu(ptype,
2835 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1f3c8804 2836 if (ptype->type == type && (ptype->dev == null_or_orig ||
ca8d9ea3
AG
2837 ptype->dev == skb->dev || ptype->dev == orig_dev ||
2838 ptype->dev == null_or_bond)) {
4ec93edb 2839 if (pt_prev)
f2ccd8fa 2840 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
2841 pt_prev = ptype;
2842 }
2843 }
2844
2845 if (pt_prev) {
f2ccd8fa 2846 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1da177e4
LT
2847 } else {
2848 kfree_skb(skb);
 2849 /* Jamal, now you will not be able to escape explaining
 2850 * to me how you were going to use this. :-)
2851 */
2852 ret = NET_RX_DROP;
2853 }
2854
2855out:
2856 rcu_read_unlock();
2857 return ret;
2858}
0a9627f2
TH
2859
2860/**
2861 * netif_receive_skb - process receive buffer from network
2862 * @skb: buffer to process
2863 *
2864 * netif_receive_skb() is the main receive data processing function.
2865 * It always succeeds. The buffer may be dropped during processing
2866 * for congestion control or by the protocol layers.
2867 *
2868 * This function may only be called from softirq context and interrupts
2869 * should be enabled.
2870 *
2871 * Return values (usually ignored):
2872 * NET_RX_SUCCESS: no congestion
2873 * NET_RX_DROP: packet was dropped
2874 */
2875int netif_receive_skb(struct sk_buff *skb)
2876{
df334545 2877#ifdef CONFIG_RPS
fec5e652
TH
2878 struct rps_dev_flow voidflow, *rflow = &voidflow;
2879 int cpu, ret;
2880
2881 rcu_read_lock();
0a9627f2 2882
fec5e652 2883 cpu = get_rps_cpu(skb->dev, skb, &rflow);
0a9627f2 2884
fec5e652
TH
2885 if (cpu >= 0) {
2886 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2887 rcu_read_unlock();
2888 } else {
2889 rcu_read_unlock();
2890 ret = __netif_receive_skb(skb);
2891 }
2892
2893 return ret;
1e94d72f
TH
2894#else
2895 return __netif_receive_skb(skb);
2896#endif
0a9627f2 2897}
d1b19dff 2898EXPORT_SYMBOL(netif_receive_skb);
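/*
 * Illustrative sketch (hypothetical, not in dev.c): a NAPI driver
 * delivers packets from its ->poll() routine with netif_receive_skb()
 * (softirq context, interrupts enabled).  example_rx_one() stands in for
 * hardware-specific dequeue and is purely hypothetical.
 */
static int example_poll(struct napi_struct *napi, int budget)
{
	struct sk_buff *skb;
	int work = 0;

	while (work < budget && (skb = example_rx_one(napi->dev)) != NULL) {
		netif_receive_skb(skb);
		work++;
	}
	if (work < budget)
		napi_complete(napi);
	return work;
}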
1da177e4 2899
88751275
ED
2900/* Network device is going away, flush any packets still pending
2901 * Called with irqs disabled.
2902 */
152102c7 2903static void flush_backlog(void *arg)
6e583ce5 2904{
152102c7
CG
2905 struct net_device *dev = arg;
2906 struct softnet_data *queue = &__get_cpu_var(softnet_data);
6e583ce5
SH
2907 struct sk_buff *skb, *tmp;
2908
152102c7 2909 rps_lock(queue);
6e583ce5
SH
2910 skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2911 if (skb->dev == dev) {
2912 __skb_unlink(skb, &queue->input_pkt_queue);
2913 kfree_skb(skb);
fec5e652 2914 incr_input_queue_head(queue);
6e583ce5 2915 }
152102c7 2916 rps_unlock(queue);
6e583ce5
SH
2917}
2918
d565b0a1
HX
2919static int napi_gro_complete(struct sk_buff *skb)
2920{
2921 struct packet_type *ptype;
2922 __be16 type = skb->protocol;
2923 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2924 int err = -ENOENT;
2925
fc59f9a3
HX
2926 if (NAPI_GRO_CB(skb)->count == 1) {
2927 skb_shinfo(skb)->gso_size = 0;
d565b0a1 2928 goto out;
fc59f9a3 2929 }
d565b0a1
HX
2930
2931 rcu_read_lock();
2932 list_for_each_entry_rcu(ptype, head, list) {
2933 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2934 continue;
2935
2936 err = ptype->gro_complete(skb);
2937 break;
2938 }
2939 rcu_read_unlock();
2940
2941 if (err) {
2942 WARN_ON(&ptype->list == head);
2943 kfree_skb(skb);
2944 return NET_RX_SUCCESS;
2945 }
2946
2947out:
d565b0a1
HX
2948 return netif_receive_skb(skb);
2949}
2950
11380a4b 2951static void napi_gro_flush(struct napi_struct *napi)
d565b0a1
HX
2952{
2953 struct sk_buff *skb, *next;
2954
2955 for (skb = napi->gro_list; skb; skb = next) {
2956 next = skb->next;
2957 skb->next = NULL;
2958 napi_gro_complete(skb);
2959 }
2960
4ae5544f 2961 napi->gro_count = 0;
d565b0a1
HX
2962 napi->gro_list = NULL;
2963}
d565b0a1 2964
5b252f0c 2965enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
d565b0a1
HX
2966{
2967 struct sk_buff **pp = NULL;
2968 struct packet_type *ptype;
2969 __be16 type = skb->protocol;
2970 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
0da2afd5 2971 int same_flow;
d565b0a1 2972 int mac_len;
5b252f0c 2973 enum gro_result ret;
d565b0a1
HX
2974
2975 if (!(skb->dev->features & NETIF_F_GRO))
2976 goto normal;
2977
4cf704fb 2978 if (skb_is_gso(skb) || skb_has_frags(skb))
f17f5c91
HX
2979 goto normal;
2980
d565b0a1
HX
2981 rcu_read_lock();
2982 list_for_each_entry_rcu(ptype, head, list) {
d565b0a1
HX
2983 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2984 continue;
2985
86911732 2986 skb_set_network_header(skb, skb_gro_offset(skb));
d565b0a1
HX
2987 mac_len = skb->network_header - skb->mac_header;
2988 skb->mac_len = mac_len;
2989 NAPI_GRO_CB(skb)->same_flow = 0;
2990 NAPI_GRO_CB(skb)->flush = 0;
5d38a079 2991 NAPI_GRO_CB(skb)->free = 0;
d565b0a1 2992
d565b0a1
HX
2993 pp = ptype->gro_receive(&napi->gro_list, skb);
2994 break;
2995 }
2996 rcu_read_unlock();
2997
2998 if (&ptype->list == head)
2999 goto normal;
3000
0da2afd5 3001 same_flow = NAPI_GRO_CB(skb)->same_flow;
5d0d9be8 3002 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
0da2afd5 3003
d565b0a1
HX
3004 if (pp) {
3005 struct sk_buff *nskb = *pp;
3006
3007 *pp = nskb->next;
3008 nskb->next = NULL;
3009 napi_gro_complete(nskb);
4ae5544f 3010 napi->gro_count--;
d565b0a1
HX
3011 }
3012
0da2afd5 3013 if (same_flow)
d565b0a1
HX
3014 goto ok;
3015
4ae5544f 3016 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
d565b0a1 3017 goto normal;
d565b0a1 3018
4ae5544f 3019 napi->gro_count++;
d565b0a1 3020 NAPI_GRO_CB(skb)->count = 1;
86911732 3021 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
d565b0a1
HX
3022 skb->next = napi->gro_list;
3023 napi->gro_list = skb;
5d0d9be8 3024 ret = GRO_HELD;
d565b0a1 3025
ad0f9904 3026pull:
cb18978c
HX
3027 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3028 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3029
3030 BUG_ON(skb->end - skb->tail < grow);
3031
3032 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3033
3034 skb->tail += grow;
3035 skb->data_len -= grow;
3036
3037 skb_shinfo(skb)->frags[0].page_offset += grow;
3038 skb_shinfo(skb)->frags[0].size -= grow;
3039
3040 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3041 put_page(skb_shinfo(skb)->frags[0].page);
3042 memmove(skb_shinfo(skb)->frags,
3043 skb_shinfo(skb)->frags + 1,
3044 --skb_shinfo(skb)->nr_frags);
3045 }
ad0f9904
HX
3046 }
3047
d565b0a1 3048ok:
5d0d9be8 3049 return ret;
d565b0a1
HX
3050
3051normal:
ad0f9904
HX
3052 ret = GRO_NORMAL;
3053 goto pull;
5d38a079 3054}
96e93eab
HX
3055EXPORT_SYMBOL(dev_gro_receive);
3056
5b252f0c
BH
3057static gro_result_t
3058__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
96e93eab
HX
3059{
3060 struct sk_buff *p;
3061
d1c76af9
HX
3062 if (netpoll_rx_on(skb))
3063 return GRO_NORMAL;
3064
96e93eab 3065 for (p = napi->gro_list; p; p = p->next) {
f64f9e71
JP
3066 NAPI_GRO_CB(p)->same_flow =
3067 (p->dev == skb->dev) &&
3068 !compare_ether_header(skb_mac_header(p),
3069 skb_gro_mac_header(skb));
96e93eab
HX
3070 NAPI_GRO_CB(p)->flush = 0;
3071 }
3072
3073 return dev_gro_receive(napi, skb);
3074}
5d38a079 3075
c7c4b3b6 3076gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5d38a079 3077{
5d0d9be8
HX
3078 switch (ret) {
3079 case GRO_NORMAL:
c7c4b3b6
BH
3080 if (netif_receive_skb(skb))
3081 ret = GRO_DROP;
3082 break;
5d38a079 3083
5d0d9be8 3084 case GRO_DROP:
5d0d9be8 3085 case GRO_MERGED_FREE:
5d38a079
HX
3086 kfree_skb(skb);
3087 break;
5b252f0c
BH
3088
3089 case GRO_HELD:
3090 case GRO_MERGED:
3091 break;
5d38a079
HX
3092 }
3093
c7c4b3b6 3094 return ret;
5d0d9be8
HX
3095}
3096EXPORT_SYMBOL(napi_skb_finish);
3097
78a478d0
HX
3098void skb_gro_reset_offset(struct sk_buff *skb)
3099{
3100 NAPI_GRO_CB(skb)->data_offset = 0;
3101 NAPI_GRO_CB(skb)->frag0 = NULL;
7489594c 3102 NAPI_GRO_CB(skb)->frag0_len = 0;
78a478d0 3103
78d3fd0b 3104 if (skb->mac_header == skb->tail &&
7489594c 3105 !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
78a478d0
HX
3106 NAPI_GRO_CB(skb)->frag0 =
3107 page_address(skb_shinfo(skb)->frags[0].page) +
3108 skb_shinfo(skb)->frags[0].page_offset;
7489594c
HX
3109 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3110 }
78a478d0
HX
3111}
3112EXPORT_SYMBOL(skb_gro_reset_offset);
3113
c7c4b3b6 3114gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5d0d9be8 3115{
86911732
HX
3116 skb_gro_reset_offset(skb);
3117
5d0d9be8 3118 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
d565b0a1
HX
3119}
3120EXPORT_SYMBOL(napi_gro_receive);
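/*
 * Illustrative sketch (hypothetical, not in dev.c): a GRO-capable driver
 * calls napi_gro_receive() from its ->poll() routine in place of
 * netif_receive_skb(); merged or held skbs are flushed when
 * napi_complete() runs napi_gro_flush().
 */
static inline void example_gro_deliver(struct napi_struct *napi,
				       struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, napi->dev);
	napi_gro_receive(napi, skb);
}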
3121
96e93eab
HX
3122void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3123{
96e93eab
HX
3124 __skb_pull(skb, skb_headlen(skb));
3125 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3126
3127 napi->skb = skb;
3128}
3129EXPORT_SYMBOL(napi_reuse_skb);
3130
76620aaf 3131struct sk_buff *napi_get_frags(struct napi_struct *napi)
5d38a079 3132{
5d38a079 3133 struct sk_buff *skb = napi->skb;
5d38a079
HX
3134
3135 if (!skb) {
89d71a66
ED
3136 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3137 if (skb)
3138 napi->skb = skb;
80595d59 3139 }
96e93eab
HX
3140 return skb;
3141}
76620aaf 3142EXPORT_SYMBOL(napi_get_frags);
96e93eab 3143
c7c4b3b6
BH
3144gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3145 gro_result_t ret)
96e93eab 3146{
5d0d9be8
HX
3147 switch (ret) {
3148 case GRO_NORMAL:
86911732 3149 case GRO_HELD:
e76b69cc 3150 skb->protocol = eth_type_trans(skb, skb->dev);
86911732 3151
c7c4b3b6
BH
3152 if (ret == GRO_HELD)
3153 skb_gro_pull(skb, -ETH_HLEN);
3154 else if (netif_receive_skb(skb))
3155 ret = GRO_DROP;
86911732 3156 break;
5d38a079 3157
5d0d9be8 3158 case GRO_DROP:
5d0d9be8
HX
3159 case GRO_MERGED_FREE:
3160 napi_reuse_skb(napi, skb);
3161 break;
5b252f0c
BH
3162
3163 case GRO_MERGED:
3164 break;
5d0d9be8 3165 }
5d38a079 3166
c7c4b3b6 3167 return ret;
5d38a079 3168}
5d0d9be8
HX
3169EXPORT_SYMBOL(napi_frags_finish);
3170
76620aaf
HX
3171struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3172{
3173 struct sk_buff *skb = napi->skb;
3174 struct ethhdr *eth;
a5b1cf28
HX
3175 unsigned int hlen;
3176 unsigned int off;
76620aaf
HX
3177
3178 napi->skb = NULL;
3179
3180 skb_reset_mac_header(skb);
3181 skb_gro_reset_offset(skb);
3182
a5b1cf28
HX
3183 off = skb_gro_offset(skb);
3184 hlen = off + sizeof(*eth);
3185 eth = skb_gro_header_fast(skb, off);
3186 if (skb_gro_header_hard(skb, hlen)) {
3187 eth = skb_gro_header_slow(skb, hlen, off);
3188 if (unlikely(!eth)) {
3189 napi_reuse_skb(napi, skb);
3190 skb = NULL;
3191 goto out;
3192 }
76620aaf
HX
3193 }
3194
3195 skb_gro_pull(skb, sizeof(*eth));
3196
3197 /*
3198 * This works because the only protocols we care about don't require
3199 * special handling. We'll fix it up properly at the end.
3200 */
3201 skb->protocol = eth->h_proto;
3202
3203out:
3204 return skb;
3205}
3206EXPORT_SYMBOL(napi_frags_skb);
3207
c7c4b3b6 3208gro_result_t napi_gro_frags(struct napi_struct *napi)
5d0d9be8 3209{
76620aaf 3210 struct sk_buff *skb = napi_frags_skb(napi);
5d0d9be8
HX
3211
3212 if (!skb)
c7c4b3b6 3213 return GRO_DROP;
5d0d9be8
HX
3214
3215 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3216}
5d38a079
HX
3217EXPORT_SYMBOL(napi_gro_frags);
3218
bea3348e 3219static int process_backlog(struct napi_struct *napi, int quota)
1da177e4
LT
3220{
3221 int work = 0;
1da177e4 3222 struct softnet_data *queue = &__get_cpu_var(softnet_data);
1da177e4 3223
bea3348e
SH
3224 napi->weight = weight_p;
3225 do {
1da177e4 3226 struct sk_buff *skb;
1da177e4 3227
152102c7
CG
3228 local_irq_disable();
3229 rps_lock(queue);
1da177e4 3230 skb = __skb_dequeue(&queue->input_pkt_queue);
bea3348e 3231 if (!skb) {
8f1ead2d 3232 __napi_complete(napi);
5a6d234e 3233 rps_unlock(queue);
e4008276 3234 local_irq_enable();
8f1ead2d 3235 break;
bea3348e 3236 }
fec5e652 3237 incr_input_queue_head(queue);
152102c7
CG
3238 rps_unlock(queue);
3239 local_irq_enable();
1da177e4 3240
0a9627f2 3241 __netif_receive_skb(skb);
9958da05 3242 } while (++work < quota);
1da177e4 3243
bea3348e
SH
3244 return work;
3245}
1da177e4 3246
bea3348e
SH
3247/**
3248 * __napi_schedule - schedule for receive
c4ea43c5 3249 * @n: entry to schedule
bea3348e
SH
3250 *
3251 * The entry's receive function will be scheduled to run
3252 */
b5606c2d 3253void __napi_schedule(struct napi_struct *n)
bea3348e
SH
3254{
3255 unsigned long flags;
1da177e4 3256
bea3348e
SH
3257 local_irq_save(flags);
3258 list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
3259 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3260 local_irq_restore(flags);
1da177e4 3261}
bea3348e
SH
3262EXPORT_SYMBOL(__napi_schedule);
3263
d565b0a1
HX
3264void __napi_complete(struct napi_struct *n)
3265{
3266 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3267 BUG_ON(n->gro_list);
3268
3269 list_del(&n->poll_list);
3270 smp_mb__before_clear_bit();
3271 clear_bit(NAPI_STATE_SCHED, &n->state);
3272}
3273EXPORT_SYMBOL(__napi_complete);
3274
3275void napi_complete(struct napi_struct *n)
3276{
3277 unsigned long flags;
3278
3279 /*
3280 * don't let napi dequeue from the cpu poll list
 3281 * just in case it's running on a different cpu
3282 */
3283 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3284 return;
3285
3286 napi_gro_flush(n);
3287 local_irq_save(flags);
3288 __napi_complete(n);
3289 local_irq_restore(flags);
3290}
3291EXPORT_SYMBOL(napi_complete);
3292
3293void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3294 int (*poll)(struct napi_struct *, int), int weight)
3295{
3296 INIT_LIST_HEAD(&napi->poll_list);
4ae5544f 3297 napi->gro_count = 0;
d565b0a1 3298 napi->gro_list = NULL;
5d38a079 3299 napi->skb = NULL;
d565b0a1
HX
3300 napi->poll = poll;
3301 napi->weight = weight;
3302 list_add(&napi->dev_list, &dev->napi_list);
d565b0a1 3303 napi->dev = dev;
5d38a079 3304#ifdef CONFIG_NETPOLL
d565b0a1
HX
3305 spin_lock_init(&napi->poll_lock);
3306 napi->poll_owner = -1;
3307#endif
3308 set_bit(NAPI_STATE_SCHED, &napi->state);
3309}
3310EXPORT_SYMBOL(netif_napi_add);
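/*
 * Illustrative sketch (hypothetical, not in dev.c): a driver registers
 * its poll routine once at setup time; a weight of 64 is a common
 * choice.  example_poll() is the hypothetical poll routine sketched
 * earlier, and napi_enable() is normally called from ndo_open.
 */
static void example_setup_napi(struct net_device *dev, struct napi_struct *napi)
{
	netif_napi_add(dev, napi, example_poll, 64);
	napi_enable(napi);
}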
3311
3312void netif_napi_del(struct napi_struct *napi)
3313{
3314 struct sk_buff *skb, *next;
3315
d7b06636 3316 list_del_init(&napi->dev_list);
76620aaf 3317 napi_free_frags(napi);
d565b0a1
HX
3318
3319 for (skb = napi->gro_list; skb; skb = next) {
3320 next = skb->next;
3321 skb->next = NULL;
3322 kfree_skb(skb);
3323 }
3324
3325 napi->gro_list = NULL;
4ae5544f 3326 napi->gro_count = 0;
d565b0a1
HX
3327}
3328EXPORT_SYMBOL(netif_napi_del);
3329
0a9627f2 3330/*
88751275
ED
 3331 * net_rps_action sends any pending IPIs for RPS.
3332 * Note: called with local irq disabled, but exits with local irq enabled.
0a9627f2 3333 */
88751275 3334static void net_rps_action(void)
0a9627f2 3335{
88751275
ED
3336#ifdef CONFIG_RPS
3337 struct softnet_data *locqueue = &__get_cpu_var(softnet_data);
3338 struct softnet_data *remqueue = locqueue->rps_ipi_list;
0a9627f2 3339
88751275
ED
3340 if (remqueue) {
3341 locqueue->rps_ipi_list = NULL;
3342
3343 local_irq_enable();
3344
 3345 /* Send pending IPIs to kick RPS processing on remote CPUs. */
3346 while (remqueue) {
3347 struct softnet_data *next = remqueue->rps_ipi_next;
3348 if (cpu_online(remqueue->cpu))
3349 __smp_call_function_single(remqueue->cpu,
3350 &remqueue->csd, 0);
3351 remqueue = next;
3352 }
3353 } else
1e94d72f 3354#endif
88751275
ED
3355 local_irq_enable();
3356}
1da177e4
LT
3357
3358static void net_rx_action(struct softirq_action *h)
3359{
bea3348e 3360 struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
24f8b238 3361 unsigned long time_limit = jiffies + 2;
51b0bded 3362 int budget = netdev_budget;
53fb95d3
MM
3363 void *have;
3364
1da177e4
LT
3365 local_irq_disable();
3366
bea3348e
SH
3367 while (!list_empty(list)) {
3368 struct napi_struct *n;
3369 int work, weight;
1da177e4 3370
bea3348e 3371 /* If the softirq window is exhausted then punt.
24f8b238
SH
 3372 * Allow this to run for 2 jiffies, since that allows
3373 * an average latency of 1.5/HZ.
bea3348e 3374 */
24f8b238 3375 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
1da177e4
LT
3376 goto softnet_break;
3377
3378 local_irq_enable();
3379
bea3348e
SH
3380 /* Even though interrupts have been re-enabled, this
3381 * access is safe because interrupts can only add new
3382 * entries to the tail of this list, and only ->poll()
3383 * calls can remove this head entry from the list.
3384 */
e5e26d75 3385 n = list_first_entry(list, struct napi_struct, poll_list);
1da177e4 3386
bea3348e
SH
3387 have = netpoll_poll_lock(n);
3388
3389 weight = n->weight;
3390
0a7606c1
DM
3391 /* This NAPI_STATE_SCHED test is for avoiding a race
3392 * with netpoll's poll_napi(). Only the entity which
3393 * obtains the lock and sees NAPI_STATE_SCHED set will
3394 * actually make the ->poll() call. Therefore we avoid
 3395 * accidentally calling ->poll() when NAPI is not scheduled.
3396 */
3397 work = 0;
4ea7e386 3398 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
0a7606c1 3399 work = n->poll(n, weight);
4ea7e386
NH
3400 trace_napi_poll(n);
3401 }
bea3348e
SH
3402
3403 WARN_ON_ONCE(work > weight);
3404
3405 budget -= work;
3406
3407 local_irq_disable();
3408
3409 /* Drivers must not modify the NAPI state if they
3410 * consume the entire weight. In such cases this code
3411 * still "owns" the NAPI instance and therefore can
3412 * move the instance around on the list at-will.
3413 */
fed17f30 3414 if (unlikely(work == weight)) {
ff780cd8
HX
3415 if (unlikely(napi_disable_pending(n))) {
3416 local_irq_enable();
3417 napi_complete(n);
3418 local_irq_disable();
3419 } else
fed17f30
DM
3420 list_move_tail(&n->poll_list, list);
3421 }
bea3348e
SH
3422
3423 netpoll_poll_unlock(have);
1da177e4
LT
3424 }
3425out:
88751275 3426 net_rps_action();
0a9627f2 3427
db217334
CL
3428#ifdef CONFIG_NET_DMA
3429 /*
3430 * There may not be any more sk_buffs coming right now, so push
3431 * any pending DMA copies to hardware
3432 */
2ba05622 3433 dma_issue_pending_all();
db217334 3434#endif
bea3348e 3435
1da177e4
LT
3436 return;
3437
3438softnet_break:
3439 __get_cpu_var(netdev_rx_stat).time_squeeze++;
3440 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3441 goto out;
3442}
3443
d1b19dff 3444static gifconf_func_t *gifconf_list[NPROTO];
1da177e4
LT
3445
3446/**
3447 * register_gifconf - register a SIOCGIF handler
3448 * @family: Address family
3449 * @gifconf: Function handler
3450 *
3451 * Register protocol dependent address dumping routines. The handler
3452 * that is passed must not be freed or reused until it has been replaced
3453 * by another handler.
3454 */
d1b19dff 3455int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
1da177e4
LT
3456{
3457 if (family >= NPROTO)
3458 return -EINVAL;
3459 gifconf_list[family] = gifconf;
3460 return 0;
3461}
d1b19dff 3462EXPORT_SYMBOL(register_gifconf);
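/*
 * Illustrative sketch (hypothetical, not in dev.c): an address family
 * registers its SIOCGIFCONF dumper once at init time.  example_gifconf()
 * is a hypothetical gifconf_func_t; AF_INET registers its own handler in
 * this manner.
 */
static int __init example_gifconf_init(void)
{
	return register_gifconf(PF_INET, example_gifconf);
}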
1da177e4
LT
3463
3464
3465/*
3466 * Map an interface index to its name (SIOCGIFNAME)
3467 */
3468
3469/*
3470 * We need this ioctl for efficient implementation of the
3471 * if_indextoname() function required by the IPv6 API. Without
3472 * it, we would have to search all the interfaces to find a
3473 * match. --pb
3474 */
3475
881d966b 3476static int dev_ifname(struct net *net, struct ifreq __user *arg)
1da177e4
LT
3477{
3478 struct net_device *dev;
3479 struct ifreq ifr;
3480
3481 /*
3482 * Fetch the caller's info block.
3483 */
3484
3485 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3486 return -EFAULT;
3487
fb699dfd
ED
3488 rcu_read_lock();
3489 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
1da177e4 3490 if (!dev) {
fb699dfd 3491 rcu_read_unlock();
1da177e4
LT
3492 return -ENODEV;
3493 }
3494
3495 strcpy(ifr.ifr_name, dev->name);
fb699dfd 3496 rcu_read_unlock();
1da177e4
LT
3497
3498 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3499 return -EFAULT;
3500 return 0;
3501}
3502
3503/*
3504 * Perform a SIOCGIFCONF call. This structure will change
3505 * size eventually, and there is nothing I can do about it.
3506 * Thus we will need a 'compatibility mode'.
3507 */
3508
881d966b 3509static int dev_ifconf(struct net *net, char __user *arg)
1da177e4
LT
3510{
3511 struct ifconf ifc;
3512 struct net_device *dev;
3513 char __user *pos;
3514 int len;
3515 int total;
3516 int i;
3517
3518 /*
3519 * Fetch the caller's info block.
3520 */
3521
3522 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3523 return -EFAULT;
3524
3525 pos = ifc.ifc_buf;
3526 len = ifc.ifc_len;
3527
3528 /*
3529 * Loop over the interfaces, and write an info block for each.
3530 */
3531
3532 total = 0;
881d966b 3533 for_each_netdev(net, dev) {
1da177e4
LT
3534 for (i = 0; i < NPROTO; i++) {
3535 if (gifconf_list[i]) {
3536 int done;
3537 if (!pos)
3538 done = gifconf_list[i](dev, NULL, 0);
3539 else
3540 done = gifconf_list[i](dev, pos + total,
3541 len - total);
3542 if (done < 0)
3543 return -EFAULT;
3544 total += done;
3545 }
3546 }
4ec93edb 3547 }
1da177e4
LT
3548
3549 /*
3550 * All done. Write the updated control block back to the caller.
3551 */
3552 ifc.ifc_len = total;
3553
3554 /*
3555 * Both BSD and Solaris return 0 here, so we do too.
3556 */
3557 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3558}
3559
3560#ifdef CONFIG_PROC_FS
3561/*
3562 * This is invoked by the /proc filesystem handler to display a device
3563 * in detail.
3564 */
7562f876 3565void *dev_seq_start(struct seq_file *seq, loff_t *pos)
c6d14c84 3566 __acquires(RCU)
1da177e4 3567{
e372c414 3568 struct net *net = seq_file_net(seq);
7562f876 3569 loff_t off;
1da177e4 3570 struct net_device *dev;
1da177e4 3571
c6d14c84 3572 rcu_read_lock();
7562f876
PE
3573 if (!*pos)
3574 return SEQ_START_TOKEN;
1da177e4 3575
7562f876 3576 off = 1;
c6d14c84 3577 for_each_netdev_rcu(net, dev)
7562f876
PE
3578 if (off++ == *pos)
3579 return dev;
1da177e4 3580
7562f876 3581 return NULL;
1da177e4
LT
3582}
3583
3584void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3585{
c6d14c84
ED
3586 struct net_device *dev = (v == SEQ_START_TOKEN) ?
3587 first_net_device(seq_file_net(seq)) :
3588 next_net_device((struct net_device *)v);
3589
1da177e4 3590 ++*pos;
c6d14c84 3591 return rcu_dereference(dev);
1da177e4
LT
3592}
3593
3594void dev_seq_stop(struct seq_file *seq, void *v)
c6d14c84 3595 __releases(RCU)
1da177e4 3596{
c6d14c84 3597 rcu_read_unlock();
1da177e4
LT
3598}
3599
3600static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3601{
eeda3fd6 3602 const struct net_device_stats *stats = dev_get_stats(dev);
1da177e4 3603
2d13bafe 3604 seq_printf(seq, "%6s: %7lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
5a1b5898
RR
3605 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3606 dev->name, stats->rx_bytes, stats->rx_packets,
3607 stats->rx_errors,
3608 stats->rx_dropped + stats->rx_missed_errors,
3609 stats->rx_fifo_errors,
3610 stats->rx_length_errors + stats->rx_over_errors +
3611 stats->rx_crc_errors + stats->rx_frame_errors,
3612 stats->rx_compressed, stats->multicast,
3613 stats->tx_bytes, stats->tx_packets,
3614 stats->tx_errors, stats->tx_dropped,
3615 stats->tx_fifo_errors, stats->collisions,
3616 stats->tx_carrier_errors +
3617 stats->tx_aborted_errors +
3618 stats->tx_window_errors +
3619 stats->tx_heartbeat_errors,
3620 stats->tx_compressed);
1da177e4
LT
3621}
3622
3623/*
3624 * Called from the PROCfs module. This now uses the new arbitrary sized
3625 * /proc/net interface to create /proc/net/dev
3626 */
3627static int dev_seq_show(struct seq_file *seq, void *v)
3628{
3629 if (v == SEQ_START_TOKEN)
3630 seq_puts(seq, "Inter-| Receive "
3631 " | Transmit\n"
3632 " face |bytes packets errs drop fifo frame "
3633 "compressed multicast|bytes packets errs "
3634 "drop fifo colls carrier compressed\n");
3635 else
3636 dev_seq_printf_stats(seq, v);
3637 return 0;
3638}
3639
3640static struct netif_rx_stats *softnet_get_online(loff_t *pos)
3641{
3642 struct netif_rx_stats *rc = NULL;
3643
0c0b0aca 3644 while (*pos < nr_cpu_ids)
4ec93edb 3645 if (cpu_online(*pos)) {
1da177e4
LT
3646 rc = &per_cpu(netdev_rx_stat, *pos);
3647 break;
3648 } else
3649 ++*pos;
3650 return rc;
3651}
3652
3653static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3654{
3655 return softnet_get_online(pos);
3656}
3657
3658static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3659{
3660 ++*pos;
3661 return softnet_get_online(pos);
3662}
3663
3664static void softnet_seq_stop(struct seq_file *seq, void *v)
3665{
3666}
3667
3668static int softnet_seq_show(struct seq_file *seq, void *v)
3669{
3670 struct netif_rx_stats *s = v;
3671
0a9627f2 3672 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
31aa02c5 3673 s->total, s->dropped, s->time_squeeze, 0,
c1ebcdb8 3674 0, 0, 0, 0, /* was fastroute */
0a9627f2 3675 s->cpu_collision, s->received_rps);
1da177e4
LT
3676 return 0;
3677}
3678
f690808e 3679static const struct seq_operations dev_seq_ops = {
1da177e4
LT
3680 .start = dev_seq_start,
3681 .next = dev_seq_next,
3682 .stop = dev_seq_stop,
3683 .show = dev_seq_show,
3684};
3685
3686static int dev_seq_open(struct inode *inode, struct file *file)
3687{
e372c414
DL
3688 return seq_open_net(inode, file, &dev_seq_ops,
3689 sizeof(struct seq_net_private));
1da177e4
LT
3690}
3691
9a32144e 3692static const struct file_operations dev_seq_fops = {
1da177e4
LT
3693 .owner = THIS_MODULE,
3694 .open = dev_seq_open,
3695 .read = seq_read,
3696 .llseek = seq_lseek,
e372c414 3697 .release = seq_release_net,
1da177e4
LT
3698};
3699
f690808e 3700static const struct seq_operations softnet_seq_ops = {
1da177e4
LT
3701 .start = softnet_seq_start,
3702 .next = softnet_seq_next,
3703 .stop = softnet_seq_stop,
3704 .show = softnet_seq_show,
3705};
3706
3707static int softnet_seq_open(struct inode *inode, struct file *file)
3708{
3709 return seq_open(file, &softnet_seq_ops);
3710}
3711
9a32144e 3712static const struct file_operations softnet_seq_fops = {
1da177e4
LT
3713 .owner = THIS_MODULE,
3714 .open = softnet_seq_open,
3715 .read = seq_read,
3716 .llseek = seq_lseek,
3717 .release = seq_release,
3718};
3719
0e1256ff
SH
3720static void *ptype_get_idx(loff_t pos)
3721{
3722 struct packet_type *pt = NULL;
3723 loff_t i = 0;
3724 int t;
3725
3726 list_for_each_entry_rcu(pt, &ptype_all, list) {
3727 if (i == pos)
3728 return pt;
3729 ++i;
3730 }
3731
82d8a867 3732 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
0e1256ff
SH
3733 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3734 if (i == pos)
3735 return pt;
3736 ++i;
3737 }
3738 }
3739 return NULL;
3740}
3741
3742static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
72348a42 3743 __acquires(RCU)
0e1256ff
SH
3744{
3745 rcu_read_lock();
3746 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3747}
3748
3749static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3750{
3751 struct packet_type *pt;
3752 struct list_head *nxt;
3753 int hash;
3754
3755 ++*pos;
3756 if (v == SEQ_START_TOKEN)
3757 return ptype_get_idx(0);
3758
3759 pt = v;
3760 nxt = pt->list.next;
3761 if (pt->type == htons(ETH_P_ALL)) {
3762 if (nxt != &ptype_all)
3763 goto found;
3764 hash = 0;
3765 nxt = ptype_base[0].next;
3766 } else
82d8a867 3767 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
0e1256ff
SH
3768
3769 while (nxt == &ptype_base[hash]) {
82d8a867 3770 if (++hash >= PTYPE_HASH_SIZE)
0e1256ff
SH
3771 return NULL;
3772 nxt = ptype_base[hash].next;
3773 }
3774found:
3775 return list_entry(nxt, struct packet_type, list);
3776}
3777
3778static void ptype_seq_stop(struct seq_file *seq, void *v)
72348a42 3779 __releases(RCU)
0e1256ff
SH
3780{
3781 rcu_read_unlock();
3782}
3783
0e1256ff
SH
3784static int ptype_seq_show(struct seq_file *seq, void *v)
3785{
3786 struct packet_type *pt = v;
3787
3788 if (v == SEQ_START_TOKEN)
3789 seq_puts(seq, "Type Device Function\n");
c346dca1 3790 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
0e1256ff
SH
3791 if (pt->type == htons(ETH_P_ALL))
3792 seq_puts(seq, "ALL ");
3793 else
3794 seq_printf(seq, "%04x", ntohs(pt->type));
3795
908cd2da
AD
3796 seq_printf(seq, " %-8s %pF\n",
3797 pt->dev ? pt->dev->name : "", pt->func);
0e1256ff
SH
3798 }
3799
3800 return 0;
3801}
3802
3803static const struct seq_operations ptype_seq_ops = {
3804 .start = ptype_seq_start,
3805 .next = ptype_seq_next,
3806 .stop = ptype_seq_stop,
3807 .show = ptype_seq_show,
3808};
3809
3810static int ptype_seq_open(struct inode *inode, struct file *file)
3811{
2feb27db
PE
3812 return seq_open_net(inode, file, &ptype_seq_ops,
3813 sizeof(struct seq_net_private));
0e1256ff
SH
3814}
3815
3816static const struct file_operations ptype_seq_fops = {
3817 .owner = THIS_MODULE,
3818 .open = ptype_seq_open,
3819 .read = seq_read,
3820 .llseek = seq_lseek,
2feb27db 3821 .release = seq_release_net,
0e1256ff
SH
3822};
3823
3824
4665079c 3825static int __net_init dev_proc_net_init(struct net *net)
1da177e4
LT
3826{
3827 int rc = -ENOMEM;
3828
881d966b 3829 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
1da177e4 3830 goto out;
881d966b 3831 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
1da177e4 3832 goto out_dev;
881d966b 3833 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
457c4cbc 3834 goto out_softnet;
0e1256ff 3835
881d966b 3836 if (wext_proc_init(net))
457c4cbc 3837 goto out_ptype;
1da177e4
LT
3838 rc = 0;
3839out:
3840 return rc;
457c4cbc 3841out_ptype:
881d966b 3842 proc_net_remove(net, "ptype");
1da177e4 3843out_softnet:
881d966b 3844 proc_net_remove(net, "softnet_stat");
1da177e4 3845out_dev:
881d966b 3846 proc_net_remove(net, "dev");
1da177e4
LT
3847 goto out;
3848}
881d966b 3849
4665079c 3850static void __net_exit dev_proc_net_exit(struct net *net)
881d966b
EB
3851{
3852 wext_proc_exit(net);
3853
3854 proc_net_remove(net, "ptype");
3855 proc_net_remove(net, "softnet_stat");
3856 proc_net_remove(net, "dev");
3857}
3858
022cbae6 3859static struct pernet_operations __net_initdata dev_proc_ops = {
881d966b
EB
3860 .init = dev_proc_net_init,
3861 .exit = dev_proc_net_exit,
3862};
3863
3864static int __init dev_proc_init(void)
3865{
3866 return register_pernet_subsys(&dev_proc_ops);
3867}
1da177e4
LT
3868#else
3869#define dev_proc_init() 0
3870#endif /* CONFIG_PROC_FS */
3871
3872
3873/**
3874 * netdev_set_master - set up master/slave pair
3875 * @slave: slave device
3876 * @master: new master device
3877 *
3878 * Changes the master device of the slave. Pass %NULL to break the
3879 * bonding. The caller must hold the RTNL semaphore. On a failure
3880 * a negative errno code is returned. On success the reference counts
3881 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3882 * function returns zero.
3883 */
3884int netdev_set_master(struct net_device *slave, struct net_device *master)
3885{
3886 struct net_device *old = slave->master;
3887
3888 ASSERT_RTNL();
3889
3890 if (master) {
3891 if (old)
3892 return -EBUSY;
3893 dev_hold(master);
3894 }
3895
3896 slave->master = master;
4ec93edb 3897
283f2fe8
ED
3898 if (old) {
3899 synchronize_net();
1da177e4 3900 dev_put(old);
283f2fe8 3901 }
1da177e4
LT
3902 if (master)
3903 slave->flags |= IFF_SLAVE;
3904 else
3905 slave->flags &= ~IFF_SLAVE;
3906
3907 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3908 return 0;
3909}
d1b19dff 3910EXPORT_SYMBOL(netdev_set_master);
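/*
 * Illustrative sketch (not part of this file): a bonding-style driver that
 * already holds references to a hypothetical "bond_dev" and "slave_dev"
 * could pair them under the RTNL lock roughly like this, and later break
 * the pairing again by passing a NULL master.
 *
 *	rtnl_lock();
 *	err = netdev_set_master(slave_dev, bond_dev);
 *	rtnl_unlock();
 *	if (err)
 *		return err;
 *	...
 *	rtnl_lock();
 *	netdev_set_master(slave_dev, NULL);
 *	rtnl_unlock();
 */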
1da177e4 3911
b6c40d68
PM
3912static void dev_change_rx_flags(struct net_device *dev, int flags)
3913{
d314774c
SH
3914 const struct net_device_ops *ops = dev->netdev_ops;
3915
3916 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3917 ops->ndo_change_rx_flags(dev, flags);
b6c40d68
PM
3918}
3919
dad9b335 3920static int __dev_set_promiscuity(struct net_device *dev, int inc)
1da177e4
LT
3921{
3922 unsigned short old_flags = dev->flags;
8192b0c4
DH
3923 uid_t uid;
3924 gid_t gid;
1da177e4 3925
24023451
PM
3926 ASSERT_RTNL();
3927
dad9b335
WC
3928 dev->flags |= IFF_PROMISC;
3929 dev->promiscuity += inc;
3930 if (dev->promiscuity == 0) {
3931 /*
3932 * Avoid overflow.
3933 * If inc causes overflow, leave promiscuity untouched and return an error.
3934 */
3935 if (inc < 0)
3936 dev->flags &= ~IFF_PROMISC;
3937 else {
3938 dev->promiscuity -= inc;
3939 printk(KERN_WARNING "%s: promiscuity touches roof, "
3940 "set promiscuity failed, promiscuity feature "
3941 "of device might be broken.\n", dev->name);
3942 return -EOVERFLOW;
3943 }
3944 }
52609c0b 3945 if (dev->flags != old_flags) {
1da177e4
LT
3946 printk(KERN_INFO "device %s %s promiscuous mode\n",
3947 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4ec93edb 3948 "left");
8192b0c4
DH
3949 if (audit_enabled) {
3950 current_uid_gid(&uid, &gid);
7759db82
KHK
3951 audit_log(current->audit_context, GFP_ATOMIC,
3952 AUDIT_ANOM_PROMISCUOUS,
3953 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3954 dev->name, (dev->flags & IFF_PROMISC),
3955 (old_flags & IFF_PROMISC),
3956 audit_get_loginuid(current),
8192b0c4 3957 uid, gid,
7759db82 3958 audit_get_sessionid(current));
8192b0c4 3959 }
24023451 3960
b6c40d68 3961 dev_change_rx_flags(dev, IFF_PROMISC);
1da177e4 3962 }
dad9b335 3963 return 0;
1da177e4
LT
3964}
3965
4417da66
PM
3966/**
3967 * dev_set_promiscuity - update promiscuity count on a device
3968 * @dev: device
3969 * @inc: modifier
3970 *
3971 * Add or remove promiscuity from a device. While the count in the device
3972 * remains above zero the interface remains promiscuous. Once it hits zero
3973 * the device reverts back to normal filtering operation. A negative inc
3974 * value is used to drop promiscuity on the device.
dad9b335 3975 * Return 0 if successful or a negative errno code on error.
4417da66 3976 */
dad9b335 3977int dev_set_promiscuity(struct net_device *dev, int inc)
4417da66
PM
3978{
3979 unsigned short old_flags = dev->flags;
dad9b335 3980 int err;
4417da66 3981
dad9b335 3982 err = __dev_set_promiscuity(dev, inc);
4b5a698e 3983 if (err < 0)
dad9b335 3984 return err;
4417da66
PM
3985 if (dev->flags != old_flags)
3986 dev_set_rx_mode(dev);
dad9b335 3987 return err;
4417da66 3988}
d1b19dff 3989EXPORT_SYMBOL(dev_set_promiscuity);
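/*
 * Illustrative sketch (not part of this file): a capture-style user that
 * already holds a reference to "dev" could bump promiscuity while it is
 * active and drop it again when done; the RTNL lock must be held.
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);
 *	rtnl_unlock();
 *	...
 *	if (!err) {
 *		rtnl_lock();
 *		dev_set_promiscuity(dev, -1);
 *		rtnl_unlock();
 *	}
 */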
4417da66 3990
1da177e4
LT
3991/**
3992 * dev_set_allmulti - update allmulti count on a device
3993 * @dev: device
3994 * @inc: modifier
3995 *
3996 * Add or remove reception of all multicast frames to a device. While the
3997 * count in the device remains above zero the interface remains listening
3998 * to all multicast frames. Once it hits zero the device reverts back to normal
3999 * filtering operation. A negative @inc value is used to drop the counter
4000 * when releasing a resource needing all multicasts.
dad9b335 4001 * Return 0 if successful or a negative errno code on error.
1da177e4
LT
4002 */
4003
dad9b335 4004int dev_set_allmulti(struct net_device *dev, int inc)
1da177e4
LT
4005{
4006 unsigned short old_flags = dev->flags;
4007
24023451
PM
4008 ASSERT_RTNL();
4009
1da177e4 4010 dev->flags |= IFF_ALLMULTI;
dad9b335
WC
4011 dev->allmulti += inc;
4012 if (dev->allmulti == 0) {
4013 /*
4014 * Avoid overflow.
4015 * If inc causes overflow, leave allmulti untouched and return an error.
4016 */
4017 if (inc < 0)
4018 dev->flags &= ~IFF_ALLMULTI;
4019 else {
4020 dev->allmulti -= inc;
4021 printk(KERN_WARNING "%s: allmulti touches roof, "
4022 "set allmulti failed, allmulti feature of "
4023 "device might be broken.\n", dev->name);
4024 return -EOVERFLOW;
4025 }
4026 }
24023451 4027 if (dev->flags ^ old_flags) {
b6c40d68 4028 dev_change_rx_flags(dev, IFF_ALLMULTI);
4417da66 4029 dev_set_rx_mode(dev);
24023451 4030 }
dad9b335 4031 return 0;
4417da66 4032}
d1b19dff 4033EXPORT_SYMBOL(dev_set_allmulti);
4417da66
PM
4034
4035/*
4036 * Upload unicast and multicast address lists to device and
4037 * configure RX filtering. When the device doesn't support unicast
53ccaae1 4038 * filtering it is put in promiscuous mode while unicast addresses
4417da66
PM
4039 * are present.
4040 */
4041void __dev_set_rx_mode(struct net_device *dev)
4042{
d314774c
SH
4043 const struct net_device_ops *ops = dev->netdev_ops;
4044
4417da66
PM
4045 /* dev_open will call this function so the list will stay sane. */
4046 if (!(dev->flags&IFF_UP))
4047 return;
4048
4049 if (!netif_device_present(dev))
40b77c94 4050 return;
4417da66 4051
d314774c
SH
4052 if (ops->ndo_set_rx_mode)
4053 ops->ndo_set_rx_mode(dev);
4417da66
PM
4054 else {
4055 /* Unicast address changes may only happen under the rtnl,
4056 * therefore calling __dev_set_promiscuity here is safe.
4057 */
32e7bfc4 4058 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4417da66
PM
4059 __dev_set_promiscuity(dev, 1);
4060 dev->uc_promisc = 1;
32e7bfc4 4061 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4417da66
PM
4062 __dev_set_promiscuity(dev, -1);
4063 dev->uc_promisc = 0;
4064 }
4065
d314774c
SH
4066 if (ops->ndo_set_multicast_list)
4067 ops->ndo_set_multicast_list(dev);
4417da66
PM
4068 }
4069}
4070
4071void dev_set_rx_mode(struct net_device *dev)
4072{
b9e40857 4073 netif_addr_lock_bh(dev);
4417da66 4074 __dev_set_rx_mode(dev);
b9e40857 4075 netif_addr_unlock_bh(dev);
1da177e4
LT
4076}
4077
f0db275a
SH
4078/**
4079 * dev_get_flags - get flags reported to userspace
4080 * @dev: device
4081 *
4082 * Get the combination of flag bits exported through APIs to userspace.
4083 */
1da177e4
LT
4084unsigned dev_get_flags(const struct net_device *dev)
4085{
4086 unsigned flags;
4087
4088 flags = (dev->flags & ~(IFF_PROMISC |
4089 IFF_ALLMULTI |
b00055aa
SR
4090 IFF_RUNNING |
4091 IFF_LOWER_UP |
4092 IFF_DORMANT)) |
1da177e4
LT
4093 (dev->gflags & (IFF_PROMISC |
4094 IFF_ALLMULTI));
4095
b00055aa
SR
4096 if (netif_running(dev)) {
4097 if (netif_oper_up(dev))
4098 flags |= IFF_RUNNING;
4099 if (netif_carrier_ok(dev))
4100 flags |= IFF_LOWER_UP;
4101 if (netif_dormant(dev))
4102 flags |= IFF_DORMANT;
4103 }
1da177e4
LT
4104
4105 return flags;
4106}
d1b19dff 4107EXPORT_SYMBOL(dev_get_flags);
1da177e4 4108
bd380811 4109int __dev_change_flags(struct net_device *dev, unsigned int flags)
1da177e4 4110{
1da177e4 4111 int old_flags = dev->flags;
bd380811 4112 int ret;
1da177e4 4113
24023451
PM
4114 ASSERT_RTNL();
4115
1da177e4
LT
4116 /*
4117 * Set the flags on our device.
4118 */
4119
4120 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4121 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4122 IFF_AUTOMEDIA)) |
4123 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4124 IFF_ALLMULTI));
4125
4126 /*
4127 * Load in the correct multicast list now the flags have changed.
4128 */
4129
b6c40d68
PM
4130 if ((old_flags ^ flags) & IFF_MULTICAST)
4131 dev_change_rx_flags(dev, IFF_MULTICAST);
24023451 4132
4417da66 4133 dev_set_rx_mode(dev);
1da177e4
LT
4134
4135 /*
4136 * Have we downed the interface? We handle IFF_UP ourselves
4137 * according to user attempts to set it, rather than blindly
4138 * setting it.
4139 */
4140
4141 ret = 0;
4142 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
bd380811 4143 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
1da177e4
LT
4144
4145 if (!ret)
4417da66 4146 dev_set_rx_mode(dev);
1da177e4
LT
4147 }
4148
1da177e4 4149 if ((flags ^ dev->gflags) & IFF_PROMISC) {
d1b19dff
ED
4150 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4151
1da177e4
LT
4152 dev->gflags ^= IFF_PROMISC;
4153 dev_set_promiscuity(dev, inc);
4154 }
4155
4156 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4157 is important. Some (broken) drivers set IFF_PROMISC when
4158 IFF_ALLMULTI is requested, without asking us and without reporting it.
4159 */
4160 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
d1b19dff
ED
4161 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4162
1da177e4
LT
4163 dev->gflags ^= IFF_ALLMULTI;
4164 dev_set_allmulti(dev, inc);
4165 }
4166
bd380811
PM
4167 return ret;
4168}
4169
4170void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4171{
4172 unsigned int changes = dev->flags ^ old_flags;
4173
4174 if (changes & IFF_UP) {
4175 if (dev->flags & IFF_UP)
4176 call_netdevice_notifiers(NETDEV_UP, dev);
4177 else
4178 call_netdevice_notifiers(NETDEV_DOWN, dev);
4179 }
4180
4181 if (dev->flags & IFF_UP &&
4182 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4183 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4184}
4185
4186/**
4187 * dev_change_flags - change device settings
4188 * @dev: device
4189 * @flags: device state flags
4190 *
4191 * Change settings on a device based on the given state flags. The flags are
4192 * in the userspace exported format.
4193 */
4194int dev_change_flags(struct net_device *dev, unsigned flags)
4195{
4196 int ret, changes;
4197 int old_flags = dev->flags;
4198
4199 ret = __dev_change_flags(dev, flags);
4200 if (ret < 0)
4201 return ret;
4202
4203 changes = old_flags ^ dev->flags;
7c355f53
TG
4204 if (changes)
4205 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
1da177e4 4206
bd380811 4207 __dev_notify_flags(dev, old_flags);
1da177e4
LT
4208 return ret;
4209}
d1b19dff 4210EXPORT_SYMBOL(dev_change_flags);
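/*
 * Illustrative sketch (not part of this file): bringing an interface up
 * from kernel code, assuming "dev" is a net_device the caller already
 * holds; dev_change_flags() expects the RTNL lock to be held.
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);
 *	rtnl_unlock();
 */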
1da177e4 4211
f0db275a
SH
4212/**
4213 * dev_set_mtu - Change maximum transfer unit
4214 * @dev: device
4215 * @new_mtu: new transfer unit
4216 *
4217 * Change the maximum transfer size of the network device.
4218 */
1da177e4
LT
4219int dev_set_mtu(struct net_device *dev, int new_mtu)
4220{
d314774c 4221 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
4222 int err;
4223
4224 if (new_mtu == dev->mtu)
4225 return 0;
4226
4227 /* MTU must be positive. */
4228 if (new_mtu < 0)
4229 return -EINVAL;
4230
4231 if (!netif_device_present(dev))
4232 return -ENODEV;
4233
4234 err = 0;
d314774c
SH
4235 if (ops->ndo_change_mtu)
4236 err = ops->ndo_change_mtu(dev, new_mtu);
1da177e4
LT
4237 else
4238 dev->mtu = new_mtu;
d314774c 4239
1da177e4 4240 if (!err && dev->flags & IFF_UP)
056925ab 4241 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
1da177e4
LT
4242 return err;
4243}
d1b19dff 4244EXPORT_SYMBOL(dev_set_mtu);
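/*
 * Illustrative sketch (not part of this file): a caller that holds the
 * RTNL lock and a valid "dev" pointer could request a jumbo MTU like
 * this; 9000 is only an example value.
 *
 *	ASSERT_RTNL();
 *	err = dev_set_mtu(dev, 9000);
 *	if (err)
 *		printk(KERN_WARNING "%s: cannot set MTU to 9000: %d\n",
 *		       dev->name, err);
 */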
1da177e4 4245
f0db275a
SH
4246/**
4247 * dev_set_mac_address - Change Media Access Control Address
4248 * @dev: device
4249 * @sa: new address
4250 *
4251 * Change the hardware (MAC) address of the device
4252 */
1da177e4
LT
4253int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4254{
d314774c 4255 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
4256 int err;
4257
d314774c 4258 if (!ops->ndo_set_mac_address)
1da177e4
LT
4259 return -EOPNOTSUPP;
4260 if (sa->sa_family != dev->type)
4261 return -EINVAL;
4262 if (!netif_device_present(dev))
4263 return -ENODEV;
d314774c 4264 err = ops->ndo_set_mac_address(dev, sa);
1da177e4 4265 if (!err)
056925ab 4266 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
1da177e4
LT
4267 return err;
4268}
d1b19dff 4269EXPORT_SYMBOL(dev_set_mac_address);
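/*
 * Illustrative sketch (not part of this file): setting a new Ethernet MAC
 * address from kernel code under the RTNL lock, given a hypothetical
 * "new_mac" array of ETH_ALEN bytes.
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_mac, ETH_ALEN);
 *	rtnl_lock();
 *	err = dev_set_mac_address(dev, &sa);
 *	rtnl_unlock();
 */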
1da177e4
LT
4270
4271/*
3710becf 4272 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
1da177e4 4273 */
14e3e079 4274static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
1da177e4
LT
4275{
4276 int err;
3710becf 4277 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
1da177e4
LT
4278
4279 if (!dev)
4280 return -ENODEV;
4281
4282 switch (cmd) {
d1b19dff
ED
4283 case SIOCGIFFLAGS: /* Get interface flags */
4284 ifr->ifr_flags = (short) dev_get_flags(dev);
4285 return 0;
1da177e4 4286
d1b19dff
ED
4287 case SIOCGIFMETRIC: /* Get the metric on the interface
4288 (currently unused) */
4289 ifr->ifr_metric = 0;
4290 return 0;
1da177e4 4291
d1b19dff
ED
4292 case SIOCGIFMTU: /* Get the MTU of a device */
4293 ifr->ifr_mtu = dev->mtu;
4294 return 0;
1da177e4 4295
d1b19dff
ED
4296 case SIOCGIFHWADDR:
4297 if (!dev->addr_len)
4298 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4299 else
4300 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4301 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4302 ifr->ifr_hwaddr.sa_family = dev->type;
4303 return 0;
1da177e4 4304
d1b19dff
ED
4305 case SIOCGIFSLAVE:
4306 err = -EINVAL;
4307 break;
14e3e079 4308
d1b19dff
ED
4309 case SIOCGIFMAP:
4310 ifr->ifr_map.mem_start = dev->mem_start;
4311 ifr->ifr_map.mem_end = dev->mem_end;
4312 ifr->ifr_map.base_addr = dev->base_addr;
4313 ifr->ifr_map.irq = dev->irq;
4314 ifr->ifr_map.dma = dev->dma;
4315 ifr->ifr_map.port = dev->if_port;
4316 return 0;
14e3e079 4317
d1b19dff
ED
4318 case SIOCGIFINDEX:
4319 ifr->ifr_ifindex = dev->ifindex;
4320 return 0;
14e3e079 4321
d1b19dff
ED
4322 case SIOCGIFTXQLEN:
4323 ifr->ifr_qlen = dev->tx_queue_len;
4324 return 0;
14e3e079 4325
d1b19dff
ED
4326 default:
4327 /* dev_ioctl() should ensure this case
4328 * is never reached
4329 */
4330 WARN_ON(1);
4331 err = -EINVAL;
4332 break;
14e3e079
JG
4333
4334 }
4335 return err;
4336}
4337
4338/*
4339 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4340 */
4341static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4342{
4343 int err;
4344 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
5f2f6da7 4345 const struct net_device_ops *ops;
14e3e079
JG
4346
4347 if (!dev)
4348 return -ENODEV;
4349
5f2f6da7
JP
4350 ops = dev->netdev_ops;
4351
14e3e079 4352 switch (cmd) {
d1b19dff
ED
4353 case SIOCSIFFLAGS: /* Set interface flags */
4354 return dev_change_flags(dev, ifr->ifr_flags);
14e3e079 4355
d1b19dff
ED
4356 case SIOCSIFMETRIC: /* Set the metric on the interface
4357 (currently unused) */
4358 return -EOPNOTSUPP;
14e3e079 4359
d1b19dff
ED
4360 case SIOCSIFMTU: /* Set the MTU of a device */
4361 return dev_set_mtu(dev, ifr->ifr_mtu);
1da177e4 4362
d1b19dff
ED
4363 case SIOCSIFHWADDR:
4364 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
1da177e4 4365
d1b19dff
ED
4366 case SIOCSIFHWBROADCAST:
4367 if (ifr->ifr_hwaddr.sa_family != dev->type)
4368 return -EINVAL;
4369 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4370 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4371 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4372 return 0;
1da177e4 4373
d1b19dff
ED
4374 case SIOCSIFMAP:
4375 if (ops->ndo_set_config) {
1da177e4
LT
4376 if (!netif_device_present(dev))
4377 return -ENODEV;
d1b19dff
ED
4378 return ops->ndo_set_config(dev, &ifr->ifr_map);
4379 }
4380 return -EOPNOTSUPP;
1da177e4 4381
d1b19dff
ED
4382 case SIOCADDMULTI:
4383 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4384 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4385 return -EINVAL;
4386 if (!netif_device_present(dev))
4387 return -ENODEV;
22bedad3 4388 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
d1b19dff
ED
4389
4390 case SIOCDELMULTI:
4391 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4392 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4393 return -EINVAL;
4394 if (!netif_device_present(dev))
4395 return -ENODEV;
22bedad3 4396 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
1da177e4 4397
d1b19dff
ED
4398 case SIOCSIFTXQLEN:
4399 if (ifr->ifr_qlen < 0)
4400 return -EINVAL;
4401 dev->tx_queue_len = ifr->ifr_qlen;
4402 return 0;
1da177e4 4403
d1b19dff
ED
4404 case SIOCSIFNAME:
4405 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4406 return dev_change_name(dev, ifr->ifr_newname);
1da177e4 4407
d1b19dff
ED
4408 /*
4409 * Unknown or private ioctl
4410 */
4411 default:
4412 if ((cmd >= SIOCDEVPRIVATE &&
4413 cmd <= SIOCDEVPRIVATE + 15) ||
4414 cmd == SIOCBONDENSLAVE ||
4415 cmd == SIOCBONDRELEASE ||
4416 cmd == SIOCBONDSETHWADDR ||
4417 cmd == SIOCBONDSLAVEINFOQUERY ||
4418 cmd == SIOCBONDINFOQUERY ||
4419 cmd == SIOCBONDCHANGEACTIVE ||
4420 cmd == SIOCGMIIPHY ||
4421 cmd == SIOCGMIIREG ||
4422 cmd == SIOCSMIIREG ||
4423 cmd == SIOCBRADDIF ||
4424 cmd == SIOCBRDELIF ||
4425 cmd == SIOCSHWTSTAMP ||
4426 cmd == SIOCWANDEV) {
4427 err = -EOPNOTSUPP;
4428 if (ops->ndo_do_ioctl) {
4429 if (netif_device_present(dev))
4430 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4431 else
4432 err = -ENODEV;
4433 }
4434 } else
4435 err = -EINVAL;
1da177e4
LT
4436
4437 }
4438 return err;
4439}
4440
4441/*
4442 * This function handles all "interface"-type I/O control requests. The actual
4443 * 'doing' part of this is dev_ifsioc above.
4444 */
4445
4446/**
4447 * dev_ioctl - network device ioctl
c4ea43c5 4448 * @net: the applicable net namespace
1da177e4
LT
4449 * @cmd: command to issue
4450 * @arg: pointer to a struct ifreq in user space
4451 *
4452 * Issue ioctl functions to devices. This is normally called by the
4453 * user space syscall interfaces but can sometimes be useful for
4454 * other purposes. The return value is the return from the syscall if
4455 * positive or a negative errno code on error.
4456 */
4457
881d966b 4458int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1da177e4
LT
4459{
4460 struct ifreq ifr;
4461 int ret;
4462 char *colon;
4463
4464 /* One special case: SIOCGIFCONF takes ifconf argument
4465 and requires shared lock, because it sleeps writing
4466 to user space.
4467 */
4468
4469 if (cmd == SIOCGIFCONF) {
6756ae4b 4470 rtnl_lock();
881d966b 4471 ret = dev_ifconf(net, (char __user *) arg);
6756ae4b 4472 rtnl_unlock();
1da177e4
LT
4473 return ret;
4474 }
4475 if (cmd == SIOCGIFNAME)
881d966b 4476 return dev_ifname(net, (struct ifreq __user *)arg);
1da177e4
LT
4477
4478 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4479 return -EFAULT;
4480
4481 ifr.ifr_name[IFNAMSIZ-1] = 0;
4482
4483 colon = strchr(ifr.ifr_name, ':');
4484 if (colon)
4485 *colon = 0;
4486
4487 /*
4488 * See which interface the caller is talking about.
4489 */
4490
4491 switch (cmd) {
d1b19dff
ED
4492 /*
4493 * These ioctl calls:
4494 * - can be done by all.
4495 * - atomic and do not require locking.
4496 * - return a value
4497 */
4498 case SIOCGIFFLAGS:
4499 case SIOCGIFMETRIC:
4500 case SIOCGIFMTU:
4501 case SIOCGIFHWADDR:
4502 case SIOCGIFSLAVE:
4503 case SIOCGIFMAP:
4504 case SIOCGIFINDEX:
4505 case SIOCGIFTXQLEN:
4506 dev_load(net, ifr.ifr_name);
3710becf 4507 rcu_read_lock();
d1b19dff 4508 ret = dev_ifsioc_locked(net, &ifr, cmd);
3710becf 4509 rcu_read_unlock();
d1b19dff
ED
4510 if (!ret) {
4511 if (colon)
4512 *colon = ':';
4513 if (copy_to_user(arg, &ifr,
4514 sizeof(struct ifreq)))
4515 ret = -EFAULT;
4516 }
4517 return ret;
1da177e4 4518
d1b19dff
ED
4519 case SIOCETHTOOL:
4520 dev_load(net, ifr.ifr_name);
4521 rtnl_lock();
4522 ret = dev_ethtool(net, &ifr);
4523 rtnl_unlock();
4524 if (!ret) {
4525 if (colon)
4526 *colon = ':';
4527 if (copy_to_user(arg, &ifr,
4528 sizeof(struct ifreq)))
4529 ret = -EFAULT;
4530 }
4531 return ret;
1da177e4 4532
d1b19dff
ED
4533 /*
4534 * These ioctl calls:
4535 * - require superuser power.
4536 * - require strict serialization.
4537 * - return a value
4538 */
4539 case SIOCGMIIPHY:
4540 case SIOCGMIIREG:
4541 case SIOCSIFNAME:
4542 if (!capable(CAP_NET_ADMIN))
4543 return -EPERM;
4544 dev_load(net, ifr.ifr_name);
4545 rtnl_lock();
4546 ret = dev_ifsioc(net, &ifr, cmd);
4547 rtnl_unlock();
4548 if (!ret) {
4549 if (colon)
4550 *colon = ':';
4551 if (copy_to_user(arg, &ifr,
4552 sizeof(struct ifreq)))
4553 ret = -EFAULT;
4554 }
4555 return ret;
1da177e4 4556
d1b19dff
ED
4557 /*
4558 * These ioctl calls:
4559 * - require superuser power.
4560 * - require strict serialization.
4561 * - do not return a value
4562 */
4563 case SIOCSIFFLAGS:
4564 case SIOCSIFMETRIC:
4565 case SIOCSIFMTU:
4566 case SIOCSIFMAP:
4567 case SIOCSIFHWADDR:
4568 case SIOCSIFSLAVE:
4569 case SIOCADDMULTI:
4570 case SIOCDELMULTI:
4571 case SIOCSIFHWBROADCAST:
4572 case SIOCSIFTXQLEN:
4573 case SIOCSMIIREG:
4574 case SIOCBONDENSLAVE:
4575 case SIOCBONDRELEASE:
4576 case SIOCBONDSETHWADDR:
4577 case SIOCBONDCHANGEACTIVE:
4578 case SIOCBRADDIF:
4579 case SIOCBRDELIF:
4580 case SIOCSHWTSTAMP:
4581 if (!capable(CAP_NET_ADMIN))
4582 return -EPERM;
4583 /* fall through */
4584 case SIOCBONDSLAVEINFOQUERY:
4585 case SIOCBONDINFOQUERY:
4586 dev_load(net, ifr.ifr_name);
4587 rtnl_lock();
4588 ret = dev_ifsioc(net, &ifr, cmd);
4589 rtnl_unlock();
4590 return ret;
4591
4592 case SIOCGIFMEM:
4593 /* Get the per device memory space. We can add this but
4594 * currently do not support it */
4595 case SIOCSIFMEM:
4596 /* Set the per device memory buffer space.
4597 * Not applicable in our case */
4598 case SIOCSIFLINK:
4599 return -EINVAL;
4600
4601 /*
4602 * Unknown or private ioctl.
4603 */
4604 default:
4605 if (cmd == SIOCWANDEV ||
4606 (cmd >= SIOCDEVPRIVATE &&
4607 cmd <= SIOCDEVPRIVATE + 15)) {
881d966b 4608 dev_load(net, ifr.ifr_name);
1da177e4 4609 rtnl_lock();
881d966b 4610 ret = dev_ifsioc(net, &ifr, cmd);
1da177e4 4611 rtnl_unlock();
d1b19dff
ED
4612 if (!ret && copy_to_user(arg, &ifr,
4613 sizeof(struct ifreq)))
4614 ret = -EFAULT;
1da177e4 4615 return ret;
d1b19dff
ED
4616 }
4617 /* Take care of Wireless Extensions */
4618 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4619 return wext_handle_ioctl(net, &ifr, cmd, arg);
4620 return -EINVAL;
1da177e4
LT
4621 }
4622}
4623
4624
4625/**
4626 * dev_new_index - allocate an ifindex
c4ea43c5 4627 * @net: the applicable net namespace
1da177e4
LT
4628 *
4629 * Returns a suitable unique value for a new device interface
4630 * number. The caller must hold the rtnl semaphore or the
4631 * dev_base_lock to be sure it remains unique.
4632 */
881d966b 4633static int dev_new_index(struct net *net)
1da177e4
LT
4634{
4635 static int ifindex;
4636 for (;;) {
4637 if (++ifindex <= 0)
4638 ifindex = 1;
881d966b 4639 if (!__dev_get_by_index(net, ifindex))
1da177e4
LT
4640 return ifindex;
4641 }
4642}
4643
1da177e4 4644/* Delayed registration/unregistration */
3b5b34fd 4645static LIST_HEAD(net_todo_list);
1da177e4 4646
6f05f629 4647static void net_set_todo(struct net_device *dev)
1da177e4 4648{
1da177e4 4649 list_add_tail(&dev->todo_list, &net_todo_list);
1da177e4
LT
4650}
4651
9b5e383c 4652static void rollback_registered_many(struct list_head *head)
93ee31f1 4653{
e93737b0 4654 struct net_device *dev, *tmp;
9b5e383c 4655
93ee31f1
DL
4656 BUG_ON(dev_boot_phase);
4657 ASSERT_RTNL();
4658
e93737b0 4659 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
9b5e383c 4660 /* Some devices call this without ever having registered,
e93737b0
KK
4661 * as part of unwinding a failed initialization. Remove those
4662 * devices and proceed with the remaining ones.
9b5e383c
ED
4663 */
4664 if (dev->reg_state == NETREG_UNINITIALIZED) {
4665 pr_debug("unregister_netdevice: device %s/%p never "
4666 "was registered\n", dev->name, dev);
93ee31f1 4667
9b5e383c 4668 WARN_ON(1);
e93737b0
KK
4669 list_del(&dev->unreg_list);
4670 continue;
9b5e383c 4671 }
93ee31f1 4672
9b5e383c 4673 BUG_ON(dev->reg_state != NETREG_REGISTERED);
93ee31f1 4674
9b5e383c
ED
4675 /* If device is running, close it first. */
4676 dev_close(dev);
93ee31f1 4677
9b5e383c
ED
4678 /* And unlink it from device chain. */
4679 unlist_netdevice(dev);
93ee31f1 4680
9b5e383c
ED
4681 dev->reg_state = NETREG_UNREGISTERING;
4682 }
93ee31f1
DL
4683
4684 synchronize_net();
4685
9b5e383c
ED
4686 list_for_each_entry(dev, head, unreg_list) {
4687 /* Shutdown queueing discipline. */
4688 dev_shutdown(dev);
93ee31f1
DL
4689
4690
9b5e383c
ED
4691 /* Notify protocols that we are about to destroy
4692 this device. They should clean all the things.
4693 */
4694 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
93ee31f1 4695
a2835763
PM
4696 if (!dev->rtnl_link_ops ||
4697 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4698 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4699
9b5e383c
ED
4700 /*
4701 * Flush the unicast and multicast chains
4702 */
a748ee24 4703 dev_uc_flush(dev);
22bedad3 4704 dev_mc_flush(dev);
93ee31f1 4705
9b5e383c
ED
4706 if (dev->netdev_ops->ndo_uninit)
4707 dev->netdev_ops->ndo_uninit(dev);
93ee31f1 4708
9b5e383c
ED
4709 /* Notifier chain MUST detach us from master device. */
4710 WARN_ON(dev->master);
93ee31f1 4711
9b5e383c
ED
4712 /* Remove entries from kobject tree */
4713 netdev_unregister_kobject(dev);
4714 }
93ee31f1 4715
a5ee1551 4716 /* Process any work delayed until the end of the batch */
e5e26d75 4717 dev = list_first_entry(head, struct net_device, unreg_list);
a5ee1551 4718 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
93ee31f1 4719
a5ee1551 4720 synchronize_net();
395264d5 4721
a5ee1551 4722 list_for_each_entry(dev, head, unreg_list)
9b5e383c
ED
4723 dev_put(dev);
4724}
4725
4726static void rollback_registered(struct net_device *dev)
4727{
4728 LIST_HEAD(single);
4729
4730 list_add(&dev->unreg_list, &single);
4731 rollback_registered_many(&single);
93ee31f1
DL
4732}
4733
e8a0464c
DM
4734static void __netdev_init_queue_locks_one(struct net_device *dev,
4735 struct netdev_queue *dev_queue,
4736 void *_unused)
c773e847
DM
4737{
4738 spin_lock_init(&dev_queue->_xmit_lock);
cf508b12 4739 netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
c773e847
DM
4740 dev_queue->xmit_lock_owner = -1;
4741}
4742
4743static void netdev_init_queue_locks(struct net_device *dev)
4744{
e8a0464c
DM
4745 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4746 __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
c773e847
DM
4747}
4748
b63365a2
HX
4749unsigned long netdev_fix_features(unsigned long features, const char *name)
4750{
4751 /* Fix illegal SG+CSUM combinations. */
4752 if ((features & NETIF_F_SG) &&
4753 !(features & NETIF_F_ALL_CSUM)) {
4754 if (name)
4755 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4756 "checksum feature.\n", name);
4757 features &= ~NETIF_F_SG;
4758 }
4759
4760 /* TSO requires that SG is present as well. */
4761 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4762 if (name)
4763 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4764 "SG feature.\n", name);
4765 features &= ~NETIF_F_TSO;
4766 }
4767
4768 if (features & NETIF_F_UFO) {
4769 if (!(features & NETIF_F_GEN_CSUM)) {
4770 if (name)
4771 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4772 "since no NETIF_F_HW_CSUM feature.\n",
4773 name);
4774 features &= ~NETIF_F_UFO;
4775 }
4776
4777 if (!(features & NETIF_F_SG)) {
4778 if (name)
4779 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4780 "since no NETIF_F_SG feature.\n", name);
4781 features &= ~NETIF_F_UFO;
4782 }
4783 }
4784
4785 return features;
4786}
4787EXPORT_SYMBOL(netdev_fix_features);
4788
fc4a7489
PM
4789/**
4790 * netif_stacked_transfer_operstate - transfer operstate
4791 * @rootdev: the root or lower level device to transfer state from
4792 * @dev: the device to transfer operstate to
4793 *
4794 * Transfer operational state from root to device. This is normally
4795 * called when a stacking relationship exists between the root
4796 * device and the device (a leaf device).
4797 */
4798void netif_stacked_transfer_operstate(const struct net_device *rootdev,
4799 struct net_device *dev)
4800{
4801 if (rootdev->operstate == IF_OPER_DORMANT)
4802 netif_dormant_on(dev);
4803 else
4804 netif_dormant_off(dev);
4805
4806 if (netif_carrier_ok(rootdev)) {
4807 if (!netif_carrier_ok(dev))
4808 netif_carrier_on(dev);
4809 } else {
4810 if (netif_carrier_ok(dev))
4811 netif_carrier_off(dev);
4812 }
4813}
4814EXPORT_SYMBOL(netif_stacked_transfer_operstate);
4815
1da177e4
LT
4816/**
4817 * register_netdevice - register a network device
4818 * @dev: device to register
4819 *
4820 * Take a completed network device structure and add it to the kernel
4821 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4822 * chain. 0 is returned on success. A negative errno code is returned
4823 * on a failure to set up the device, or if the name is a duplicate.
4824 *
4825 * Callers must hold the rtnl semaphore. You may want
4826 * register_netdev() instead of this.
4827 *
4828 * BUGS:
4829 * The locking appears insufficient to guarantee two parallel registers
4830 * will not get the same name.
4831 */
4832
4833int register_netdevice(struct net_device *dev)
4834{
1da177e4 4835 int ret;
d314774c 4836 struct net *net = dev_net(dev);
1da177e4
LT
4837
4838 BUG_ON(dev_boot_phase);
4839 ASSERT_RTNL();
4840
b17a7c17
SH
4841 might_sleep();
4842
1da177e4
LT
4843 /* When net_device's are persistent, this will be fatal. */
4844 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
d314774c 4845 BUG_ON(!net);
1da177e4 4846
f1f28aa3 4847 spin_lock_init(&dev->addr_list_lock);
cf508b12 4848 netdev_set_addr_lockdep_class(dev);
c773e847 4849 netdev_init_queue_locks(dev);
1da177e4 4850
1da177e4
LT
4851 dev->iflink = -1;
4852
df334545 4853#ifdef CONFIG_RPS
0a9627f2
TH
4854 if (!dev->num_rx_queues) {
4855 /*
4856 * Allocate a single RX queue if driver never called
4857 * alloc_netdev_mq
4858 */
4859
4860 dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
4861 if (!dev->_rx) {
4862 ret = -ENOMEM;
4863 goto out;
4864 }
4865
4866 dev->_rx->first = dev->_rx;
4867 atomic_set(&dev->_rx->count, 1);
4868 dev->num_rx_queues = 1;
4869 }
df334545 4870#endif
1da177e4 4871 /* Init, if this function is available */
d314774c
SH
4872 if (dev->netdev_ops->ndo_init) {
4873 ret = dev->netdev_ops->ndo_init(dev);
1da177e4
LT
4874 if (ret) {
4875 if (ret > 0)
4876 ret = -EIO;
90833aa4 4877 goto out;
1da177e4
LT
4878 }
4879 }
4ec93edb 4880
d9031024
OP
4881 ret = dev_get_valid_name(net, dev->name, dev->name, 0);
4882 if (ret)
7ce1b0ed 4883 goto err_uninit;
1da177e4 4884
881d966b 4885 dev->ifindex = dev_new_index(net);
1da177e4
LT
4886 if (dev->iflink == -1)
4887 dev->iflink = dev->ifindex;
4888
d212f87b
SH
4889 /* Fix illegal checksum combinations */
4890 if ((dev->features & NETIF_F_HW_CSUM) &&
4891 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4892 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4893 dev->name);
4894 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4895 }
4896
4897 if ((dev->features & NETIF_F_NO_CSUM) &&
4898 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4899 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4900 dev->name);
4901 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4902 }
4903
b63365a2 4904 dev->features = netdev_fix_features(dev->features, dev->name);
1da177e4 4905
e5a4a72d
LB
4906 /* Enable software GSO if SG is supported. */
4907 if (dev->features & NETIF_F_SG)
4908 dev->features |= NETIF_F_GSO;
4909
aaf8cdc3 4910 netdev_initialize_kobject(dev);
7ffbe3fd
JB
4911
4912 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
4913 ret = notifier_to_errno(ret);
4914 if (ret)
4915 goto err_uninit;
4916
8b41d188 4917 ret = netdev_register_kobject(dev);
b17a7c17 4918 if (ret)
7ce1b0ed 4919 goto err_uninit;
b17a7c17
SH
4920 dev->reg_state = NETREG_REGISTERED;
4921
1da177e4
LT
4922 /*
4923 * Default initial state at registry is that the
4924 * device is present.
4925 */
4926
4927 set_bit(__LINK_STATE_PRESENT, &dev->state);
4928
1da177e4 4929 dev_init_scheduler(dev);
1da177e4 4930 dev_hold(dev);
ce286d32 4931 list_netdevice(dev);
1da177e4
LT
4932
4933 /* Notify protocols that a new device appeared. */
056925ab 4934 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
fcc5a03a 4935 ret = notifier_to_errno(ret);
93ee31f1
DL
4936 if (ret) {
4937 rollback_registered(dev);
4938 dev->reg_state = NETREG_UNREGISTERED;
4939 }
d90a909e
EB
4940 /*
4941 * Prevent userspace races by waiting until the network
4942 * device is fully set up before sending notifications.
4943 */
a2835763
PM
4944 if (!dev->rtnl_link_ops ||
4945 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4946 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
1da177e4
LT
4947
4948out:
4949 return ret;
7ce1b0ed
HX
4950
4951err_uninit:
d314774c
SH
4952 if (dev->netdev_ops->ndo_uninit)
4953 dev->netdev_ops->ndo_uninit(dev);
7ce1b0ed 4954 goto out;
1da177e4 4955}
d1b19dff 4956EXPORT_SYMBOL(register_netdevice);
1da177e4 4957
937f1ba5
BH
4958/**
4959 * init_dummy_netdev - init a dummy network device for NAPI
4960 * @dev: device to init
4961 *
4962 * This takes a network device structure and initializes the minimum
4963 * amount of fields so it can be used to schedule NAPI polls without
4964 * registering a full blown interface. This is to be used by drivers
4965 * that need to tie several hardware interfaces to a single NAPI
4966 * poll scheduler due to HW limitations.
4967 */
4968int init_dummy_netdev(struct net_device *dev)
4969{
4970 /* Clear everything. Note we don't initialize spinlocks
4971 * as they aren't supposed to be taken by any of the
4972 * NAPI code and this dummy netdev is supposed to be
4973 * only ever used for NAPI polls
4974 */
4975 memset(dev, 0, sizeof(struct net_device));
4976
4977 /* make sure we BUG if trying to hit standard
4978 * register/unregister code path
4979 */
4980 dev->reg_state = NETREG_DUMMY;
4981
4982 /* initialize the ref count */
4983 atomic_set(&dev->refcnt, 1);
4984
4985 /* NAPI wants this */
4986 INIT_LIST_HEAD(&dev->napi_list);
4987
4988 /* a dummy interface is started by default */
4989 set_bit(__LINK_STATE_PRESENT, &dev->state);
4990 set_bit(__LINK_STATE_START, &dev->state);
4991
4992 return 0;
4993}
4994EXPORT_SYMBOL_GPL(init_dummy_netdev);
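/*
 * Illustrative sketch (not part of this file): a driver that needs one
 * NAPI context per hardware channel might embed a dummy netdev purely to
 * anchor its NAPI structs; "priv" and "mydrv_poll" are hypothetical.
 *
 *	init_dummy_netdev(&priv->napi_dev);
 *	netif_napi_add(&priv->napi_dev, &priv->napi, mydrv_poll, 64);
 *	napi_enable(&priv->napi);
 */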
4995
4996
1da177e4
LT
4997/**
4998 * register_netdev - register a network device
4999 * @dev: device to register
5000 *
5001 * Take a completed network device structure and add it to the kernel
5002 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5003 * chain. 0 is returned on success. A negative errno code is returned
5004 * on a failure to set up the device, or if the name is a duplicate.
5005 *
38b4da38 5006 * This is a wrapper around register_netdevice that takes the rtnl semaphore
1da177e4
LT
5007 * and expands the device name if you passed a format string to
5008 * alloc_netdev.
5009 */
5010int register_netdev(struct net_device *dev)
5011{
5012 int err;
5013
5014 rtnl_lock();
5015
5016 /*
5017 * If the name is a format string the caller wants us to do a
5018 * name allocation.
5019 */
5020 if (strchr(dev->name, '%')) {
5021 err = dev_alloc_name(dev, dev->name);
5022 if (err < 0)
5023 goto out;
5024 }
4ec93edb 5025
1da177e4
LT
5026 err = register_netdevice(dev);
5027out:
5028 rtnl_unlock();
5029 return err;
5030}
5031EXPORT_SYMBOL(register_netdev);
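/*
 * Illustrative sketch (not part of this file): typical driver probe code
 * pairing alloc_netdev_mq() with register_netdev(); "mydrv_setup" and the
 * private structure are hypothetical.
 *
 *	dev = alloc_netdev_mq(sizeof(struct mydrv_priv), "eth%d",
 *			      mydrv_setup, 4);
 *	if (!dev)
 *		return -ENOMEM;
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */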
5032
5033/*
5034 * netdev_wait_allrefs - wait until all references are gone.
5035 *
5036 * This is called when unregistering network devices.
5037 *
5038 * Any protocol or device that holds a reference should register
5039 * for netdevice notification, and clean up and put back the
5040 * reference if they receive an UNREGISTER event.
5041 * We can get stuck here if buggy protocols don't correctly
4ec93edb 5042 * call dev_put.
1da177e4
LT
5043 */
5044static void netdev_wait_allrefs(struct net_device *dev)
5045{
5046 unsigned long rebroadcast_time, warning_time;
5047
e014debe
ED
5048 linkwatch_forget_dev(dev);
5049
1da177e4
LT
5050 rebroadcast_time = warning_time = jiffies;
5051 while (atomic_read(&dev->refcnt) != 0) {
5052 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6756ae4b 5053 rtnl_lock();
1da177e4
LT
5054
5055 /* Rebroadcast unregister notification */
056925ab 5056 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
a5ee1551 5057 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
395264d5 5058 * should have already handled it the first time */
1da177e4
LT
5059
5060 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5061 &dev->state)) {
5062 /* We must not have linkwatch events
5063 * pending on unregister. If this
5064 * happens, we simply run the queue
5065 * unscheduled, resulting in a noop
5066 * for this device.
5067 */
5068 linkwatch_run_queue();
5069 }
5070
6756ae4b 5071 __rtnl_unlock();
1da177e4
LT
5072
5073 rebroadcast_time = jiffies;
5074 }
5075
5076 msleep(250);
5077
5078 if (time_after(jiffies, warning_time + 10 * HZ)) {
5079 printk(KERN_EMERG "unregister_netdevice: "
5080 "waiting for %s to become free. Usage "
5081 "count = %d\n",
5082 dev->name, atomic_read(&dev->refcnt));
5083 warning_time = jiffies;
5084 }
5085 }
5086}
5087
5088/* The sequence is:
5089 *
5090 * rtnl_lock();
5091 * ...
5092 * register_netdevice(x1);
5093 * register_netdevice(x2);
5094 * ...
5095 * unregister_netdevice(y1);
5096 * unregister_netdevice(y2);
5097 * ...
5098 * rtnl_unlock();
5099 * free_netdev(y1);
5100 * free_netdev(y2);
5101 *
58ec3b4d 5102 * We are invoked by rtnl_unlock().
1da177e4 5103 * This allows us to deal with problems:
b17a7c17 5104 * 1) We can delete sysfs objects which invoke hotplug
1da177e4
LT
5105 * without deadlocking with linkwatch via keventd.
5106 * 2) Since we run with the RTNL semaphore not held, we can sleep
5107 * safely in order to wait for the netdev refcnt to drop to zero.
58ec3b4d
HX
5108 *
5109 * We must not return until all unregister events added during
5110 * the interval the lock was held have been completed.
1da177e4 5111 */
1da177e4
LT
5112void netdev_run_todo(void)
5113{
626ab0e6 5114 struct list_head list;
1da177e4 5115
1da177e4 5116 /* Snapshot list, allow later requests */
626ab0e6 5117 list_replace_init(&net_todo_list, &list);
58ec3b4d
HX
5118
5119 __rtnl_unlock();
626ab0e6 5120
1da177e4
LT
5121 while (!list_empty(&list)) {
5122 struct net_device *dev
e5e26d75 5123 = list_first_entry(&list, struct net_device, todo_list);
1da177e4
LT
5124 list_del(&dev->todo_list);
5125
b17a7c17
SH
5126 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5127 printk(KERN_ERR "network todo '%s' but state %d\n",
5128 dev->name, dev->reg_state);
5129 dump_stack();
5130 continue;
5131 }
1da177e4 5132
b17a7c17 5133 dev->reg_state = NETREG_UNREGISTERED;
1da177e4 5134
152102c7 5135 on_each_cpu(flush_backlog, dev, 1);
6e583ce5 5136
b17a7c17 5137 netdev_wait_allrefs(dev);
1da177e4 5138
b17a7c17
SH
5139 /* paranoia */
5140 BUG_ON(atomic_read(&dev->refcnt));
547b792c
IJ
5141 WARN_ON(dev->ip_ptr);
5142 WARN_ON(dev->ip6_ptr);
5143 WARN_ON(dev->dn_ptr);
1da177e4 5144
b17a7c17
SH
5145 if (dev->destructor)
5146 dev->destructor(dev);
9093bbb2
SH
5147
5148 /* Free network device */
5149 kobject_put(&dev->dev.kobj);
1da177e4 5150 }
1da177e4
LT
5151}
5152
d83345ad
ED
5153/**
5154 * dev_txq_stats_fold - fold tx_queues stats
5155 * @dev: device to get statistics from
5156 * @stats: struct net_device_stats to hold results
5157 */
5158void dev_txq_stats_fold(const struct net_device *dev,
5159 struct net_device_stats *stats)
5160{
5161 unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5162 unsigned int i;
5163 struct netdev_queue *txq;
5164
5165 for (i = 0; i < dev->num_tx_queues; i++) {
5166 txq = netdev_get_tx_queue(dev, i);
5167 tx_bytes += txq->tx_bytes;
5168 tx_packets += txq->tx_packets;
5169 tx_dropped += txq->tx_dropped;
5170 }
5171 if (tx_bytes || tx_packets || tx_dropped) {
5172 stats->tx_bytes = tx_bytes;
5173 stats->tx_packets = tx_packets;
5174 stats->tx_dropped = tx_dropped;
5175 }
5176}
5177EXPORT_SYMBOL(dev_txq_stats_fold);
5178
eeda3fd6
SH
5179/**
5180 * dev_get_stats - get network device statistics
5181 * @dev: device to get statistics from
5182 *
5183 * Get network statistics from device. The device driver may provide
5184 * its own method by setting dev->netdev_ops->get_stats; otherwise
5185 * the internal statistics structure is used.
5186 */
5187const struct net_device_stats *dev_get_stats(struct net_device *dev)
7004bf25 5188{
eeda3fd6
SH
5189 const struct net_device_ops *ops = dev->netdev_ops;
5190
5191 if (ops->ndo_get_stats)
5192 return ops->ndo_get_stats(dev);
d83345ad
ED
5193
5194 dev_txq_stats_fold(dev, &dev->stats);
5195 return &dev->stats;
c45d286e 5196}
eeda3fd6 5197EXPORT_SYMBOL(dev_get_stats);
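/*
 * Illustrative sketch (not part of this file): reading the folded
 * statistics of a device the caller already holds; the returned pointer
 * is const and must not be written to.
 *
 *	const struct net_device_stats *stats = dev_get_stats(dev);
 *
 *	printk(KERN_DEBUG "%s: %lu rx packets, %lu tx packets\n",
 *	       dev->name, stats->rx_packets, stats->tx_packets);
 */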
c45d286e 5198
dc2b4847 5199static void netdev_init_one_queue(struct net_device *dev,
e8a0464c
DM
5200 struct netdev_queue *queue,
5201 void *_unused)
dc2b4847 5202{
dc2b4847
DM
5203 queue->dev = dev;
5204}
5205
bb949fbd
DM
5206static void netdev_init_queues(struct net_device *dev)
5207{
e8a0464c
DM
5208 netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5209 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
c3f26a26 5210 spin_lock_init(&dev->tx_global_lock);
bb949fbd
DM
5211}
5212
1da177e4 5213/**
f25f4e44 5214 * alloc_netdev_mq - allocate network device
1da177e4
LT
5215 * @sizeof_priv: size of private data to allocate space for
5216 * @name: device name format string
5217 * @setup: callback to initialize device
f25f4e44 5218 * @queue_count: the number of subqueues to allocate
1da177e4
LT
5219 *
5220 * Allocates a struct net_device with private data area for driver use
f25f4e44
PWJ
5221 * and performs basic initialization. Also allocates subqueue structs
5222 * for each queue on the device at the end of the netdevice.
1da177e4 5223 */
f25f4e44
PWJ
5224struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5225 void (*setup)(struct net_device *), unsigned int queue_count)
1da177e4 5226{
e8a0464c 5227 struct netdev_queue *tx;
1da177e4 5228 struct net_device *dev;
7943986c 5229 size_t alloc_size;
1ce8e7b5 5230 struct net_device *p;
df334545
ED
5231#ifdef CONFIG_RPS
5232 struct netdev_rx_queue *rx;
0a9627f2 5233 int i;
df334545 5234#endif
1da177e4 5235
b6fe17d6
SH
5236 BUG_ON(strlen(name) >= sizeof(dev->name));
5237
fd2ea0a7 5238 alloc_size = sizeof(struct net_device);
d1643d24
AD
5239 if (sizeof_priv) {
5240 /* ensure 32-byte alignment of private area */
1ce8e7b5 5241 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
d1643d24
AD
5242 alloc_size += sizeof_priv;
5243 }
5244 /* ensure 32-byte alignment of whole construct */
1ce8e7b5 5245 alloc_size += NETDEV_ALIGN - 1;
1da177e4 5246
31380de9 5247 p = kzalloc(alloc_size, GFP_KERNEL);
1da177e4 5248 if (!p) {
b6fe17d6 5249 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
1da177e4
LT
5250 return NULL;
5251 }
1da177e4 5252
7943986c 5253 tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
e8a0464c
DM
5254 if (!tx) {
5255 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5256 "tx qdiscs.\n");
ab9c73cc 5257 goto free_p;
e8a0464c
DM
5258 }
5259
df334545 5260#ifdef CONFIG_RPS
0a9627f2
TH
5261 rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5262 if (!rx) {
5263 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5264 "rx queues.\n");
5265 goto free_tx;
5266 }
5267
5268 atomic_set(&rx->count, queue_count);
5269
5270 /*
5271 * Set a pointer to first element in the array which holds the
5272 * reference count.
5273 */
5274 for (i = 0; i < queue_count; i++)
5275 rx[i].first = rx;
df334545 5276#endif
0a9627f2 5277
1ce8e7b5 5278 dev = PTR_ALIGN(p, NETDEV_ALIGN);
1da177e4 5279 dev->padded = (char *)dev - (char *)p;
ab9c73cc
JP
5280
5281 if (dev_addr_init(dev))
0a9627f2 5282 goto free_rx;
ab9c73cc 5283
22bedad3 5284 dev_mc_init(dev);
a748ee24 5285 dev_uc_init(dev);
ccffad25 5286
c346dca1 5287 dev_net_set(dev, &init_net);
1da177e4 5288
e8a0464c
DM
5289 dev->_tx = tx;
5290 dev->num_tx_queues = queue_count;
fd2ea0a7 5291 dev->real_num_tx_queues = queue_count;
e8a0464c 5292
df334545 5293#ifdef CONFIG_RPS
0a9627f2
TH
5294 dev->_rx = rx;
5295 dev->num_rx_queues = queue_count;
df334545 5296#endif
0a9627f2 5297
82cc1a7a 5298 dev->gso_max_size = GSO_MAX_SIZE;
1da177e4 5299
bb949fbd
DM
5300 netdev_init_queues(dev);
5301
15682bc4
PWJ
5302 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5303 dev->ethtool_ntuple_list.count = 0;
d565b0a1 5304 INIT_LIST_HEAD(&dev->napi_list);
9fdce099 5305 INIT_LIST_HEAD(&dev->unreg_list);
e014debe 5306 INIT_LIST_HEAD(&dev->link_watch_list);
93f154b5 5307 dev->priv_flags = IFF_XMIT_DST_RELEASE;
1da177e4
LT
5308 setup(dev);
5309 strcpy(dev->name, name);
5310 return dev;
ab9c73cc 5311
0a9627f2 5312free_rx:
df334545 5313#ifdef CONFIG_RPS
0a9627f2 5314 kfree(rx);
ab9c73cc 5315free_tx:
df334545 5316#endif
ab9c73cc 5317 kfree(tx);
ab9c73cc
JP
5318free_p:
5319 kfree(p);
5320 return NULL;
1da177e4 5321}
f25f4e44 5322EXPORT_SYMBOL(alloc_netdev_mq);
1da177e4
LT
5323
5324/**
5325 * free_netdev - free network device
5326 * @dev: device
5327 *
4ec93edb
YH
5328 * This function does the last stage of destroying an allocated device
5329 * interface. The reference to the device object is released.
1da177e4
LT
5330 * If this is the last reference then it will be freed.
5331 */
5332void free_netdev(struct net_device *dev)
5333{
d565b0a1
HX
5334 struct napi_struct *p, *n;
5335
f3005d7f
DL
5336 release_net(dev_net(dev));
5337
e8a0464c
DM
5338 kfree(dev->_tx);
5339
f001fde5
JP
5340 /* Flush device addresses */
5341 dev_addr_flush(dev);
5342
15682bc4
PWJ
5343 /* Clear ethtool n-tuple list */
5344 ethtool_ntuple_flush(dev);
5345
d565b0a1
HX
5346 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5347 netif_napi_del(p);
5348
3041a069 5349 /* Compatibility with error handling in drivers */
1da177e4
LT
5350 if (dev->reg_state == NETREG_UNINITIALIZED) {
5351 kfree((char *)dev - dev->padded);
5352 return;
5353 }
5354
5355 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5356 dev->reg_state = NETREG_RELEASED;
5357
43cb76d9
GKH
5358 /* will free via device release */
5359 put_device(&dev->dev);
1da177e4 5360}
d1b19dff 5361EXPORT_SYMBOL(free_netdev);
4ec93edb 5362
f0db275a
SH
5363/**
5364 * synchronize_net - Synchronize with packet receive processing
5365 *
5366 * Wait for packets currently being received to be done.
5367 * Does not block later packets from starting.
5368 */
4ec93edb 5369void synchronize_net(void)
1da177e4
LT
5370{
5371 might_sleep();
fbd568a3 5372 synchronize_rcu();
1da177e4 5373}
d1b19dff 5374EXPORT_SYMBOL(synchronize_net);
1da177e4
LT
5375
5376/**
44a0873d 5377 * unregister_netdevice_queue - remove device from the kernel
1da177e4 5378 * @dev: device
44a0873d 5379 * @head: list
6ebfbc06 5380 *
1da177e4 5381 * This function shuts down a device interface and removes it
d59b54b1 5382 * from the kernel tables.
44a0873d 5383 * If @head is not NULL, the device is queued to be unregistered later.
1da177e4
LT
5384 *
5385 * Callers must hold the rtnl semaphore. You may want
5386 * unregister_netdev() instead of this.
5387 */
5388
44a0873d 5389void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
1da177e4 5390{
a6620712
HX
5391 ASSERT_RTNL();
5392
44a0873d 5393 if (head) {
9fdce099 5394 list_move_tail(&dev->unreg_list, head);
44a0873d
ED
5395 } else {
5396 rollback_registered(dev);
5397 /* Finish processing unregister after unlock */
5398 net_set_todo(dev);
5399 }
1da177e4 5400}
44a0873d 5401EXPORT_SYMBOL(unregister_netdevice_queue);
1da177e4 5402
9b5e383c
ED
5403/**
5404 * unregister_netdevice_many - unregister many devices
5405 * @head: list of devices
9b5e383c
ED
5406 */
5407void unregister_netdevice_many(struct list_head *head)
5408{
5409 struct net_device *dev;
5410
5411 if (!list_empty(head)) {
5412 rollback_registered_many(head);
5413 list_for_each_entry(dev, head, unreg_list)
5414 net_set_todo(dev);
5415 }
5416}
63c8099d 5417EXPORT_SYMBOL(unregister_netdevice_many);
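/*
 * Illustrative sketch (not part of this file): batching several
 * unregistrations so the expensive synchronize_net() calls in
 * rollback_registered_many() are paid only once; "dev1" and "dev2" are
 * hypothetical devices already owned by the caller.
 *
 *	LIST_HEAD(unreg_list);
 *
 *	rtnl_lock();
 *	unregister_netdevice_queue(dev1, &unreg_list);
 *	unregister_netdevice_queue(dev2, &unreg_list);
 *	unregister_netdevice_many(&unreg_list);
 *	rtnl_unlock();
 */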
9b5e383c 5418
1da177e4
LT
5419/**
5420 * unregister_netdev - remove device from the kernel
5421 * @dev: device
5422 *
5423 * This function shuts down a device interface and removes it
d59b54b1 5424 * from the kernel tables.
1da177e4
LT
5425 *
5426 * This is just a wrapper for unregister_netdevice that takes
5427 * the rtnl semaphore. In general you want to use this and not
5428 * unregister_netdevice.
5429 */
5430void unregister_netdev(struct net_device *dev)
5431{
5432 rtnl_lock();
5433 unregister_netdevice(dev);
5434 rtnl_unlock();
5435}
1da177e4
LT
5436EXPORT_SYMBOL(unregister_netdev);
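/*
 * Illustrative sketch (not part of this file): the usual module-exit
 * counterpart of register_netdev(); note that unregistering and freeing
 * are two separate steps, and unregister_netdev() takes the RTNL itself.
 *
 *	unregister_netdev(dev);
 *	free_netdev(dev);
 */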
5437
ce286d32
EB
5438/**
5439 * dev_change_net_namespace - move device to a different network namespace
5440 * @dev: device
5441 * @net: network namespace
5442 * @pat: If not NULL name pattern to try if the current device name
5443 * is already taken in the destination network namespace.
5444 *
5445 * This function shuts down a device interface and moves it
5446 * to a new network namespace. On success 0 is returned, on
5448 * a failure a negative errno code is returned.
5448 *
5449 * Callers must hold the rtnl semaphore.
5450 */
5451
5452int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5453{
ce286d32
EB
5454 int err;
5455
5456 ASSERT_RTNL();
5457
5458 /* Don't allow namespace local devices to be moved. */
5459 err = -EINVAL;
5460 if (dev->features & NETIF_F_NETNS_LOCAL)
5461 goto out;
5462
3891845e
EB
5463#ifdef CONFIG_SYSFS
5464 /* Don't allow real devices to be moved when sysfs
5465 * is enabled.
5466 */
5467 err = -EINVAL;
5468 if (dev->dev.parent)
5469 goto out;
5470#endif
5471
ce286d32
EB
5472 /* Ensure the device has been registered */
5473 err = -EINVAL;
5474 if (dev->reg_state != NETREG_REGISTERED)
5475 goto out;
5476
5477 /* Get out if there is nothing to do */
5478 err = 0;
878628fb 5479 if (net_eq(dev_net(dev), net))
ce286d32
EB
5480 goto out;
5481
5482 /* Pick the destination device name, and ensure
5483 * we can use it in the destination network namespace.
5484 */
5485 err = -EEXIST;
d9031024 5486 if (__dev_get_by_name(net, dev->name)) {
ce286d32
EB
5487 /* We get here if we can't use the current device name */
5488 if (!pat)
5489 goto out;
d9031024 5490 if (dev_get_valid_name(net, pat, dev->name, 1))
ce286d32
EB
5491 goto out;
5492 }
5493
5494 /*
5495 * And now a mini version of register_netdevice unregister_netdevice.
5496 */
5497
5498 /* If device is running close it first. */
9b772652 5499 dev_close(dev);
ce286d32
EB
5500
5501 /* And unlink it from device chain */
5502 err = -ENODEV;
5503 unlist_netdevice(dev);
5504
5505 synchronize_net();
5506
5507 /* Shutdown queueing discipline. */
5508 dev_shutdown(dev);
5509
5510 /* Notify protocols, that we are about to destroy
5511 this device. They should clean all the things.
5512 */
5513 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
a5ee1551 5514 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
ce286d32
EB
5515
5516 /*
5517 * Flush the unicast and multicast chains
5518 */
a748ee24 5519 dev_uc_flush(dev);
22bedad3 5520 dev_mc_flush(dev);
ce286d32 5521
3891845e
EB
5522 netdev_unregister_kobject(dev);
5523
ce286d32 5524 /* Actually switch the network namespace */
c346dca1 5525 dev_net_set(dev, net);
ce286d32 5526
ce286d32
EB
5527 /* If there is an ifindex conflict assign a new one */
5528 if (__dev_get_by_index(net, dev->ifindex)) {
5529 int iflink = (dev->iflink == dev->ifindex);
5530 dev->ifindex = dev_new_index(net);
5531 if (iflink)
5532 dev->iflink = dev->ifindex;
5533 }
5534
8b41d188 5535 /* Fixup kobjects */
aaf8cdc3 5536 err = netdev_register_kobject(dev);
8b41d188 5537 WARN_ON(err);
ce286d32
EB
5538
5539 /* Add the device back in the hashes */
5540 list_netdevice(dev);
5541
5542 /* Notify protocols, that a new device appeared. */
5543 call_netdevice_notifiers(NETDEV_REGISTER, dev);
5544
d90a909e
EB
5545 /*
5546 * Prevent userspace races by waiting until the network
5547 * device is fully setup before sending notifications.
5548 */
5549 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5550
ce286d32
EB
5551 synchronize_net();
5552 err = 0;
5553out:
5554 return err;
5555}
463d0183 5556EXPORT_SYMBOL_GPL(dev_change_net_namespace);
ce286d32 5557
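/*
 * Illustrative sketch, not part of the original file: moving a device into
 * another namespace under the rtnl lock, with a "moved%d" pattern as a
 * fallback if the current name is already taken there.  "example_dev" and
 * "example_net" are hypothetical.
 */
static int example_move_to_ns(struct net_device *example_dev, struct net *example_net)
{
        int err;

        rtnl_lock();
        err = dev_change_net_namespace(example_dev, example_net, "moved%d");
        rtnl_unlock();
        return err;
}
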
static int dev_cpu_callback(struct notifier_block *nfb,
                            unsigned long action,
                            void *ocpu)
{
        struct sk_buff **list_skb;
        struct Qdisc **list_net;
        struct sk_buff *skb;
        unsigned int cpu, oldcpu = (unsigned long)ocpu;
        struct softnet_data *sd, *oldsd;

        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
                return NOTIFY_OK;

        local_irq_disable();
        cpu = smp_processor_id();
        sd = &per_cpu(softnet_data, cpu);
        oldsd = &per_cpu(softnet_data, oldcpu);

        /* Find end of our completion_queue. */
        list_skb = &sd->completion_queue;
        while (*list_skb)
                list_skb = &(*list_skb)->next;
        /* Append completion queue from offline CPU. */
        *list_skb = oldsd->completion_queue;
        oldsd->completion_queue = NULL;

        /* Find end of our output_queue. */
        list_net = &sd->output_queue;
        while (*list_net)
                list_net = &(*list_net)->next_sched;
        /* Append output queue from offline CPU. */
        *list_net = oldsd->output_queue;
        oldsd->output_queue = NULL;

        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_enable();

        /* Process offline CPU's input_pkt_queue */
        while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
                netif_rx(skb);
                incr_input_queue_head(oldsd);
        }

        return NOTIFY_OK;
}

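/*
 * Illustrative sketch, not part of the original file: the pointer-to-pointer
 * walk dev_cpu_callback() uses to append one NULL-terminated sk_buff chain
 * to another without keeping a separate tail pointer.
 */
static void example_splice_skb_chain(struct sk_buff **dst, struct sk_buff *src)
{
        while (*dst)
                dst = &(*dst)->next;    /* advance to the terminating NULL slot */
        *dst = src;                     /* hang the whole source chain there */
}
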
/**
 *	netdev_increment_features - increment feature set by one
 *	@all: current feature set
 *	@one: new feature set
 *	@mask: mask feature set
 *
 *	Computes a new feature set after adding a device with feature set
 *	@one to the master device with current feature set @all.  Will not
 *	enable anything that is off in @mask. Returns the new feature set.
 */
unsigned long netdev_increment_features(unsigned long all, unsigned long one,
                                        unsigned long mask)
{
        /* If device needs checksumming, downgrade to it. */
        if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
                all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
        else if (mask & NETIF_F_ALL_CSUM) {
                /* If one device supports v4/v6 checksumming, set for all. */
                if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
                    !(all & NETIF_F_GEN_CSUM)) {
                        all &= ~NETIF_F_ALL_CSUM;
                        all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
                }

                /* If one device supports hw checksumming, set for all. */
                if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
                        all &= ~NETIF_F_ALL_CSUM;
                        all |= NETIF_F_HW_CSUM;
                }
        }

        one |= NETIF_F_ALL_CSUM;

        one |= all & NETIF_F_ONE_FOR_ALL;
        all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
        all |= one & mask & NETIF_F_ONE_FOR_ALL;

        return all;
}
EXPORT_SYMBOL(netdev_increment_features);

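/*
 * Illustrative sketch, not part of the original file: how a bonding-style
 * master might fold its slaves' feature sets together with
 * netdev_increment_features().  The starting value and the mask below are
 * hypothetical choices; "example_slaves" and "count" are hypothetical too.
 */
static unsigned long example_master_features(struct net_device **example_slaves,
                                             int count)
{
        unsigned long features = NETIF_F_ALL_CSUM | NETIF_F_SG;
        int i;

        for (i = 0; i < count; i++)
                features = netdev_increment_features(features,
                                                     example_slaves[i]->features,
                                                     NETIF_F_ONE_FOR_ALL);
        return features;
}
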
static struct hlist_head *netdev_create_hash(void)
{
        int i;
        struct hlist_head *hash;

        hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
        if (hash != NULL)
                for (i = 0; i < NETDEV_HASHENTRIES; i++)
                        INIT_HLIST_HEAD(&hash[i]);

        return hash;
}

/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
        INIT_LIST_HEAD(&net->dev_base_head);

        net->dev_name_head = netdev_create_hash();
        if (net->dev_name_head == NULL)
                goto err_name;

        net->dev_index_head = netdev_create_hash();
        if (net->dev_index_head == NULL)
                goto err_idx;

        return 0;

err_idx:
        kfree(net->dev_name_head);
err_name:
        return -ENOMEM;
}

/**
 *	netdev_drivername - network driver for the device
 *	@dev: network device
 *	@buffer: buffer for resulting name
 *	@len: size of buffer
 *
 *	Determine network driver for device.
 */
char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
{
        const struct device_driver *driver;
        const struct device *parent;

        if (len <= 0 || !buffer)
                return buffer;
        buffer[0] = 0;

        parent = dev->dev.parent;

        if (!parent)
                return buffer;

        driver = parent->driver;
        if (driver && driver->name)
                strlcpy(buffer, driver->name, len);
        return buffer;
}

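/*
 * Illustrative sketch, not part of the original file: the kind of call the
 * transmit-timeout watchdog makes to report which driver owns a wedged
 * device.  "example_dev" is hypothetical.
 */
static void example_report_timeout(struct net_device *example_dev)
{
        char drivername[64];

        printk(KERN_WARNING "%s (%s): transmit queue timed out\n",
               example_dev->name,
               netdev_drivername(example_dev, drivername, sizeof(drivername)));
}
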
static void __net_exit netdev_exit(struct net *net)
{
        kfree(net->dev_name_head);
        kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
        .init = netdev_init,
        .exit = netdev_exit,
};

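/*
 * Illustrative sketch, not part of the original file: the same
 * pernet_operations pattern used by a hypothetical "example" subsystem,
 * whose per-namespace state is created with each namespace and torn down
 * when the namespace exits.
 */
static int __net_init example_net_init(struct net *net)
{
        /* allocate and hang per-namespace state off @net here */
        return 0;
}

static void __net_exit example_net_exit(struct net *net)
{
        /* free whatever example_net_init() allocated */
}

static struct pernet_operations example_net_ops = {
        .init = example_net_init,
        .exit = example_net_exit,
};
/* registered from an initcall with register_pernet_subsys(&example_net_ops) */
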
static void __net_exit default_device_exit(struct net *net)
{
        struct net_device *dev, *aux;
        /*
         * Push all migratable network devices back to the
         * initial network namespace
         */
        rtnl_lock();
        for_each_netdev_safe(net, dev, aux) {
                int err;
                char fb_name[IFNAMSIZ];

                /* Ignore unmoveable devices (i.e. loopback) */
                if (dev->features & NETIF_F_NETNS_LOCAL)
                        continue;

                /* Leave virtual devices for the generic cleanup */
                if (dev->rtnl_link_ops)
                        continue;

                /* Push remaining network devices to init_net */
                snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
                err = dev_change_net_namespace(dev, &init_net, fb_name);
                if (err) {
                        printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
                                __func__, dev->name, err);
                        BUG();
                }
        }
        rtnl_unlock();
}

static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
        /* At exit all network devices must be removed from a network
         * namespace.  Do this in the reverse order of registration.
         * Do this across as many network namespaces as possible to
         * improve batching efficiency.
         */
        struct net_device *dev;
        struct net *net;
        LIST_HEAD(dev_kill_list);

        rtnl_lock();
        list_for_each_entry(net, net_list, exit_list) {
                for_each_netdev_reverse(net, dev) {
                        if (dev->rtnl_link_ops)
                                dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
                        else
                                unregister_netdevice_queue(dev, &dev_kill_list);
                }
        }
        unregister_netdevice_many(&dev_kill_list);
        rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
        .exit = default_device_exit,
        .exit_batch = default_device_exit_batch,
};

/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 *
 */

/*
 *	This is called single threaded during boot, so no need
 *	to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
        int i, rc = -ENOMEM;

        BUG_ON(!dev_boot_phase);

        if (dev_proc_init())
                goto out;

        if (netdev_kobject_init())
                goto out;

        INIT_LIST_HEAD(&ptype_all);
        for (i = 0; i < PTYPE_HASH_SIZE; i++)
                INIT_LIST_HEAD(&ptype_base[i]);

        if (register_pernet_subsys(&netdev_net_ops))
                goto out;

        /*
         *	Initialise the packet receive queues.
         */

        for_each_possible_cpu(i) {
                struct softnet_data *queue;

                queue = &per_cpu(softnet_data, i);
                skb_queue_head_init(&queue->input_pkt_queue);
                queue->completion_queue = NULL;
                INIT_LIST_HEAD(&queue->poll_list);

#ifdef CONFIG_RPS
                queue->csd.func = trigger_softirq;
                queue->csd.info = queue;
                queue->csd.flags = 0;
                queue->cpu = i;
#endif

                queue->backlog.poll = process_backlog;
                queue->backlog.weight = weight_p;
                queue->backlog.gro_list = NULL;
                queue->backlog.gro_count = 0;
        }

        dev_boot_phase = 0;

        /* The loopback device is special: if any other network device
         * is present in a network namespace, the loopback device must
         * be present too.  Since we now dynamically allocate and free
         * the loopback device, ensure this invariant is maintained by
         * keeping the loopback device as the first device on the list
         * of network devices, so that it is the first device that
         * appears and the last network device that disappears.
         */
        if (register_pernet_device(&loopback_net_ops))
                goto out;

        if (register_pernet_device(&default_device_ops))
                goto out;

        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
        open_softirq(NET_RX_SOFTIRQ, net_rx_action);

        hotcpu_notifier(dev_cpu_callback, 0);
        dst_init();
        dev_mcast_init();
        rc = 0;
out:
        return rc;
}

subsys_initcall(net_dev_init);

static int __init initialize_hashrnd(void)
{
        get_random_bytes(&hashrnd, sizeof(hashrnd));
        return 0;
}

late_initcall_sync(initialize_hashrnd);
