/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *		Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *		Paul Rusty Russell :	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <linux/if_bridge.h>
#include <linux/if_macvlan.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/wext.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/*
 *	The list of packet types we will receive (as opposed to discard)
 *	and the routines to invoke.
 *
 *	Why 16. Because with 16 the only overlap we get on a hash of the
 *	low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *	NOTE:  That is no longer true with the addition of VLAN tags.  Not
 *	       sure which should go first, but I bet it won't make much
 *	       difference if we are running VLANs.  The good news is that
 *	       this protocol won't be in the list unless compiled in, so
 *	       the average user (w/out VLANs) will not be adversely affected.
 *	       --BLG
 *
 *		0800	IP
 *		8100	802.1Q VLAN
 *		0001	802.3
 *		0002	AX.25
 *		0004	802.2
 *		8035	RARP
 *		0005	SNAP
 *		0805	X.25
 *		0806	ARP
 *		8137	IPX
 *		0009	Localtalk
 *		86DD	IPv6
 */

#define PTYPE_HASH_SIZE	(16)
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;	/* Taps */

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);
	return 0;
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
	 ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
	 "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if a protocol handler that mangles packets
 *	is first on the list, it cannot sense that the packet is cloned
 *	and should be copied-on-write; it will change the data and
 *	subsequent readers will see a broken packet.
 *						--ANK (980803)
 */

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep, therefore it can not guarantee that
 *	all CPUs that are in the middle of receiving packets will see
 *	the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	int hash;

	spin_lock_bh(&ptype_lock);
	if (pt->type == htons(ETH_P_ALL))
		list_add_rcu(&pt->list, &ptype_all);
	else {
		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
		list_add_rcu(&pt->list, &ptype_base[hash]);
	}
	spin_unlock_bh(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
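
/*
 * Usage sketch (illustrative only, not part of this file): a module can
 * register a tap for every frame with dev_add_pack().  The names my_pt
 * and my_rcv below are hypothetical.  A handler owns the skb it is given
 * and must free it (or pass it on) before returning:
 *
 *	static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *			  struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		printk(KERN_DEBUG "tap: %u bytes on %s\n", skb->len, dev->name);
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_pt __read_mostly = {
 *		.type	= cpu_to_be16(ETH_P_ALL),
 *		.func	= my_rcv,
 *	};
 *
 * dev_add_pack(&my_pt) goes in module init, dev_remove_pack(&my_pt) in
 * module exit (the latter may sleep, see below).
 */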

/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head;
	struct packet_type *pt1;

	spin_lock_bh(&ptype_lock);

	if (pt->type == htons(ETH_P_ALL))
		head = &ptype_all;
	else
		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
out:
	spin_unlock_bh(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add - add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine
 *	for all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check - check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq	= s[i].map.irq;
			dev->base_addr	= s[i].map.base_addr;
			dev->mem_start	= s[i].map.mem_start;
			dev->mem_end	= s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);

/**
 *	netdev_boot_base - get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
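
/*
 * Example: "netdev=" is parsed by get_options() above, so up to four
 * integers (irq, base_addr, mem_start, mem_end) may precede the name.
 * A boot command line such as (values illustrative):
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * records IRQ 9 and I/O base 0x300 for the device that later probes as
 * eth0; netdev_boot_setup_check() hands the values to the driver.
 */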

/*******************************************************************************

			Device Interface Subroutines

*******************************************************************************/

/**
 *	__dev_get_by_name - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
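
/*
 * A typical caller pairs dev_get_by_name() with dev_put() once it is
 * done with the device, e.g. (illustrative):
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		printk(KERN_INFO "%s: ifindex %d\n", dev->name, dev->ifindex);
 *		dev_put(dev);
 *	}
 *
 * Under rcu_read_lock(), dev_get_by_name_rcu() above avoids the refcount
 * round trip, but the pointer is only valid inside the read-side
 * critical section.
 */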

/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);

/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	dev_getbyhwaddr - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device. The caller must hold the
 *	rtnl semaphore. The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking.
 *
 *	BUGS:
 *	If the API was consistent this would be __dev_get_by_hwaddr
 */

struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
{
	struct net_device *dev;

	ASSERT_RTNL();

	for_each_netdev(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	rtnl_lock();
	dev = __dev_getfirstbyhwtype(net, type);
	if (dev)
		dev_hold(dev);
	rtnl_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
				    unsigned short mask)
{
	struct net_device *dev, *ret;

	ret = NULL;
	rcu_read_lock();
	for_each_netdev_rcu(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_get_by_flags);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
int dev_valid_name(const char *name)
{
	if (*name == '\0')
		return 0;
	if (strlen(name) >= IFNAMSIZ)
		return 0;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return 0;

	while (*name) {
		if (*name == '/' || isspace(*name))
			return 0;
		name++;
	}
	return 1;
}
EXPORT_SYMBOL(dev_valid_name);
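
/*
 * For illustration: "eth0" and "wlan-guest" pass dev_valid_name(), while
 * "", ".", "..", "a/b", names containing whitespace and names of
 * IFNAMSIZ or more characters are all rejected.
 */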

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/*  avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);

static int dev_get_valid_name(struct net *net, const char *name, char *buf,
			      bool fmt)
{
	if (!dev_valid_name(name))
		return -EINVAL;

	if (fmt && strchr(name, '%'))
		return __dev_alloc_name(net, name, buf);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (buf != name)
		strlcpy(buf, name, IFNAMSIZ);

	return 0;
}
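
/*
 * Usage sketch (illustrative): a driver that registers devices under a
 * wildcard name lets the core pick the unit number:
 *
 *	err = dev_alloc_name(dev, "dummy%d");
 *	if (err < 0)
 *		goto out_free;
 *
 * On success dev->name holds e.g. "dummy0" and err is the unit number.
 * Drivers that leave a "%d" in the name passed to alloc_netdev() get the
 * same expansion when the device is registered.
 */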

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change the name of a device.  A format string such as "eth%d"
 *	may be passed for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
		return 0;

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, newname, dev->name, 1);
	if (err < 0)
		return err;

rollback:
	/* For now only devices in the initial network namespace
	 * are in sysfs.
	 */
	if (net_eq(net, &init_net)) {
		ret = device_rename(&dev->dev, dev->name);
		if (ret) {
			memcpy(dev->name, oldname, IFNAMSIZ);
			return ret;
		}
	}

	write_lock_bh(&dev_base_lock);
	hlist_del(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			printk(KERN_ERR
			       "%s: name change rollback failed: %d.\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from @alias
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		if (dev->ifalias) {
			kfree(dev->ifalias);
			dev->ifalias = NULL;
		}
		return 0;
	}

	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!dev->ifalias)
		return -ENOMEM;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}

/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifiers(NETDEV_CHANGE, dev);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
	}
}
EXPORT_SYMBOL(netdev_state_change);

void netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);

/**
 *	dev_load 	- load a network module
 *	@net: the applicable net namespace
 *	@name: name of interface
 *
 *	If a network interface is not present and the process has suitable
 *	privileges this function loads the module. If module loading is not
 *	available in this kernel then it becomes a nop.
 */

void dev_load(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	rcu_read_unlock();

	if (!dev && capable(CAP_NET_ADMIN))
		request_module("%s", name);
}
EXPORT_SYMBOL(dev_load);

static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	/*
	 *	Is it even present?
	 */
	if (!netif_device_present(dev))
		return -ENODEV;

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	/*
	 *	Call device private open method
	 */
	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	/*
	 *	If it went open OK then:
	 */

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		/*
		 *	Set the flags.
		 */
		dev->flags |= IFF_UP;

		/*
		 *	Enable NET_DMA
		 */
		net_dmaengine_get();

		/*
		 *	Initialize multicasting status
		 */
		dev_set_rx_mode(dev);

		/*
		 *	Wakeup transmit queue engine
		 */
		dev_activate(dev);
	}

	return ret;
}

/**
 *	dev_open	- prepare an interface for use.
 *	@dev:	device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	/*
	 *	Is it already up?
	 */
	if (dev->flags & IFF_UP)
		return 0;

	/*
	 *	Open device
	 */
	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	/*
	 *	... and announce new interface.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);
static int __dev_close(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	ASSERT_RTNL();
	might_sleep();

	/*
	 *	Tell people we are going down, so that they can
	 *	prepare for death while the device is still operating.
	 */
	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

	clear_bit(__LINK_STATE_START, &dev->state);

	/* Synchronize to scheduled poll. We cannot touch poll list,
	 * it can even be on a different cpu. So just clear netif_running().
	 *
	 * dev->stop() will invoke napi_disable() on all of its
	 * napi_struct instances on this device.
	 */
	smp_mb__after_clear_bit(); /* Commit netif_running(). */

	dev_deactivate(dev);

	/*
	 *	Call the device specific close. This cannot fail.
	 *	Only if device is UP
	 *
	 *	We allow it to be called even after a DETACH hot-plug
	 *	event.
	 */
	if (ops->ndo_stop)
		ops->ndo_stop(dev);

	/*
	 *	Device is now down.
	 */

	dev->flags &= ~IFF_UP;

	/*
	 *	Shutdown NET_DMA
	 */
	net_dmaengine_put();

	return 0;
}

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	if (!(dev->flags & IFF_UP))
		return 0;

	__dev_close(dev);

	/*
	 *	Tell people we are down
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_DOWN, dev);

	return 0;
}
EXPORT_SYMBOL(dev_close);

/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
	    dev->ethtool_ops->set_flags) {
		u32 flags = dev->ethtool_ops->get_flags(dev);
		if (flags & ETH_FLAG_LRO) {
			flags &= ~ETH_FLAG_LRO;
			dev->ethtool_ops->set_flags(dev, flags);
		}
	}
	WARN_ON(dev->features & NETIF_F_LRO);
}
EXPORT_SYMBOL(dev_disable_lro);

static int dev_boot_phase = 1;

/*
 *	Device change register/unregister. These are not inline or static
 *	as we export them to the world.
 */

/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 *	When registered, all registration and up events are replayed
 *	to the new notifier to give it a race-free view of the network
 *	device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			nb->notifier_call(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				break;

			if (dev->flags & IFF_UP) {
				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
				nb->notifier_call(nb, NETDEV_DOWN, dev);
			}
			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
		}
	}

	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
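
/*
 * Usage sketch (illustrative; my_nb and my_netdev_event are hypothetical):
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *			printk(KERN_INFO "%s is up\n", dev->name);
 *			break;
 *		case NETDEV_GOING_DOWN:
 *			printk(KERN_INFO "%s is going down\n", dev->name);
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_nb);
 *
 * Because registration replays NETDEV_REGISTER/NETDEV_UP for existing
 * devices (see above), the callback must cope with being invoked from
 * within register_netdevice_notifier() itself.
 */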

/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	return raw_notifier_call_chain(&netdev_chain, val, dev);
}

/* When > 0 there are consumers of rx skb time stamps */
static atomic_t netstamp_needed = ATOMIC_INIT(0);

void net_enable_timestamp(void)
{
	atomic_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
	atomic_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);

static inline void net_timestamp(struct sk_buff *skb)
{
	if (atomic_read(&netstamp_needed))
		__net_timestamp(skb);
	else
		skb->tstamp.tv64 = 0;
}

/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP	(packet was dropped)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	skb_orphan(skb);

	if (!(dev->flags & IFF_UP))
		return NET_RX_DROP;

	if (skb->len > (dev->mtu + dev->hard_header_len))
		return NET_RX_DROP;

	skb_set_dev(skb, dev);
	skb->tstamp.tv64 = 0;
	skb->pkt_type = PACKET_HOST;
	skb->protocol = eth_type_trans(skb, dev);
	return netif_rx(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
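
/*
 * Usage sketch (illustrative): a veth-style pair driver forwards frames
 * from one peer's ndo_start_xmit() into the other peer's receive path;
 * my_xmit and the peer lookup are hypothetical:
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = my_get_peer(dev);
 *
 *		if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS)
 *			dev->stats.tx_packets++;
 *		else
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;
 *	}
 */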

/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;

#ifdef CONFIG_NET_CLS_ACT
	if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
		net_timestamp(skb);
#else
	net_timestamp(skb);
#endif

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if ((ptype->dev == dev || !ptype->dev) &&
		    (ptype->af_packet_priv == NULL ||
		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
			if (!skb2)
				break;

			/* skb->nh should be correctly
			   set by sender, so that the second statement is
			   just protection against buggy protocols.
			 */
			skb_reset_mac_header(skb2);

			if (skb_network_header(skb2) < skb2->data ||
			    skb2->network_header > skb2->tail) {
				if (net_ratelimit())
					printk(KERN_CRIT "protocol %04x is "
					       "buggy, dev %s\n",
					       skb2->protocol, dev->name);
				skb_reset_network_header(skb2);
			}

			skb2->transport_header = skb2->network_header;
			skb2->pkt_type = PACKET_OUTGOING;
			ptype->func(skb2, skb->dev, ptype, skb->dev);
		}
	}
	rcu_read_unlock();
}

static inline void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = &__get_cpu_var(softnet_data);
	q->next_sched = sd->output_queue;
	sd->output_queue = q;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}

void __netif_schedule(struct Qdisc *q)
{
	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
		__netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);

void dev_kfree_skb_irq(struct sk_buff *skb)
{
	if (atomic_dec_and_test(&skb->users)) {
		struct softnet_data *sd;
		unsigned long flags;

		local_irq_save(flags);
		sd = &__get_cpu_var(softnet_data);
		skb->next = sd->completion_queue;
		sd->completion_queue = skb;
		raise_softirq_irqoff(NET_TX_SOFTIRQ);
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL(dev_kfree_skb_irq);

void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled())
		dev_kfree_skb_irq(skb);
	else
		dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);

/**
 * netif_device_detach - mark device as removed
 * @dev: network device
 *
 * Mark device as removed from system and therefore no longer available.
 */
void netif_device_detach(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_stop_all_queues(dev);
	}
}
EXPORT_SYMBOL(netif_device_detach);

/**
 * netif_device_attach - mark device as attached
 * @dev: network device
 *
 * Mark device as attached to the system and restart if needed.
 */
void netif_device_attach(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_wake_all_queues(dev);
		__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_device_attach);
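
/*
 * Usage sketch (illustrative): PCI driver suspend/resume hooks typically
 * bracket power transitions with these calls so the stack stops handing
 * packets to hardware that is powered down; my_suspend/my_resume are
 * hypothetical:
 *
 *	static int my_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(dev);
 *		pci_save_state(pdev);
 *		pci_set_power_state(pdev, pci_choose_state(pdev, state));
 *		return 0;
 *	}
 *
 *	static int my_resume(struct pci_dev *pdev)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		pci_set_power_state(pdev, PCI_D0);
 *		pci_restore_state(pdev);
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 */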

static bool can_checksum_protocol(unsigned long features, __be16 protocol)
{
	return ((features & NETIF_F_GEN_CSUM) ||
		((features & NETIF_F_IP_CSUM) &&
		 protocol == htons(ETH_P_IP)) ||
		((features & NETIF_F_IPV6_CSUM) &&
		 protocol == htons(ETH_P_IPV6)) ||
		((features & NETIF_F_FCOE_CRC) &&
		 protocol == htons(ETH_P_FCOE)));
}

static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
{
	if (can_checksum_protocol(dev->features, skb->protocol))
		return true;

	if (skb->protocol == htons(ETH_P_8021Q)) {
		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
		if (can_checksum_protocol(dev->features & dev->vlan_features,
					  veh->h_vlan_encapsulated_proto))
			return true;
	}

	return false;
}

/**
 * skb_dev_set -- assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * If an skb is owned by a device already, we have to reset
 * all data private to the namespace a device belongs to
 * before assigning it a new device.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	skb_dst_drop(skb);
	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}
	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */

/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		/* Let GSO fix up the checksum. */
		goto out_set_summed;
	}

	offset = skb->csum_start - skb_headroom(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
f6a78bfc
HX
1706/**
1707 * skb_gso_segment - Perform segmentation on skb.
1708 * @skb: buffer to segment
576a30eb 1709 * @features: features for the output path (see dev->features)
f6a78bfc
HX
1710 *
1711 * This function segments the given skb and returns a list of segments.
576a30eb
HX
1712 *
1713 * It may return NULL if the skb requires no segmentation. This is
1714 * only possible when GSO is used for verifying header integrity.
f6a78bfc 1715 */
576a30eb 1716struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
f6a78bfc
HX
1717{
1718 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1719 struct packet_type *ptype;
252e3346 1720 __be16 type = skb->protocol;
a430a43d 1721 int err;
f6a78bfc 1722
459a98ed 1723 skb_reset_mac_header(skb);
b0e380b1 1724 skb->mac_len = skb->network_header - skb->mac_header;
f6a78bfc
HX
1725 __skb_pull(skb, skb->mac_len);
1726
67fd1a73
HX
1727 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1728 struct net_device *dev = skb->dev;
1729 struct ethtool_drvinfo info = {};
1730
1731 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1732 dev->ethtool_ops->get_drvinfo(dev, &info);
1733
1734 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1735 "ip_summed=%d",
1736 info.driver, dev ? dev->features : 0L,
1737 skb->sk ? skb->sk->sk_route_caps : 0L,
1738 skb->len, skb->data_len, skb->ip_summed);
1739
a430a43d
HX
1740 if (skb_header_cloned(skb) &&
1741 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1742 return ERR_PTR(err);
1743 }
1744
f6a78bfc 1745 rcu_read_lock();
82d8a867
PE
1746 list_for_each_entry_rcu(ptype,
1747 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
f6a78bfc 1748 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
84fa7933 1749 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
a430a43d
HX
1750 err = ptype->gso_send_check(skb);
1751 segs = ERR_PTR(err);
1752 if (err || skb_gso_ok(skb, features))
1753 break;
d56f90a7
ACM
1754 __skb_push(skb, (skb->data -
1755 skb_network_header(skb)));
a430a43d 1756 }
576a30eb 1757 segs = ptype->gso_segment(skb, features);
f6a78bfc
HX
1758 break;
1759 }
1760 }
1761 rcu_read_unlock();
1762
98e399f8 1763 __skb_push(skb, skb->data - skb_mac_header(skb));
576a30eb 1764
f6a78bfc
HX
1765 return segs;
1766}
f6a78bfc
HX
1767EXPORT_SYMBOL(skb_gso_segment);
1768
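/*
 * Example (editor's sketch): handling the three possible returns of
 * skb_gso_segment() -- NULL (header verification only), ERR_PTR() on
 * failure, or a list of segments linked through skb->next.
 * foo_xmit_one() is a hypothetical single-frame transmit helper.
 */
#if 0
static int foo_xmit_gso(struct sk_buff *skb, struct net_device *dev)
{
	struct sk_buff *segs = skb_gso_segment(skb, dev->features);

	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)			/* nothing to do, headers verified */
		return foo_xmit_one(skb);

	kfree_skb(skb);			/* original is no longer needed */
	while (segs) {
		struct sk_buff *next = segs->next;

		segs->next = NULL;
		foo_xmit_one(segs);
		segs = next;
	}
	return 0;
}
#endif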
fb286bb2
HX
1769/* Take action when hardware reception checksum errors are detected. */
1770#ifdef CONFIG_BUG
1771void netdev_rx_csum_fault(struct net_device *dev)
1772{
1773 if (net_ratelimit()) {
4ec93edb 1774 printk(KERN_ERR "%s: hw csum failure.\n",
246a4212 1775 dev ? dev->name : "<unknown>");
fb286bb2
HX
1776 dump_stack();
1777 }
1778}
1779EXPORT_SYMBOL(netdev_rx_csum_fault);
1780#endif
1781
1da177e4
LT
1782/* Actually, we should eliminate this check as soon as we know that:
1783 * 1. An IOMMU is present and can map all the memory.
1784 * 2. No high memory really exists on this machine.
1785 */
1786
1787static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1788{
3d3a8533 1789#ifdef CONFIG_HIGHMEM
1da177e4
LT
1790 int i;
1791
1792 if (dev->features & NETIF_F_HIGHDMA)
1793 return 0;
1794
1795 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1796 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1797 return 1;
1798
3d3a8533 1799#endif
1da177e4
LT
1800 return 0;
1801}
1da177e4 1802
f6a78bfc
HX
1803struct dev_gso_cb {
1804 void (*destructor)(struct sk_buff *skb);
1805};
1806
1807#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1808
1809static void dev_gso_skb_destructor(struct sk_buff *skb)
1810{
1811 struct dev_gso_cb *cb;
1812
1813 do {
1814 struct sk_buff *nskb = skb->next;
1815
1816 skb->next = nskb->next;
1817 nskb->next = NULL;
1818 kfree_skb(nskb);
1819 } while (skb->next);
1820
1821 cb = DEV_GSO_CB(skb);
1822 if (cb->destructor)
1823 cb->destructor(skb);
1824}
1825
1826/**
1827 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1828 * @skb: buffer to segment
1829 *
1830 * This function segments the given skb and stores the list of segments
1831 * in skb->next.
1832 */
1833static int dev_gso_segment(struct sk_buff *skb)
1834{
1835 struct net_device *dev = skb->dev;
1836 struct sk_buff *segs;
576a30eb
HX
1837 int features = dev->features & ~(illegal_highdma(dev, skb) ?
1838 NETIF_F_SG : 0);
1839
1840 segs = skb_gso_segment(skb, features);
1841
1842 /* Verifying header integrity only. */
1843 if (!segs)
1844 return 0;
f6a78bfc 1845
801678c5 1846 if (IS_ERR(segs))
f6a78bfc
HX
1847 return PTR_ERR(segs);
1848
1849 skb->next = segs;
1850 DEV_GSO_CB(skb)->destructor = skb->destructor;
1851 skb->destructor = dev_gso_skb_destructor;
1852
1853 return 0;
1854}
1855
fd2ea0a7
DM
1856int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1857 struct netdev_queue *txq)
f6a78bfc 1858{
00829823 1859 const struct net_device_ops *ops = dev->netdev_ops;
572a9d7b 1860 int rc = NETDEV_TX_OK;
00829823 1861
f6a78bfc 1862 if (likely(!skb->next)) {
9be9a6b9 1863 if (!list_empty(&ptype_all))
f6a78bfc
HX
1864 dev_queue_xmit_nit(skb, dev);
1865
576a30eb
HX
1866 if (netif_needs_gso(dev, skb)) {
1867 if (unlikely(dev_gso_segment(skb)))
1868 goto out_kfree_skb;
1869 if (skb->next)
1870 goto gso;
1871 }
f6a78bfc 1872
93f154b5
ED
1873 /*
1874 * If the device doesn't need skb->dst, release it right now while
1875 * it's hot in this CPU's cache
1876 */
adf30907
ED
1877 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1878 skb_dst_drop(skb);
1879
ac45f602 1880 rc = ops->ndo_start_xmit(skb, dev);
ec634fe3 1881 if (rc == NETDEV_TX_OK)
08baf561 1882 txq_trans_update(txq);
ac45f602
PO
1883 /*
1884 * TODO: if skb_orphan() was called by
1885 * dev->hard_start_xmit() (for example, the unmodified
1886 * igb driver does that; bnx2 doesn't), then
1887 * skb_tx_software_timestamp() will be unable to send
1888 * back the time stamp.
1889 *
1890 * How can this be prevented? Always create another
1891 * reference to the socket before calling
1892 * dev->hard_start_xmit()? Prevent skb_orphan() from
1893 * doing anything in dev->hard_start_xmit() by clearing
1894 * the skb destructor before the call and restoring it
1895 * afterwards, then doing the skb_orphan() ourselves?
1896 */
ac45f602 1897 return rc;
f6a78bfc
HX
1898 }
1899
576a30eb 1900gso:
f6a78bfc
HX
1901 do {
1902 struct sk_buff *nskb = skb->next;
f6a78bfc
HX
1903
1904 skb->next = nskb->next;
1905 nskb->next = NULL;
068a2de5
KK
1906
1907 /*
1908 * If the device doesn't need nskb->dst, release it right now while
1909 * it's hot in this CPU's cache
1910 */
1911 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1912 skb_dst_drop(nskb);
1913
00829823 1914 rc = ops->ndo_start_xmit(nskb, dev);
ec634fe3 1915 if (unlikely(rc != NETDEV_TX_OK)) {
572a9d7b
PM
1916 if (rc & ~NETDEV_TX_MASK)
1917 goto out_kfree_gso_skb;
f54d9e8d 1918 nskb->next = skb->next;
f6a78bfc
HX
1919 skb->next = nskb;
1920 return rc;
1921 }
08baf561 1922 txq_trans_update(txq);
fd2ea0a7 1923 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
f54d9e8d 1924 return NETDEV_TX_BUSY;
f6a78bfc 1925 } while (skb->next);
4ec93edb 1926
572a9d7b
PM
1927out_kfree_gso_skb:
1928 if (likely(skb->next == NULL))
1929 skb->destructor = DEV_GSO_CB(skb)->destructor;
f6a78bfc
HX
1930out_kfree_skb:
1931 kfree_skb(skb);
572a9d7b 1932 return rc;
f6a78bfc
HX
1933}
1934
7019298a 1935static u32 skb_tx_hashrnd;
b6b2fed1 1936
9247744e 1937u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
8f0f2223 1938{
7019298a 1939 u32 hash;
b6b2fed1 1940
513de11b
DM
1941 if (skb_rx_queue_recorded(skb)) {
1942 hash = skb_get_rx_queue(skb);
d1b19dff 1943 while (unlikely(hash >= dev->real_num_tx_queues))
513de11b
DM
1944 hash -= dev->real_num_tx_queues;
1945 return hash;
1946 }
ec581f6a
ED
1947
1948 if (skb->sk && skb->sk->sk_hash)
7019298a 1949 hash = skb->sk->sk_hash;
ec581f6a 1950 else
7019298a 1951 hash = skb->protocol;
d5a9e24a 1952
7019298a 1953 hash = jhash_1word(hash, skb_tx_hashrnd);
b6b2fed1
DM
1954
1955 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
8f0f2223 1956}
9247744e 1957EXPORT_SYMBOL(skb_tx_hash);
8f0f2223 1958
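/*
 * The final step above spreads a 32-bit hash h uniformly over the
 * n == real_num_tx_queues queues without an expensive modulo:
 * (h * n) >> 32 is the integer part of h * n / 2^32, which always
 * lands in [0, n).  Editor's sketch of the same trick in isolation:
 */
#if 0
static u16 scale_to_queues(u32 hash, unsigned int num_queues)
{
	/* multiply into a 64-bit intermediate, keep the top 32 bits */
	return (u16)(((u64)hash * num_queues) >> 32);
}
#endif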
ed04642f
ED
1959static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
1960{
1961 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
1962 if (net_ratelimit()) {
1963 WARN(1, "%s selects TX queue %d, but "
1964 "real number of TX queues is %d\n",
1965 dev->name, queue_index,
1966 dev->real_num_tx_queues);
1967 }
1968 return 0;
1969 }
1970 return queue_index;
1971}
1972
e8a0464c
DM
1973static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1974 struct sk_buff *skb)
1975{
a4ee3ce3
KK
1976 u16 queue_index;
1977 struct sock *sk = skb->sk;
1978
1979 if (sk_tx_queue_recorded(sk)) {
1980 queue_index = sk_tx_queue_get(sk);
1981 } else {
1982 const struct net_device_ops *ops = dev->netdev_ops;
1983
1984 if (ops->ndo_select_queue) {
1985 queue_index = ops->ndo_select_queue(dev, skb);
ed04642f 1986 queue_index = dev_cap_txqueue(dev, queue_index);
a4ee3ce3
KK
1987 } else {
1988 queue_index = 0;
1989 if (dev->real_num_tx_queues > 1)
1990 queue_index = skb_tx_hash(dev, skb);
fd2ea0a7 1991
8728c544 1992 if (sk) {
05d17608 1993 struct dst_entry *dst = rcu_dereference_bh(sk->sk_dst_cache);
8728c544
ED
1994
1995 if (dst && skb_dst(skb) == dst)
1996 sk_tx_queue_set(sk, queue_index);
1997 }
a4ee3ce3
KK
1998 }
1999 }
eae792b7 2000
fd2ea0a7
DM
2001 skb_set_queue_mapping(skb, queue_index);
2002 return netdev_get_tx_queue(dev, queue_index);
e8a0464c
DM
2003}
2004
bbd8a0d3
KK
2005static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2006 struct net_device *dev,
2007 struct netdev_queue *txq)
2008{
2009 spinlock_t *root_lock = qdisc_lock(q);
2010 int rc;
2011
2012 spin_lock(root_lock);
2013 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2014 kfree_skb(skb);
2015 rc = NET_XMIT_DROP;
2016 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2017 !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
2018 /*
2019 * This is a work-conserving queue; there are no old skbs
2020 * waiting to be sent out; and the qdisc is not running -
2021 * xmit the skb directly.
2022 */
2023 __qdisc_update_bstats(q, skb->len);
2024 if (sch_direct_xmit(skb, q, dev, txq, root_lock))
2025 __qdisc_run(q);
2026 else
2027 clear_bit(__QDISC_STATE_RUNNING, &q->state);
2028
2029 rc = NET_XMIT_SUCCESS;
2030 } else {
2031 rc = qdisc_enqueue_root(skb, q);
2032 qdisc_run(q);
2033 }
2034 spin_unlock(root_lock);
2035
2036 return rc;
2037}
2038
4b258461
KK
2039/*
2040 * Returns true if either:
2041 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2042 * 2. skb is fragmented and the device does not support SG, or if
2043 * at least one of fragments is in highmem and device does not
2044 * support DMA from it.
2045 */
2046static inline int skb_needs_linearize(struct sk_buff *skb,
2047 struct net_device *dev)
2048{
2049 return (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
2050 (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
2051 illegal_highdma(dev, skb)));
2052}
2053
d29f749e
DJ
2054/**
2055 * dev_queue_xmit - transmit a buffer
2056 * @skb: buffer to transmit
2057 *
2058 * Queue a buffer for transmission to a network device. The caller must
2059 * have set the device and priority and built the buffer before calling
2060 * this function. The function can be called from an interrupt.
2061 *
2062 * A negative errno code is returned on a failure. A success does not
2063 * guarantee the frame will be transmitted as it may be dropped due
2064 * to congestion or traffic shaping.
2065 *
2066 * -----------------------------------------------------------------------------------
2067 * I notice this method can also return errors from the queue disciplines,
2068 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2069 * be positive.
2070 *
2071 * Regardless of the return value, the skb is consumed, so it is currently
2072 * difficult to retry a send to this method. (You can bump the ref count
2073 * before sending to hold a reference for retry if you are careful.)
2074 *
2075 * When calling this method, interrupts MUST be enabled. This is because
2076 * the BH enable code must have IRQs enabled so that it will not deadlock.
2077 * --BLG
2078 */
1da177e4
LT
2079int dev_queue_xmit(struct sk_buff *skb)
2080{
2081 struct net_device *dev = skb->dev;
dc2b4847 2082 struct netdev_queue *txq;
1da177e4
LT
2083 struct Qdisc *q;
2084 int rc = -ENOMEM;
2085
f6a78bfc
HX
2086 /* GSO will handle the following emulations directly. */
2087 if (netif_needs_gso(dev, skb))
2088 goto gso;
2089
4b258461
KK
2090 /* Convert a paged skb to linear, if required */
2091 if (skb_needs_linearize(skb, dev) && __skb_linearize(skb))
1da177e4
LT
2092 goto out_kfree_skb;
2093
2094 /* If packet is not checksummed and device does not support
2095 * checksumming for this protocol, complete checksumming here.
2096 */
663ead3b
HX
2097 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2098 skb_set_transport_header(skb, skb->csum_start -
2099 skb_headroom(skb));
6de329e2
BH
2100 if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
2101 goto out_kfree_skb;
663ead3b 2102 }
1da177e4 2103
f6a78bfc 2104gso:
4ec93edb
YH
2105 /* Disable soft irqs for various locks below. Also
2106 * stops preemption for RCU.
1da177e4 2107 */
4ec93edb 2108 rcu_read_lock_bh();
1da177e4 2109
eae792b7 2110 txq = dev_pick_tx(dev, skb);
a898def2 2111 q = rcu_dereference_bh(txq->qdisc);
37437bb2 2112
1da177e4 2113#ifdef CONFIG_NET_CLS_ACT
d1b19dff 2114 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1da177e4
LT
2115#endif
2116 if (q->enqueue) {
bbd8a0d3 2117 rc = __dev_xmit_skb(skb, q, dev, txq);
37437bb2 2118 goto out;
1da177e4
LT
2119 }
2120
2121 /* The device has no queue. Common case for software devices:
2122 loopback, all the sorts of tunnels...
2123
932ff279
HX
2124 Really, it is unlikely that netif_tx_lock protection is necessary
2125 here. (e.g. loopback and IP tunnels are clean, ignoring statistics
1da177e4
LT
2126 counters.)
2127 However, it is possible that they rely on the protection
2128 we provide here.
2129
2130 Check this and take the lock. It is not prone to deadlocks.
2131 Or just shoot the noqueue qdisc; it is even simpler 8)
2132 */
2133 if (dev->flags & IFF_UP) {
2134 int cpu = smp_processor_id(); /* ok because BHs are off */
2135
c773e847 2136 if (txq->xmit_lock_owner != cpu) {
1da177e4 2137
c773e847 2138 HARD_TX_LOCK(dev, txq, cpu);
1da177e4 2139
fd2ea0a7 2140 if (!netif_tx_queue_stopped(txq)) {
572a9d7b
PM
2141 rc = dev_hard_start_xmit(skb, dev, txq);
2142 if (dev_xmit_complete(rc)) {
c773e847 2143 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
2144 goto out;
2145 }
2146 }
c773e847 2147 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
2148 if (net_ratelimit())
2149 printk(KERN_CRIT "Virtual device %s asks to "
2150 "queue packet!\n", dev->name);
2151 } else {
2152 /* Recursion is detected! It is possible,
2153 * unfortunately */
2154 if (net_ratelimit())
2155 printk(KERN_CRIT "Dead loop on virtual device "
2156 "%s, fix it urgently!\n", dev->name);
2157 }
2158 }
2159
2160 rc = -ENETDOWN;
d4828d85 2161 rcu_read_unlock_bh();
1da177e4
LT
2162
2163out_kfree_skb:
2164 kfree_skb(skb);
2165 return rc;
2166out:
d4828d85 2167 rcu_read_unlock_bh();
1da177e4
LT
2168 return rc;
2169}
d1b19dff 2170EXPORT_SYMBOL(dev_queue_xmit);
1da177e4
LT
2171
2172
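/*
 * Example (editor's sketch, not part of the original file): sending a
 * locally built frame.  dev_queue_xmit() consumes the skb on both
 * success and failure, and qdisc codes such as NET_XMIT_DROP are
 * positive, hence the net_xmit_errno() mapping.  foo_send() and the
 * use of TC_PRIO_CONTROL are illustrative only.
 */
#if 0
static int foo_send(struct sk_buff *skb, struct net_device *dev)
{
	int rc;

	skb->dev = dev;
	skb->priority = TC_PRIO_CONTROL;
	rc = dev_queue_xmit(skb);	/* consumes skb, do not touch it after */
	if (rc > 0)
		rc = net_xmit_errno(rc);
	return rc;
}
#endif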
2173/*=======================================================================
2174 Receiver routines
2175 =======================================================================*/
2176
6b2bedc3
SH
2177int netdev_max_backlog __read_mostly = 1000;
2178int netdev_budget __read_mostly = 300;
2179int weight_p __read_mostly = 64; /* old backlog weight */
1da177e4
LT
2180
2181DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
2182
2183
1da177e4
LT
2184/**
2185 * netif_rx - post buffer to the network code
2186 * @skb: buffer to post
2187 *
2188 * This function receives a packet from a device driver and queues it for
2189 * the upper (protocol) levels to process. It always succeeds. The buffer
2190 * may be dropped during processing for congestion control or by the
2191 * protocol layers.
2192 *
2193 * return values:
2194 * NET_RX_SUCCESS (no congestion)
1da177e4
LT
2195 * NET_RX_DROP (packet was dropped)
2196 *
2197 */
2198
2199int netif_rx(struct sk_buff *skb)
2200{
1da177e4
LT
2201 struct softnet_data *queue;
2202 unsigned long flags;
2203
2204 /* if netpoll wants it, pretend we never saw it */
2205 if (netpoll_rx(skb))
2206 return NET_RX_DROP;
2207
b7aa0bf7 2208 if (!skb->tstamp.tv64)
a61bbcf2 2209 net_timestamp(skb);
1da177e4
LT
2210
2211 /*
2212 * The code is rearranged so that the path is shortest
2213 * when the CPU is congested but still operating.
2214 */
2215 local_irq_save(flags);
1da177e4
LT
2216 queue = &__get_cpu_var(softnet_data);
2217
2218 __get_cpu_var(netdev_rx_stat).total++;
2219 if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
2220 if (queue->input_pkt_queue.qlen) {
1da177e4 2221enqueue:
1da177e4 2222 __skb_queue_tail(&queue->input_pkt_queue, skb);
1da177e4 2223 local_irq_restore(flags);
34008d8c 2224 return NET_RX_SUCCESS;
1da177e4
LT
2225 }
2226
bea3348e 2227 napi_schedule(&queue->backlog);
1da177e4
LT
2228 goto enqueue;
2229 }
2230
1da177e4
LT
2231 __get_cpu_var(netdev_rx_stat).dropped++;
2232 local_irq_restore(flags);
2233
2234 kfree_skb(skb);
2235 return NET_RX_DROP;
2236}
d1b19dff 2237EXPORT_SYMBOL(netif_rx);
1da177e4
LT
2238
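/*
 * Example (editor's sketch): the classic non-NAPI receive path in a
 * driver interrupt handler.  The device "foo", foo_rx_irq() and the
 * (data, rx_len) buffer are hypothetical; everything else uses APIs
 * present in this tree.
 */
#if 0
static void foo_rx_irq(struct net_device *dev, const void *data,
		       unsigned int rx_len)
{
	struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, rx_len);

	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}
	skb_copy_to_linear_data(skb, data, rx_len);
	skb_put(skb, rx_len);
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);			/* queues to the per-cpu backlog */
}
#endif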
2239int netif_rx_ni(struct sk_buff *skb)
2240{
2241 int err;
2242
2243 preempt_disable();
2244 err = netif_rx(skb);
2245 if (local_softirq_pending())
2246 do_softirq();
2247 preempt_enable();
2248
2249 return err;
2250}
1da177e4
LT
2251EXPORT_SYMBOL(netif_rx_ni);
2252
1da177e4
LT
2253static void net_tx_action(struct softirq_action *h)
2254{
2255 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2256
2257 if (sd->completion_queue) {
2258 struct sk_buff *clist;
2259
2260 local_irq_disable();
2261 clist = sd->completion_queue;
2262 sd->completion_queue = NULL;
2263 local_irq_enable();
2264
2265 while (clist) {
2266 struct sk_buff *skb = clist;
2267 clist = clist->next;
2268
547b792c 2269 WARN_ON(atomic_read(&skb->users));
1da177e4
LT
2270 __kfree_skb(skb);
2271 }
2272 }
2273
2274 if (sd->output_queue) {
37437bb2 2275 struct Qdisc *head;
1da177e4
LT
2276
2277 local_irq_disable();
2278 head = sd->output_queue;
2279 sd->output_queue = NULL;
2280 local_irq_enable();
2281
2282 while (head) {
37437bb2
DM
2283 struct Qdisc *q = head;
2284 spinlock_t *root_lock;
2285
1da177e4
LT
2286 head = head->next_sched;
2287
5fb66229 2288 root_lock = qdisc_lock(q);
37437bb2 2289 if (spin_trylock(root_lock)) {
def82a1d
JP
2290 smp_mb__before_clear_bit();
2291 clear_bit(__QDISC_STATE_SCHED,
2292 &q->state);
37437bb2
DM
2293 qdisc_run(q);
2294 spin_unlock(root_lock);
1da177e4 2295 } else {
195648bb 2296 if (!test_bit(__QDISC_STATE_DEACTIVATED,
e8a83e10 2297 &q->state)) {
195648bb 2298 __netif_reschedule(q);
e8a83e10
JP
2299 } else {
2300 smp_mb__before_clear_bit();
2301 clear_bit(__QDISC_STATE_SCHED,
2302 &q->state);
2303 }
1da177e4
LT
2304 }
2305 }
2306 }
2307}
2308
6f05f629
SH
2309static inline int deliver_skb(struct sk_buff *skb,
2310 struct packet_type *pt_prev,
2311 struct net_device *orig_dev)
1da177e4
LT
2312{
2313 atomic_inc(&skb->users);
f2ccd8fa 2314 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1da177e4
LT
2315}
2316
2317#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
da678292
MM
2318
2319#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
2320/* This hook is defined here for ATM LANE */
2321int (*br_fdb_test_addr_hook)(struct net_device *dev,
2322 unsigned char *addr) __read_mostly;
4fb019a0 2323EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
da678292 2324#endif
1da177e4 2325
6229e362
SH
2326/*
2327 * If bridge module is loaded call bridging hook.
2328 * returns NULL if packet was consumed.
2329 */
2330struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2331 struct sk_buff *skb) __read_mostly;
4fb019a0 2332EXPORT_SYMBOL_GPL(br_handle_frame_hook);
da678292 2333
6229e362
SH
2334static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2335 struct packet_type **pt_prev, int *ret,
2336 struct net_device *orig_dev)
1da177e4
LT
2337{
2338 struct net_bridge_port *port;
2339
6229e362
SH
2340 if (skb->pkt_type == PACKET_LOOPBACK ||
2341 (port = rcu_dereference(skb->dev->br_port)) == NULL)
2342 return skb;
1da177e4
LT
2343
2344 if (*pt_prev) {
6229e362 2345 *ret = deliver_skb(skb, *pt_prev, orig_dev);
1da177e4 2346 *pt_prev = NULL;
4ec93edb
YH
2347 }
2348
6229e362 2349 return br_handle_frame_hook(port, skb);
1da177e4
LT
2350}
2351#else
6229e362 2352#define handle_bridge(skb, pt_prev, ret, orig_dev) (skb)
1da177e4
LT
2353#endif
2354
b863ceb7
PM
2355#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2356struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
2357EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2358
2359static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2360 struct packet_type **pt_prev,
2361 int *ret,
2362 struct net_device *orig_dev)
2363{
2364 if (skb->dev->macvlan_port == NULL)
2365 return skb;
2366
2367 if (*pt_prev) {
2368 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2369 *pt_prev = NULL;
2370 }
2371 return macvlan_handle_frame_hook(skb);
2372}
2373#else
2374#define handle_macvlan(skb, pt_prev, ret, orig_dev) (skb)
2375#endif
2376
1da177e4
LT
2377#ifdef CONFIG_NET_CLS_ACT
2378/* TODO: Maybe we should just force sch_ingress to be compiled in
2379 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
2380 * instructions (a compare and 2 extra stores) when we don't have it
2381 * on but do have CONFIG_NET_CLS_ACT.
4ec93edb 2382 * NOTE: This doesn't stop any functionality; if you don't have
1da177e4
LT
2383 * the ingress scheduler, you just can't add policies on ingress.
2384 *
2385 */
4ec93edb 2386static int ing_filter(struct sk_buff *skb)
1da177e4 2387{
1da177e4 2388 struct net_device *dev = skb->dev;
f697c3e8 2389 u32 ttl = G_TC_RTTL(skb->tc_verd);
555353cf
DM
2390 struct netdev_queue *rxq;
2391 int result = TC_ACT_OK;
2392 struct Qdisc *q;
4ec93edb 2393
f697c3e8
HX
2394 if (MAX_RED_LOOP < ttl++) {
2395 printk(KERN_WARNING
2396 "Redir loop detected Dropping packet (%d->%d)\n",
8964be4a 2397 skb->skb_iif, dev->ifindex);
f697c3e8
HX
2398 return TC_ACT_SHOT;
2399 }
1da177e4 2400
f697c3e8
HX
2401 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2402 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
1da177e4 2403
555353cf
DM
2404 rxq = &dev->rx_queue;
2405
83874000 2406 q = rxq->qdisc;
8d50b53d 2407 if (q != &noop_qdisc) {
83874000 2408 spin_lock(qdisc_lock(q));
a9312ae8
DM
2409 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2410 result = qdisc_enqueue_root(skb, q);
83874000
DM
2411 spin_unlock(qdisc_lock(q));
2412 }
f697c3e8
HX
2413
2414 return result;
2415}
86e65da9 2416
f697c3e8
HX
2417static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2418 struct packet_type **pt_prev,
2419 int *ret, struct net_device *orig_dev)
2420{
8d50b53d 2421 if (skb->dev->rx_queue.qdisc == &noop_qdisc)
f697c3e8 2422 goto out;
1da177e4 2423
f697c3e8
HX
2424 if (*pt_prev) {
2425 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2426 *pt_prev = NULL;
2427 } else {
2428 /* Huh? Why does turning on AF_PACKET affect this? */
2429 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
1da177e4
LT
2430 }
2431
f697c3e8
HX
2432 switch (ing_filter(skb)) {
2433 case TC_ACT_SHOT:
2434 case TC_ACT_STOLEN:
2435 kfree_skb(skb);
2436 return NULL;
2437 }
2438
2439out:
2440 skb->tc_verd = 0;
2441 return skb;
1da177e4
LT
2442}
2443#endif
2444
bc1d0411
PM
2445/*
2446 * netif_nit_deliver - deliver received packets to network taps
2447 * @skb: buffer
2448 *
2449 * This function is used to deliver incoming packets to network
2450 * taps. It should be used when the normal netif_receive_skb path
2451 * is bypassed, for example because of VLAN acceleration.
2452 */
2453void netif_nit_deliver(struct sk_buff *skb)
2454{
2455 struct packet_type *ptype;
2456
2457 if (list_empty(&ptype_all))
2458 return;
2459
2460 skb_reset_network_header(skb);
2461 skb_reset_transport_header(skb);
2462 skb->mac_len = skb->network_header - skb->mac_header;
2463
2464 rcu_read_lock();
2465 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2466 if (!ptype->dev || ptype->dev == skb->dev)
2467 deliver_skb(skb, ptype, skb->dev);
2468 }
2469 rcu_read_unlock();
2470}
2471
3b582cc1
SH
2472/**
2473 * netif_receive_skb - process receive buffer from network
2474 * @skb: buffer to process
2475 *
2476 * netif_receive_skb() is the main receive data processing function.
2477 * It always succeeds. The buffer may be dropped during processing
2478 * for congestion control or by the protocol layers.
2479 *
2480 * This function may only be called from softirq context and interrupts
2481 * should be enabled.
2482 *
2483 * Return values (usually ignored):
2484 * NET_RX_SUCCESS: no congestion
2485 * NET_RX_DROP: packet was dropped
2486 */
1da177e4
LT
2487int netif_receive_skb(struct sk_buff *skb)
2488{
2489 struct packet_type *ptype, *pt_prev;
f2ccd8fa 2490 struct net_device *orig_dev;
0641e4fb 2491 struct net_device *master;
0d7a3681 2492 struct net_device *null_or_orig;
ca8d9ea3 2493 struct net_device *null_or_bond;
1da177e4 2494 int ret = NET_RX_DROP;
252e3346 2495 __be16 type;
1da177e4 2496
81bbb3d4
ED
2497 if (!skb->tstamp.tv64)
2498 net_timestamp(skb);
2499
05423b24 2500 if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
9b22ea56
PM
2501 return NET_RX_SUCCESS;
2502
1da177e4 2503 /* if we've gotten here through NAPI, check netpoll */
bea3348e 2504 if (netpoll_receive_skb(skb))
1da177e4
LT
2505 return NET_RX_DROP;
2506
8964be4a
ED
2507 if (!skb->skb_iif)
2508 skb->skb_iif = skb->dev->ifindex;
86e65da9 2509
0d7a3681 2510 null_or_orig = NULL;
cc9bd5ce 2511 orig_dev = skb->dev;
0641e4fb
ED
2512 master = ACCESS_ONCE(orig_dev->master);
2513 if (master) {
2514 if (skb_bond_should_drop(skb, master))
0d7a3681
JE
2515 null_or_orig = orig_dev; /* deliver only exact match */
2516 else
0641e4fb 2517 skb->dev = master;
cc9bd5ce 2518 }
8f903c70 2519
1da177e4
LT
2520 __get_cpu_var(netdev_rx_stat).total++;
2521
c1d2bbe1 2522 skb_reset_network_header(skb);
badff6d0 2523 skb_reset_transport_header(skb);
b0e380b1 2524 skb->mac_len = skb->network_header - skb->mac_header;
1da177e4
LT
2525
2526 pt_prev = NULL;
2527
2528 rcu_read_lock();
2529
2530#ifdef CONFIG_NET_CLS_ACT
2531 if (skb->tc_verd & TC_NCLS) {
2532 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2533 goto ncls;
2534 }
2535#endif
2536
2537 list_for_each_entry_rcu(ptype, &ptype_all, list) {
f982307f
JE
2538 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2539 ptype->dev == orig_dev) {
4ec93edb 2540 if (pt_prev)
f2ccd8fa 2541 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
2542 pt_prev = ptype;
2543 }
2544 }
2545
2546#ifdef CONFIG_NET_CLS_ACT
f697c3e8
HX
2547 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2548 if (!skb)
1da177e4 2549 goto out;
1da177e4
LT
2550ncls:
2551#endif
2552
6229e362 2553 skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
b863ceb7
PM
2554 if (!skb)
2555 goto out;
2556 skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
6229e362 2557 if (!skb)
1da177e4
LT
2558 goto out;
2559
1f3c8804
AG
2560 /*
2561 * Make sure frames received on VLAN interfaces stacked on
2562 * bonding interfaces still make their way to any base bonding
2563 * device that may have registered for a specific ptype. The
2564 * handler may have to adjust skb->dev and orig_dev.
1f3c8804 2565 */
ca8d9ea3 2566 null_or_bond = NULL;
1f3c8804
AG
2567 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
2568 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
ca8d9ea3 2569 null_or_bond = vlan_dev_real_dev(skb->dev);
1f3c8804
AG
2570 }
2571
1da177e4 2572 type = skb->protocol;
82d8a867
PE
2573 list_for_each_entry_rcu(ptype,
2574 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1f3c8804 2575 if (ptype->type == type && (ptype->dev == null_or_orig ||
ca8d9ea3
AG
2576 ptype->dev == skb->dev || ptype->dev == orig_dev ||
2577 ptype->dev == null_or_bond)) {
4ec93edb 2578 if (pt_prev)
f2ccd8fa 2579 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
2580 pt_prev = ptype;
2581 }
2582 }
2583
2584 if (pt_prev) {
f2ccd8fa 2585 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1da177e4
LT
2586 } else {
2587 kfree_skb(skb);
2588 /* Jamal, now you will not be able to escape explaining
2589 * to me how you were going to use this. :-)
2590 */
2591 ret = NET_RX_DROP;
2592 }
2593
2594out:
2595 rcu_read_unlock();
2596 return ret;
2597}
d1b19dff 2598EXPORT_SYMBOL(netif_receive_skb);
1da177e4 2599
6e583ce5
SH
2600/* Network device is going away, flush any packets still pending */
2601static void flush_backlog(void *arg)
2602{
2603 struct net_device *dev = arg;
2604 struct softnet_data *queue = &__get_cpu_var(softnet_data);
2605 struct sk_buff *skb, *tmp;
2606
2607 skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2608 if (skb->dev == dev) {
2609 __skb_unlink(skb, &queue->input_pkt_queue);
2610 kfree_skb(skb);
2611 }
2612}
2613
d565b0a1
HX
2614static int napi_gro_complete(struct sk_buff *skb)
2615{
2616 struct packet_type *ptype;
2617 __be16 type = skb->protocol;
2618 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2619 int err = -ENOENT;
2620
fc59f9a3
HX
2621 if (NAPI_GRO_CB(skb)->count == 1) {
2622 skb_shinfo(skb)->gso_size = 0;
d565b0a1 2623 goto out;
fc59f9a3 2624 }
d565b0a1
HX
2625
2626 rcu_read_lock();
2627 list_for_each_entry_rcu(ptype, head, list) {
2628 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2629 continue;
2630
2631 err = ptype->gro_complete(skb);
2632 break;
2633 }
2634 rcu_read_unlock();
2635
2636 if (err) {
2637 WARN_ON(&ptype->list == head);
2638 kfree_skb(skb);
2639 return NET_RX_SUCCESS;
2640 }
2641
2642out:
d565b0a1
HX
2643 return netif_receive_skb(skb);
2644}
2645
11380a4b 2646static void napi_gro_flush(struct napi_struct *napi)
d565b0a1
HX
2647{
2648 struct sk_buff *skb, *next;
2649
2650 for (skb = napi->gro_list; skb; skb = next) {
2651 next = skb->next;
2652 skb->next = NULL;
2653 napi_gro_complete(skb);
2654 }
2655
4ae5544f 2656 napi->gro_count = 0;
d565b0a1
HX
2657 napi->gro_list = NULL;
2658}
d565b0a1 2659
5b252f0c 2660enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
d565b0a1
HX
2661{
2662 struct sk_buff **pp = NULL;
2663 struct packet_type *ptype;
2664 __be16 type = skb->protocol;
2665 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
0da2afd5 2666 int same_flow;
d565b0a1 2667 int mac_len;
5b252f0c 2668 enum gro_result ret;
d565b0a1
HX
2669
2670 if (!(skb->dev->features & NETIF_F_GRO))
2671 goto normal;
2672
4cf704fb 2673 if (skb_is_gso(skb) || skb_has_frags(skb))
f17f5c91
HX
2674 goto normal;
2675
d565b0a1
HX
2676 rcu_read_lock();
2677 list_for_each_entry_rcu(ptype, head, list) {
d565b0a1
HX
2678 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2679 continue;
2680
86911732 2681 skb_set_network_header(skb, skb_gro_offset(skb));
d565b0a1
HX
2682 mac_len = skb->network_header - skb->mac_header;
2683 skb->mac_len = mac_len;
2684 NAPI_GRO_CB(skb)->same_flow = 0;
2685 NAPI_GRO_CB(skb)->flush = 0;
5d38a079 2686 NAPI_GRO_CB(skb)->free = 0;
d565b0a1 2687
d565b0a1
HX
2688 pp = ptype->gro_receive(&napi->gro_list, skb);
2689 break;
2690 }
2691 rcu_read_unlock();
2692
2693 if (&ptype->list == head)
2694 goto normal;
2695
0da2afd5 2696 same_flow = NAPI_GRO_CB(skb)->same_flow;
5d0d9be8 2697 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
0da2afd5 2698
d565b0a1
HX
2699 if (pp) {
2700 struct sk_buff *nskb = *pp;
2701
2702 *pp = nskb->next;
2703 nskb->next = NULL;
2704 napi_gro_complete(nskb);
4ae5544f 2705 napi->gro_count--;
d565b0a1
HX
2706 }
2707
0da2afd5 2708 if (same_flow)
d565b0a1
HX
2709 goto ok;
2710
4ae5544f 2711 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
d565b0a1 2712 goto normal;
d565b0a1 2713
4ae5544f 2714 napi->gro_count++;
d565b0a1 2715 NAPI_GRO_CB(skb)->count = 1;
86911732 2716 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
d565b0a1
HX
2717 skb->next = napi->gro_list;
2718 napi->gro_list = skb;
5d0d9be8 2719 ret = GRO_HELD;
d565b0a1 2720
ad0f9904 2721pull:
cb18978c
HX
2722 if (skb_headlen(skb) < skb_gro_offset(skb)) {
2723 int grow = skb_gro_offset(skb) - skb_headlen(skb);
2724
2725 BUG_ON(skb->end - skb->tail < grow);
2726
2727 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
2728
2729 skb->tail += grow;
2730 skb->data_len -= grow;
2731
2732 skb_shinfo(skb)->frags[0].page_offset += grow;
2733 skb_shinfo(skb)->frags[0].size -= grow;
2734
2735 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
2736 put_page(skb_shinfo(skb)->frags[0].page);
2737 memmove(skb_shinfo(skb)->frags,
2738 skb_shinfo(skb)->frags + 1,
2739 --skb_shinfo(skb)->nr_frags);
2740 }
ad0f9904
HX
2741 }
2742
d565b0a1 2743ok:
5d0d9be8 2744 return ret;
d565b0a1
HX
2745
2746normal:
ad0f9904
HX
2747 ret = GRO_NORMAL;
2748 goto pull;
5d38a079 2749}
96e93eab
HX
2750EXPORT_SYMBOL(dev_gro_receive);
2751
5b252f0c
BH
2752static gro_result_t
2753__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
96e93eab
HX
2754{
2755 struct sk_buff *p;
2756
d1c76af9
HX
2757 if (netpoll_rx_on(skb))
2758 return GRO_NORMAL;
2759
96e93eab 2760 for (p = napi->gro_list; p; p = p->next) {
f64f9e71
JP
2761 NAPI_GRO_CB(p)->same_flow =
2762 (p->dev == skb->dev) &&
2763 !compare_ether_header(skb_mac_header(p),
2764 skb_gro_mac_header(skb));
96e93eab
HX
2765 NAPI_GRO_CB(p)->flush = 0;
2766 }
2767
2768 return dev_gro_receive(napi, skb);
2769}
5d38a079 2770
c7c4b3b6 2771gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5d38a079 2772{
5d0d9be8
HX
2773 switch (ret) {
2774 case GRO_NORMAL:
c7c4b3b6
BH
2775 if (netif_receive_skb(skb))
2776 ret = GRO_DROP;
2777 break;
5d38a079 2778
5d0d9be8 2779 case GRO_DROP:
5d0d9be8 2780 case GRO_MERGED_FREE:
5d38a079
HX
2781 kfree_skb(skb);
2782 break;
5b252f0c
BH
2783
2784 case GRO_HELD:
2785 case GRO_MERGED:
2786 break;
5d38a079
HX
2787 }
2788
c7c4b3b6 2789 return ret;
5d0d9be8
HX
2790}
2791EXPORT_SYMBOL(napi_skb_finish);
2792
78a478d0
HX
2793void skb_gro_reset_offset(struct sk_buff *skb)
2794{
2795 NAPI_GRO_CB(skb)->data_offset = 0;
2796 NAPI_GRO_CB(skb)->frag0 = NULL;
7489594c 2797 NAPI_GRO_CB(skb)->frag0_len = 0;
78a478d0 2798
78d3fd0b 2799 if (skb->mac_header == skb->tail &&
7489594c 2800 !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
78a478d0
HX
2801 NAPI_GRO_CB(skb)->frag0 =
2802 page_address(skb_shinfo(skb)->frags[0].page) +
2803 skb_shinfo(skb)->frags[0].page_offset;
7489594c
HX
2804 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
2805 }
78a478d0
HX
2806}
2807EXPORT_SYMBOL(skb_gro_reset_offset);
2808
c7c4b3b6 2809gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5d0d9be8 2810{
86911732
HX
2811 skb_gro_reset_offset(skb);
2812
5d0d9be8 2813 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
d565b0a1
HX
2814}
2815EXPORT_SYMBOL(napi_gro_receive);
2816
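/*
 * Example (editor's sketch): a NAPI poll handler feeding frames
 * through GRO.  foo_fetch_skb() is a hypothetical helper that pulls
 * the next completed RX frame off the device ring, or returns NULL.
 */
#if 0
static int foo_poll(struct napi_struct *napi, int budget)
{
	int work = 0;
	struct sk_buff *skb;

	while (work < budget && (skb = foo_fetch_skb(napi)) != NULL) {
		napi_gro_receive(napi, skb);
		work++;
	}
	if (work < budget)
		napi_complete(napi);	/* flushes held GRO packets too */
	return work;
}
#endif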
96e93eab
HX
2817void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2818{
96e93eab
HX
2819 __skb_pull(skb, skb_headlen(skb));
2820 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2821
2822 napi->skb = skb;
2823}
2824EXPORT_SYMBOL(napi_reuse_skb);
2825
76620aaf 2826struct sk_buff *napi_get_frags(struct napi_struct *napi)
5d38a079 2827{
5d38a079 2828 struct sk_buff *skb = napi->skb;
5d38a079
HX
2829
2830 if (!skb) {
89d71a66
ED
2831 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
2832 if (skb)
2833 napi->skb = skb;
80595d59 2834 }
96e93eab
HX
2835 return skb;
2836}
76620aaf 2837EXPORT_SYMBOL(napi_get_frags);
96e93eab 2838
c7c4b3b6
BH
2839gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
2840 gro_result_t ret)
96e93eab 2841{
5d0d9be8
HX
2842 switch (ret) {
2843 case GRO_NORMAL:
86911732 2844 case GRO_HELD:
e76b69cc 2845 skb->protocol = eth_type_trans(skb, skb->dev);
86911732 2846
c7c4b3b6
BH
2847 if (ret == GRO_HELD)
2848 skb_gro_pull(skb, -ETH_HLEN);
2849 else if (netif_receive_skb(skb))
2850 ret = GRO_DROP;
86911732 2851 break;
5d38a079 2852
5d0d9be8 2853 case GRO_DROP:
5d0d9be8
HX
2854 case GRO_MERGED_FREE:
2855 napi_reuse_skb(napi, skb);
2856 break;
5b252f0c
BH
2857
2858 case GRO_MERGED:
2859 break;
5d0d9be8 2860 }
5d38a079 2861
c7c4b3b6 2862 return ret;
5d38a079 2863}
5d0d9be8
HX
2864EXPORT_SYMBOL(napi_frags_finish);
2865
76620aaf
HX
2866struct sk_buff *napi_frags_skb(struct napi_struct *napi)
2867{
2868 struct sk_buff *skb = napi->skb;
2869 struct ethhdr *eth;
a5b1cf28
HX
2870 unsigned int hlen;
2871 unsigned int off;
76620aaf
HX
2872
2873 napi->skb = NULL;
2874
2875 skb_reset_mac_header(skb);
2876 skb_gro_reset_offset(skb);
2877
a5b1cf28
HX
2878 off = skb_gro_offset(skb);
2879 hlen = off + sizeof(*eth);
2880 eth = skb_gro_header_fast(skb, off);
2881 if (skb_gro_header_hard(skb, hlen)) {
2882 eth = skb_gro_header_slow(skb, hlen, off);
2883 if (unlikely(!eth)) {
2884 napi_reuse_skb(napi, skb);
2885 skb = NULL;
2886 goto out;
2887 }
76620aaf
HX
2888 }
2889
2890 skb_gro_pull(skb, sizeof(*eth));
2891
2892 /*
2893 * This works because the only protocols we care about don't require
2894 * special handling. We'll fix it up properly at the end.
2895 */
2896 skb->protocol = eth->h_proto;
2897
2898out:
2899 return skb;
2900}
2901EXPORT_SYMBOL(napi_frags_skb);
2902
c7c4b3b6 2903gro_result_t napi_gro_frags(struct napi_struct *napi)
5d0d9be8 2904{
76620aaf 2905 struct sk_buff *skb = napi_frags_skb(napi);
5d0d9be8
HX
2906
2907 if (!skb)
c7c4b3b6 2908 return GRO_DROP;
5d0d9be8
HX
2909
2910 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
2911}
5d38a079
HX
2912EXPORT_SYMBOL(napi_gro_frags);
2913
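/*
 * Example (editor's sketch): page-based receive through GRO, for
 * drivers that DMA incoming data straight into page fragments.
 * foo_attach_pages() is hypothetical; it would append the RX pages to
 * skb_shinfo(skb)->frags and update skb->len, data_len and truesize.
 */
#if 0
static void foo_rx_frags(struct napi_struct *napi, struct foo_rx_desc *desc)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (!skb)
		return;			/* allocation failed, frame dropped */

	foo_attach_pages(skb, desc);	/* hypothetical */
	napi_gro_frags(napi);		/* consumes or recycles the skb */
}
#endif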
bea3348e 2914static int process_backlog(struct napi_struct *napi, int quota)
1da177e4
LT
2915{
2916 int work = 0;
1da177e4
LT
2917 struct softnet_data *queue = &__get_cpu_var(softnet_data);
2918 unsigned long start_time = jiffies;
2919
bea3348e
SH
2920 napi->weight = weight_p;
2921 do {
1da177e4 2922 struct sk_buff *skb;
1da177e4
LT
2923
2924 local_irq_disable();
2925 skb = __skb_dequeue(&queue->input_pkt_queue);
bea3348e 2926 if (!skb) {
8f1ead2d 2927 __napi_complete(napi);
bea3348e 2928 local_irq_enable();
8f1ead2d 2929 break;
bea3348e 2930 }
1da177e4
LT
2931 local_irq_enable();
2932
8f1ead2d 2933 netif_receive_skb(skb);
bea3348e 2934 } while (++work < quota && jiffies == start_time);
1da177e4 2935
bea3348e
SH
2936 return work;
2937}
1da177e4 2938
bea3348e
SH
2939/**
2940 * __napi_schedule - schedule for receive
c4ea43c5 2941 * @n: entry to schedule
bea3348e
SH
2942 *
2943 * The entry's receive function will be scheduled to run
2944 */
b5606c2d 2945void __napi_schedule(struct napi_struct *n)
bea3348e
SH
2946{
2947 unsigned long flags;
1da177e4 2948
bea3348e
SH
2949 local_irq_save(flags);
2950 list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2951 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2952 local_irq_restore(flags);
1da177e4 2953}
bea3348e
SH
2954EXPORT_SYMBOL(__napi_schedule);
2955
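/*
 * Example (editor's sketch): the usual interrupt-side counterpart.
 * napi_schedule_prep() wins the NAPI_STATE_SCHED bit, after which the
 * driver masks its RX interrupt and queues the poll via
 * __napi_schedule().  foo_priv and foo_disable_rx_irq() are
 * hypothetical.
 */
#if 0
static irqreturn_t foo_isr(int irq, void *dev_id)
{
	struct foo_priv *priv = dev_id;

	if (napi_schedule_prep(&priv->napi)) {
		foo_disable_rx_irq(priv);	/* hypothetical */
		__napi_schedule(&priv->napi);
	}
	return IRQ_HANDLED;
}
#endif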
d565b0a1
HX
2956void __napi_complete(struct napi_struct *n)
2957{
2958 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
2959 BUG_ON(n->gro_list);
2960
2961 list_del(&n->poll_list);
2962 smp_mb__before_clear_bit();
2963 clear_bit(NAPI_STATE_SCHED, &n->state);
2964}
2965EXPORT_SYMBOL(__napi_complete);
2966
2967void napi_complete(struct napi_struct *n)
2968{
2969 unsigned long flags;
2970
2971 /*
2972 * don't let napi dequeue from the cpu poll list
2973 * just in case it's running on a different CPU
2974 */
2975 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2976 return;
2977
2978 napi_gro_flush(n);
2979 local_irq_save(flags);
2980 __napi_complete(n);
2981 local_irq_restore(flags);
2982}
2983EXPORT_SYMBOL(napi_complete);
2984
2985void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2986 int (*poll)(struct napi_struct *, int), int weight)
2987{
2988 INIT_LIST_HEAD(&napi->poll_list);
4ae5544f 2989 napi->gro_count = 0;
d565b0a1 2990 napi->gro_list = NULL;
5d38a079 2991 napi->skb = NULL;
d565b0a1
HX
2992 napi->poll = poll;
2993 napi->weight = weight;
2994 list_add(&napi->dev_list, &dev->napi_list);
d565b0a1 2995 napi->dev = dev;
5d38a079 2996#ifdef CONFIG_NETPOLL
d565b0a1
HX
2997 spin_lock_init(&napi->poll_lock);
2998 napi->poll_owner = -1;
2999#endif
3000 set_bit(NAPI_STATE_SCHED, &napi->state);
3001}
3002EXPORT_SYMBOL(netif_napi_add);
3003
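/*
 * Example (editor's sketch): typical registration at probe/open time.
 * The foo_priv structure and foo_poll() (sketched near
 * napi_gro_receive above) are hypothetical; a weight of 64 matches
 * weight_p, the default backlog weight.
 */
#if 0
static void foo_setup_napi(struct net_device *dev, struct foo_priv *priv)
{
	netif_napi_add(dev, &priv->napi, foo_poll, 64);
	napi_enable(&priv->napi);	/* allow scheduling from the ISR */
}
#endif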
3004void netif_napi_del(struct napi_struct *napi)
3005{
3006 struct sk_buff *skb, *next;
3007
d7b06636 3008 list_del_init(&napi->dev_list);
76620aaf 3009 napi_free_frags(napi);
d565b0a1
HX
3010
3011 for (skb = napi->gro_list; skb; skb = next) {
3012 next = skb->next;
3013 skb->next = NULL;
3014 kfree_skb(skb);
3015 }
3016
3017 napi->gro_list = NULL;
4ae5544f 3018 napi->gro_count = 0;
d565b0a1
HX
3019}
3020EXPORT_SYMBOL(netif_napi_del);
3021
1da177e4
LT
3022
3023static void net_rx_action(struct softirq_action *h)
3024{
bea3348e 3025 struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
24f8b238 3026 unsigned long time_limit = jiffies + 2;
51b0bded 3027 int budget = netdev_budget;
53fb95d3
MM
3028 void *have;
3029
1da177e4
LT
3030 local_irq_disable();
3031
bea3348e
SH
3032 while (!list_empty(list)) {
3033 struct napi_struct *n;
3034 int work, weight;
1da177e4 3035
bea3348e 3036 /* If the softirq window is exhausted then punt.
24f8b238
SH
3037 * Allow this to run for 2 jiffies, which allows
3038 * an average latency of 1.5/HZ.
bea3348e 3039 */
24f8b238 3040 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
1da177e4
LT
3041 goto softnet_break;
3042
3043 local_irq_enable();
3044
bea3348e
SH
3045 /* Even though interrupts have been re-enabled, this
3046 * access is safe because interrupts can only add new
3047 * entries to the tail of this list, and only ->poll()
3048 * calls can remove this head entry from the list.
3049 */
e5e26d75 3050 n = list_first_entry(list, struct napi_struct, poll_list);
1da177e4 3051
bea3348e
SH
3052 have = netpoll_poll_lock(n);
3053
3054 weight = n->weight;
3055
0a7606c1
DM
3056 /* This NAPI_STATE_SCHED test is for avoiding a race
3057 * with netpoll's poll_napi(). Only the entity which
3058 * obtains the lock and sees NAPI_STATE_SCHED set will
3059 * actually make the ->poll() call. Therefore we avoid
3060 * accidentally calling ->poll() when NAPI is not scheduled.
3061 */
3062 work = 0;
4ea7e386 3063 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
0a7606c1 3064 work = n->poll(n, weight);
4ea7e386
NH
3065 trace_napi_poll(n);
3066 }
bea3348e
SH
3067
3068 WARN_ON_ONCE(work > weight);
3069
3070 budget -= work;
3071
3072 local_irq_disable();
3073
3074 /* Drivers must not modify the NAPI state if they
3075 * consume the entire weight. In such cases this code
3076 * still "owns" the NAPI instance and therefore can
3077 * move the instance around on the list at will.
3078 */
fed17f30 3079 if (unlikely(work == weight)) {
ff780cd8
HX
3080 if (unlikely(napi_disable_pending(n))) {
3081 local_irq_enable();
3082 napi_complete(n);
3083 local_irq_disable();
3084 } else
fed17f30
DM
3085 list_move_tail(&n->poll_list, list);
3086 }
bea3348e
SH
3087
3088 netpoll_poll_unlock(have);
1da177e4
LT
3089 }
3090out:
515e06c4 3091 local_irq_enable();
bea3348e 3092
db217334
CL
3093#ifdef CONFIG_NET_DMA
3094 /*
3095 * There may not be any more sk_buffs coming right now, so push
3096 * any pending DMA copies to hardware
3097 */
2ba05622 3098 dma_issue_pending_all();
db217334 3099#endif
bea3348e 3100
1da177e4
LT
3101 return;
3102
3103softnet_break:
3104 __get_cpu_var(netdev_rx_stat).time_squeeze++;
3105 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3106 goto out;
3107}
3108
d1b19dff 3109static gifconf_func_t *gifconf_list[NPROTO];
1da177e4
LT
3110
3111/**
3112 * register_gifconf - register a SIOCGIF handler
3113 * @family: Address family
3114 * @gifconf: Function handler
3115 *
3116 * Register protocol dependent address dumping routines. The handler
3117 * that is passed must not be freed or reused until it has been replaced
3118 * by another handler.
3119 */
d1b19dff 3120int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
1da177e4
LT
3121{
3122 if (family >= NPROTO)
3123 return -EINVAL;
3124 gifconf_list[family] = gifconf;
3125 return 0;
3126}
d1b19dff 3127EXPORT_SYMBOL(register_gifconf);
1da177e4
LT
3128
3129
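/*
 * Example (editor's sketch): how an address family hooks in its
 * SIOCGIFCONF dumper; IPv4 registers inet_gifconf() roughly like this
 * from net/ipv4/devinet.c.
 */
#if 0
register_gifconf(PF_INET, inet_gifconf);
#endif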
3130/*
3131 * Map an interface index to its name (SIOCGIFNAME)
3132 */
3133
3134/*
3135 * We need this ioctl for efficient implementation of the
3136 * if_indextoname() function required by the IPv6 API. Without
3137 * it, we would have to search all the interfaces to find a
3138 * match. --pb
3139 */
3140
881d966b 3141static int dev_ifname(struct net *net, struct ifreq __user *arg)
1da177e4
LT
3142{
3143 struct net_device *dev;
3144 struct ifreq ifr;
3145
3146 /*
3147 * Fetch the caller's info block.
3148 */
3149
3150 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3151 return -EFAULT;
3152
fb699dfd
ED
3153 rcu_read_lock();
3154 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
1da177e4 3155 if (!dev) {
fb699dfd 3156 rcu_read_unlock();
1da177e4
LT
3157 return -ENODEV;
3158 }
3159
3160 strcpy(ifr.ifr_name, dev->name);
fb699dfd 3161 rcu_read_unlock();
1da177e4
LT
3162
3163 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3164 return -EFAULT;
3165 return 0;
3166}
3167
3168/*
3169 * Perform a SIOCGIFCONF call. This structure will change
3170 * size eventually, and there is nothing I can do about it.
3171 * Thus we will need a 'compatibility mode'.
3172 */
3173
881d966b 3174static int dev_ifconf(struct net *net, char __user *arg)
1da177e4
LT
3175{
3176 struct ifconf ifc;
3177 struct net_device *dev;
3178 char __user *pos;
3179 int len;
3180 int total;
3181 int i;
3182
3183 /*
3184 * Fetch the caller's info block.
3185 */
3186
3187 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3188 return -EFAULT;
3189
3190 pos = ifc.ifc_buf;
3191 len = ifc.ifc_len;
3192
3193 /*
3194 * Loop over the interfaces, and write an info block for each.
3195 */
3196
3197 total = 0;
881d966b 3198 for_each_netdev(net, dev) {
1da177e4
LT
3199 for (i = 0; i < NPROTO; i++) {
3200 if (gifconf_list[i]) {
3201 int done;
3202 if (!pos)
3203 done = gifconf_list[i](dev, NULL, 0);
3204 else
3205 done = gifconf_list[i](dev, pos + total,
3206 len - total);
3207 if (done < 0)
3208 return -EFAULT;
3209 total += done;
3210 }
3211 }
4ec93edb 3212 }
1da177e4
LT
3213
3214 /*
3215 * All done. Write the updated control block back to the caller.
3216 */
3217 ifc.ifc_len = total;
3218
3219 /*
3220 * Both BSD and Solaris return 0 here, so we do too.
3221 */
3222 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3223}
3224
3225#ifdef CONFIG_PROC_FS
3226/*
3227 * This is invoked by the /proc filesystem handler to display a device
3228 * in detail.
3229 */
7562f876 3230void *dev_seq_start(struct seq_file *seq, loff_t *pos)
c6d14c84 3231 __acquires(RCU)
1da177e4 3232{
e372c414 3233 struct net *net = seq_file_net(seq);
7562f876 3234 loff_t off;
1da177e4 3235 struct net_device *dev;
1da177e4 3236
c6d14c84 3237 rcu_read_lock();
7562f876
PE
3238 if (!*pos)
3239 return SEQ_START_TOKEN;
1da177e4 3240
7562f876 3241 off = 1;
c6d14c84 3242 for_each_netdev_rcu(net, dev)
7562f876
PE
3243 if (off++ == *pos)
3244 return dev;
1da177e4 3245
7562f876 3246 return NULL;
1da177e4
LT
3247}
3248
3249void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3250{
c6d14c84
ED
3251 struct net_device *dev = (v == SEQ_START_TOKEN) ?
3252 first_net_device(seq_file_net(seq)) :
3253 next_net_device((struct net_device *)v);
3254
1da177e4 3255 ++*pos;
c6d14c84 3256 return rcu_dereference(dev);
1da177e4
LT
3257}
3258
3259void dev_seq_stop(struct seq_file *seq, void *v)
c6d14c84 3260 __releases(RCU)
1da177e4 3261{
c6d14c84 3262 rcu_read_unlock();
1da177e4
LT
3263}
3264
3265static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3266{
eeda3fd6 3267 const struct net_device_stats *stats = dev_get_stats(dev);
1da177e4 3268
2d13bafe 3269 seq_printf(seq, "%6s: %7lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
5a1b5898
RR
3270 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3271 dev->name, stats->rx_bytes, stats->rx_packets,
3272 stats->rx_errors,
3273 stats->rx_dropped + stats->rx_missed_errors,
3274 stats->rx_fifo_errors,
3275 stats->rx_length_errors + stats->rx_over_errors +
3276 stats->rx_crc_errors + stats->rx_frame_errors,
3277 stats->rx_compressed, stats->multicast,
3278 stats->tx_bytes, stats->tx_packets,
3279 stats->tx_errors, stats->tx_dropped,
3280 stats->tx_fifo_errors, stats->collisions,
3281 stats->tx_carrier_errors +
3282 stats->tx_aborted_errors +
3283 stats->tx_window_errors +
3284 stats->tx_heartbeat_errors,
3285 stats->tx_compressed);
1da177e4
LT
3286}
3287
3288/*
3289 * Called from the PROCfs module. This now uses the new arbitrary sized
3290 * /proc/net interface to create /proc/net/dev
3291 */
3292static int dev_seq_show(struct seq_file *seq, void *v)
3293{
3294 if (v == SEQ_START_TOKEN)
3295 seq_puts(seq, "Inter-| Receive "
3296 " | Transmit\n"
3297 " face |bytes packets errs drop fifo frame "
3298 "compressed multicast|bytes packets errs "
3299 "drop fifo colls carrier compressed\n");
3300 else
3301 dev_seq_printf_stats(seq, v);
3302 return 0;
3303}
3304
3305static struct netif_rx_stats *softnet_get_online(loff_t *pos)
3306{
3307 struct netif_rx_stats *rc = NULL;
3308
0c0b0aca 3309 while (*pos < nr_cpu_ids)
4ec93edb 3310 if (cpu_online(*pos)) {
1da177e4
LT
3311 rc = &per_cpu(netdev_rx_stat, *pos);
3312 break;
3313 } else
3314 ++*pos;
3315 return rc;
3316}
3317
3318static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3319{
3320 return softnet_get_online(pos);
3321}
3322
3323static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3324{
3325 ++*pos;
3326 return softnet_get_online(pos);
3327}
3328
3329static void softnet_seq_stop(struct seq_file *seq, void *v)
3330{
3331}
3332
3333static int softnet_seq_show(struct seq_file *seq, void *v)
3334{
3335 struct netif_rx_stats *s = v;
3336
3337 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
31aa02c5 3338 s->total, s->dropped, s->time_squeeze, 0,
c1ebcdb8 3339 0, 0, 0, 0, /* was fastroute */
d1b19dff 3340 s->cpu_collision);
1da177e4
LT
3341 return 0;
3342}
3343
f690808e 3344static const struct seq_operations dev_seq_ops = {
1da177e4
LT
3345 .start = dev_seq_start,
3346 .next = dev_seq_next,
3347 .stop = dev_seq_stop,
3348 .show = dev_seq_show,
3349};
3350
3351static int dev_seq_open(struct inode *inode, struct file *file)
3352{
e372c414
DL
3353 return seq_open_net(inode, file, &dev_seq_ops,
3354 sizeof(struct seq_net_private));
1da177e4
LT
3355}
3356
9a32144e 3357static const struct file_operations dev_seq_fops = {
1da177e4
LT
3358 .owner = THIS_MODULE,
3359 .open = dev_seq_open,
3360 .read = seq_read,
3361 .llseek = seq_lseek,
e372c414 3362 .release = seq_release_net,
1da177e4
LT
3363};
3364
f690808e 3365static const struct seq_operations softnet_seq_ops = {
1da177e4
LT
3366 .start = softnet_seq_start,
3367 .next = softnet_seq_next,
3368 .stop = softnet_seq_stop,
3369 .show = softnet_seq_show,
3370};
3371
3372static int softnet_seq_open(struct inode *inode, struct file *file)
3373{
3374 return seq_open(file, &softnet_seq_ops);
3375}
3376
9a32144e 3377static const struct file_operations softnet_seq_fops = {
1da177e4
LT
3378 .owner = THIS_MODULE,
3379 .open = softnet_seq_open,
3380 .read = seq_read,
3381 .llseek = seq_lseek,
3382 .release = seq_release,
3383};
3384
0e1256ff
SH
3385static void *ptype_get_idx(loff_t pos)
3386{
3387 struct packet_type *pt = NULL;
3388 loff_t i = 0;
3389 int t;
3390
3391 list_for_each_entry_rcu(pt, &ptype_all, list) {
3392 if (i == pos)
3393 return pt;
3394 ++i;
3395 }
3396
82d8a867 3397 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
0e1256ff
SH
3398 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3399 if (i == pos)
3400 return pt;
3401 ++i;
3402 }
3403 }
3404 return NULL;
3405}
3406
3407static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
72348a42 3408 __acquires(RCU)
0e1256ff
SH
3409{
3410 rcu_read_lock();
3411 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3412}
3413
3414static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3415{
3416 struct packet_type *pt;
3417 struct list_head *nxt;
3418 int hash;
3419
3420 ++*pos;
3421 if (v == SEQ_START_TOKEN)
3422 return ptype_get_idx(0);
3423
3424 pt = v;
3425 nxt = pt->list.next;
3426 if (pt->type == htons(ETH_P_ALL)) {
3427 if (nxt != &ptype_all)
3428 goto found;
3429 hash = 0;
3430 nxt = ptype_base[0].next;
3431 } else
82d8a867 3432 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
0e1256ff
SH
3433
3434 while (nxt == &ptype_base[hash]) {
82d8a867 3435 if (++hash >= PTYPE_HASH_SIZE)
0e1256ff
SH
3436 return NULL;
3437 nxt = ptype_base[hash].next;
3438 }
3439found:
3440 return list_entry(nxt, struct packet_type, list);
3441}
3442
3443static void ptype_seq_stop(struct seq_file *seq, void *v)
72348a42 3444 __releases(RCU)
0e1256ff
SH
3445{
3446 rcu_read_unlock();
3447}
3448
0e1256ff
SH
3449static int ptype_seq_show(struct seq_file *seq, void *v)
3450{
3451 struct packet_type *pt = v;
3452
3453 if (v == SEQ_START_TOKEN)
3454 seq_puts(seq, "Type Device Function\n");
c346dca1 3455 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
0e1256ff
SH
3456 if (pt->type == htons(ETH_P_ALL))
3457 seq_puts(seq, "ALL ");
3458 else
3459 seq_printf(seq, "%04x", ntohs(pt->type));
3460
908cd2da
AD
3461 seq_printf(seq, " %-8s %pF\n",
3462 pt->dev ? pt->dev->name : "", pt->func);
0e1256ff
SH
3463 }
3464
3465 return 0;
3466}
3467
3468static const struct seq_operations ptype_seq_ops = {
3469 .start = ptype_seq_start,
3470 .next = ptype_seq_next,
3471 .stop = ptype_seq_stop,
3472 .show = ptype_seq_show,
3473};
3474
3475static int ptype_seq_open(struct inode *inode, struct file *file)
3476{
2feb27db
PE
3477 return seq_open_net(inode, file, &ptype_seq_ops,
3478 sizeof(struct seq_net_private));
0e1256ff
SH
3479}
3480
3481static const struct file_operations ptype_seq_fops = {
3482 .owner = THIS_MODULE,
3483 .open = ptype_seq_open,
3484 .read = seq_read,
3485 .llseek = seq_lseek,
2feb27db 3486 .release = seq_release_net,
0e1256ff
SH
3487};
3488
3489
4665079c 3490static int __net_init dev_proc_net_init(struct net *net)
1da177e4
LT
3491{
3492 int rc = -ENOMEM;
3493
881d966b 3494 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
1da177e4 3495 goto out;
881d966b 3496 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
1da177e4 3497 goto out_dev;
881d966b 3498 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
457c4cbc 3499 goto out_softnet;
0e1256ff 3500
881d966b 3501 if (wext_proc_init(net))
457c4cbc 3502 goto out_ptype;
1da177e4
LT
3503 rc = 0;
3504out:
3505 return rc;
457c4cbc 3506out_ptype:
881d966b 3507 proc_net_remove(net, "ptype");
1da177e4 3508out_softnet:
881d966b 3509 proc_net_remove(net, "softnet_stat");
1da177e4 3510out_dev:
881d966b 3511 proc_net_remove(net, "dev");
1da177e4
LT
3512 goto out;
3513}
881d966b 3514
4665079c 3515static void __net_exit dev_proc_net_exit(struct net *net)
881d966b
EB
3516{
3517 wext_proc_exit(net);
3518
3519 proc_net_remove(net, "ptype");
3520 proc_net_remove(net, "softnet_stat");
3521 proc_net_remove(net, "dev");
3522}
3523
022cbae6 3524static struct pernet_operations __net_initdata dev_proc_ops = {
881d966b
EB
3525 .init = dev_proc_net_init,
3526 .exit = dev_proc_net_exit,
3527};
3528
3529static int __init dev_proc_init(void)
3530{
3531 return register_pernet_subsys(&dev_proc_ops);
3532}
1da177e4
LT
3533#else
3534#define dev_proc_init() 0
3535#endif /* CONFIG_PROC_FS */
3536
3537
3538/**
3539 * netdev_set_master - set up master/slave pair
3540 * @slave: slave device
3541 * @master: new master device
3542 *
3543 * Changes the master device of the slave. Pass %NULL to break the
3544 * bonding. The caller must hold the RTNL semaphore. On a failure
3545 * a negative errno code is returned. On success the reference counts
3546 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3547 * function returns zero.
3548 */
3549int netdev_set_master(struct net_device *slave, struct net_device *master)
3550{
3551 struct net_device *old = slave->master;
3552
3553 ASSERT_RTNL();
3554
3555 if (master) {
3556 if (old)
3557 return -EBUSY;
3558 dev_hold(master);
3559 }
3560
3561 slave->master = master;
4ec93edb 3562
1da177e4
LT
3563 synchronize_net();
3564
3565 if (old)
3566 dev_put(old);
3567
3568 if (master)
3569 slave->flags |= IFF_SLAVE;
3570 else
3571 slave->flags &= ~IFF_SLAVE;
3572
3573 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3574 return 0;
3575}
d1b19dff 3576EXPORT_SYMBOL(netdev_set_master);
1da177e4 3577
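/*
 * Example (editor's sketch): enslaving a device as the bonding driver
 * does.  The RTNL semaphore must be held across the call; passing
 * NULL as master breaks the bond again.
 */
#if 0
static int foo_enslave(struct net_device *bond_dev,
		       struct net_device *slave_dev)
{
	int err;

	rtnl_lock();
	err = netdev_set_master(slave_dev, bond_dev);
	rtnl_unlock();
	return err;
}
#endif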
b6c40d68
PM
3578static void dev_change_rx_flags(struct net_device *dev, int flags)
3579{
d314774c
SH
3580 const struct net_device_ops *ops = dev->netdev_ops;
3581
3582 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3583 ops->ndo_change_rx_flags(dev, flags);
b6c40d68
PM
3584}
3585
dad9b335 3586static int __dev_set_promiscuity(struct net_device *dev, int inc)
1da177e4
LT
3587{
3588 unsigned short old_flags = dev->flags;
8192b0c4
DH
3589 uid_t uid;
3590 gid_t gid;
1da177e4 3591
24023451
PM
3592 ASSERT_RTNL();
3593
dad9b335
WC
3594 dev->flags |= IFF_PROMISC;
3595 dev->promiscuity += inc;
3596 if (dev->promiscuity == 0) {
3597 /*
3598 * Avoid overflow.
3599 * If inc causes overflow, untouch promisc and return error.
3600 */
3601 if (inc < 0)
3602 dev->flags &= ~IFF_PROMISC;
3603 else {
3604 dev->promiscuity -= inc;
3605 printk(KERN_WARNING "%s: promiscuity touches roof, "
3606 "set promiscuity failed, promiscuity feature "
3607 "of device might be broken.\n", dev->name);
3608 return -EOVERFLOW;
3609 }
3610 }
52609c0b 3611 if (dev->flags != old_flags) {
1da177e4
LT
3612 printk(KERN_INFO "device %s %s promiscuous mode\n",
3613 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4ec93edb 3614 "left");
8192b0c4
DH
3615 if (audit_enabled) {
3616 current_uid_gid(&uid, &gid);
7759db82
KHK
3617 audit_log(current->audit_context, GFP_ATOMIC,
3618 AUDIT_ANOM_PROMISCUOUS,
3619 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3620 dev->name, (dev->flags & IFF_PROMISC),
3621 (old_flags & IFF_PROMISC),
3622 audit_get_loginuid(current),
8192b0c4 3623 uid, gid,
7759db82 3624 audit_get_sessionid(current));
8192b0c4 3625 }
24023451 3626
b6c40d68 3627 dev_change_rx_flags(dev, IFF_PROMISC);
1da177e4 3628 }
dad9b335 3629 return 0;
1da177e4
LT
3630}

/**
 * dev_set_promiscuity - update promiscuity count on a device
 * @dev: device
 * @inc: modifier
 *
 * Add or remove promiscuity from a device. While the count in the device
 * remains above zero the interface remains promiscuous. Once it hits zero
 * the device reverts to normal filtering operation. A negative inc
 * value is used to drop promiscuity on the device.
 * Return 0 if successful or a negative errno code on error.
 */
int dev_set_promiscuity(struct net_device *dev, int inc)
{
	unsigned short old_flags = dev->flags;
	int err;

	err = __dev_set_promiscuity(dev, inc);
	if (err < 0)
		return err;
	if (dev->flags != old_flags)
		dev_set_rx_mode(dev);
	return err;
}
EXPORT_SYMBOL(dev_set_promiscuity);
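
/*
 * Example (editor's illustration, not part of dev.c): a packet-capture
 * style user taking and dropping one promiscuity reference. The function
 * names are hypothetical; only dev_set_promiscuity() is real.
 */
static int example_capture_start(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);	/* count goes 0 -> 1 */
	rtnl_unlock();
	return err;			/* -EOVERFLOW if the counter wrapped */
}

static void example_capture_stop(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);	/* count back to 0, promisc dropped */
	rtnl_unlock();
}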

/**
 * dev_set_allmulti - update allmulti count on a device
 * @dev: device
 * @inc: modifier
 *
 * Add or remove reception of all multicast frames to a device. While the
 * count in the device remains above zero the interface remains listening
 * for all multicast frames. Once it hits zero the device reverts to normal
 * filtering operation. A negative @inc value is used to drop the counter
 * when releasing a resource needing all multicasts.
 * Return 0 if successful or a negative errno code on error.
 */

int dev_set_allmulti(struct net_device *dev, int inc)
{
	unsigned short old_flags = dev->flags;

	ASSERT_RTNL();

	dev->flags |= IFF_ALLMULTI;
	dev->allmulti += inc;
	if (dev->allmulti == 0) {
		/*
		 * Avoid overflow.
		 * If inc causes overflow, leave allmulti unchanged
		 * and return an error.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_ALLMULTI;
		else {
			dev->allmulti -= inc;
			printk(KERN_WARNING "%s: allmulti touches roof, "
				"set allmulti failed, allmulti feature of "
				"device might be broken.\n", dev->name);
			return -EOVERFLOW;
		}
	}
	if (dev->flags ^ old_flags) {
		dev_change_rx_flags(dev, IFF_ALLMULTI);
		dev_set_rx_mode(dev);
	}
	return 0;
}
EXPORT_SYMBOL(dev_set_allmulti);
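
/*
 * Example (editor's illustration, not part of dev.c): a routing daemon
 * hook that needs every multicast frame while active. Hypothetical names;
 * only dev_set_allmulti() is real.
 */
static int example_mroute_attach(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_allmulti(dev, 1);		/* take one allmulti reference */
	rtnl_unlock();
	return err;
}

static void example_mroute_detach(struct net_device *dev)
{
	rtnl_lock();
	dev_set_allmulti(dev, -1);		/* release it again */
	rtnl_unlock();
}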

/*
 * Upload unicast and multicast address lists to device and
 * configure RX filtering. When the device doesn't support unicast
 * filtering it is put in promiscuous mode while unicast addresses
 * are present.
 */
void __dev_set_rx_mode(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	/* dev_open will call this function so the list will stay sane. */
	if (!(dev->flags & IFF_UP))
		return;

	if (!netif_device_present(dev))
		return;

	if (ops->ndo_set_rx_mode)
		ops->ndo_set_rx_mode(dev);
	else {
		/* Unicast address changes may only happen under the rtnl,
		 * therefore calling __dev_set_promiscuity here is safe.
		 */
		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
			__dev_set_promiscuity(dev, 1);
			dev->uc_promisc = 1;
		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
			__dev_set_promiscuity(dev, -1);
			dev->uc_promisc = 0;
		}

		if (ops->ndo_set_multicast_list)
			ops->ndo_set_multicast_list(dev);
	}
}

void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}

/* hw addresses list handling functions */

static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr,
			 int addr_len, unsigned char addr_type)
{
	struct netdev_hw_addr *ha;
	int alloc_size;

	if (addr_len > MAX_ADDR_LEN)
		return -EINVAL;

	list_for_each_entry(ha, &list->list, list) {
		if (!memcmp(ha->addr, addr, addr_len) &&
		    ha->type == addr_type) {
			ha->refcount++;
			return 0;
		}
	}

	alloc_size = sizeof(*ha);
	if (alloc_size < L1_CACHE_BYTES)
		alloc_size = L1_CACHE_BYTES;
	ha = kmalloc(alloc_size, GFP_ATOMIC);
	if (!ha)
		return -ENOMEM;
	memcpy(ha->addr, addr, addr_len);
	ha->type = addr_type;
	ha->refcount = 1;
	ha->synced = false;
	list_add_tail_rcu(&ha->list, &list->list);
	list->count++;
	return 0;
}

static void ha_rcu_free(struct rcu_head *head)
{
	struct netdev_hw_addr *ha;

	ha = container_of(head, struct netdev_hw_addr, rcu_head);
	kfree(ha);
}

static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr,
			 int addr_len, unsigned char addr_type)
{
	struct netdev_hw_addr *ha;

	list_for_each_entry(ha, &list->list, list) {
		if (!memcmp(ha->addr, addr, addr_len) &&
		    (ha->type == addr_type || !addr_type)) {
			if (--ha->refcount)
				return 0;
			list_del_rcu(&ha->list);
			call_rcu(&ha->rcu_head, ha_rcu_free);
			list->count--;
			return 0;
		}
	}
	return -ENOENT;
}

static int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list,
				  struct netdev_hw_addr_list *from_list,
				  int addr_len,
				  unsigned char addr_type)
{
	int err;
	struct netdev_hw_addr *ha, *ha2;
	unsigned char type;

	list_for_each_entry(ha, &from_list->list, list) {
		type = addr_type ? addr_type : ha->type;
		err = __hw_addr_add(to_list, ha->addr, addr_len, type);
		if (err)
			goto unroll;
	}
	return 0;

unroll:
	list_for_each_entry(ha2, &from_list->list, list) {
		if (ha2 == ha)
			break;
		type = addr_type ? addr_type : ha2->type;
		__hw_addr_del(to_list, ha2->addr, addr_len, type);
	}
	return err;
}

static void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list,
				   struct netdev_hw_addr_list *from_list,
				   int addr_len,
				   unsigned char addr_type)
{
	struct netdev_hw_addr *ha;
	unsigned char type;

	list_for_each_entry(ha, &from_list->list, list) {
		type = addr_type ? addr_type : ha->type;
		__hw_addr_del(to_list, ha->addr, addr_len, type);
	}
}

static int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
			  struct netdev_hw_addr_list *from_list,
			  int addr_len)
{
	int err = 0;
	struct netdev_hw_addr *ha, *tmp;

	list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
		if (!ha->synced) {
			err = __hw_addr_add(to_list, ha->addr,
					    addr_len, ha->type);
			if (err)
				break;
			ha->synced = true;
			ha->refcount++;
		} else if (ha->refcount == 1) {
			__hw_addr_del(to_list, ha->addr, addr_len, ha->type);
			__hw_addr_del(from_list, ha->addr, addr_len, ha->type);
		}
	}
	return err;
}

static void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
			     struct netdev_hw_addr_list *from_list,
			     int addr_len)
{
	struct netdev_hw_addr *ha, *tmp;

	list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
		if (ha->synced) {
			__hw_addr_del(to_list, ha->addr,
				      addr_len, ha->type);
			ha->synced = false;
			__hw_addr_del(from_list, ha->addr,
				      addr_len, ha->type);
		}
	}
}

static void __hw_addr_flush(struct netdev_hw_addr_list *list)
{
	struct netdev_hw_addr *ha, *tmp;

	list_for_each_entry_safe(ha, tmp, &list->list, list) {
		list_del_rcu(&ha->list);
		call_rcu(&ha->rcu_head, ha_rcu_free);
	}
	list->count = 0;
}

static void __hw_addr_init(struct netdev_hw_addr_list *list)
{
	INIT_LIST_HEAD(&list->list);
	list->count = 0;
}

/* Device addresses handling functions */

static void dev_addr_flush(struct net_device *dev)
{
	/* rtnl_mutex must be held here */

	__hw_addr_flush(&dev->dev_addrs);
	dev->dev_addr = NULL;
}

static int dev_addr_init(struct net_device *dev)
{
	unsigned char addr[MAX_ADDR_LEN];
	struct netdev_hw_addr *ha;
	int err;

	/* rtnl_mutex must be held here */

	__hw_addr_init(&dev->dev_addrs);
	memset(addr, 0, sizeof(addr));
	err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr),
			    NETDEV_HW_ADDR_T_LAN);
	if (!err) {
		/*
		 * Get the first (previously created) address from the list
		 * and set dev_addr pointer to this location.
		 */
		ha = list_first_entry(&dev->dev_addrs.list,
				      struct netdev_hw_addr, list);
		dev->dev_addr = ha->addr;
	}
	return err;
}

/**
 * dev_addr_add - Add a device address
 * @dev: device
 * @addr: address to add
 * @addr_type: address type
 *
 * Add a device address to the device or increase the reference count if
 * it already exists.
 *
 * The caller must hold the rtnl_mutex.
 */
int dev_addr_add(struct net_device *dev, unsigned char *addr,
		 unsigned char addr_type)
{
	int err;

	ASSERT_RTNL();

	err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
	if (!err)
		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
	return err;
}
EXPORT_SYMBOL(dev_addr_add);

/**
 * dev_addr_del - Release a device address.
 * @dev: device
 * @addr: address to delete
 * @addr_type: address type
 *
 * Release reference to a device address and remove it from the device
 * if the reference count drops to zero.
 *
 * The caller must hold the rtnl_mutex.
 */
int dev_addr_del(struct net_device *dev, unsigned char *addr,
		 unsigned char addr_type)
{
	int err;
	struct netdev_hw_addr *ha;

	ASSERT_RTNL();

	/*
	 * We cannot remove the first address from the list because
	 * dev->dev_addr points to that.
	 */
	ha = list_first_entry(&dev->dev_addrs.list,
			      struct netdev_hw_addr, list);
	if (ha->addr == dev->dev_addr && ha->refcount == 1)
		return -ENOENT;

	err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
			    addr_type);
	if (!err)
		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
	return err;
}
EXPORT_SYMBOL(dev_addr_del);
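
/*
 * Example (editor's illustration, not part of dev.c): adding and then
 * releasing a secondary hardware address under the rtnl lock. The address
 * value is made up and assumes an Ethernet-sized addr_len; only
 * dev_addr_add()/dev_addr_del() are real entry points.
 */
static int example_toggle_secondary_addr(struct net_device *dev)
{
	unsigned char addr[ETH_ALEN] = { 0x02, 0x00, 0x00, 0xaa, 0xbb, 0xcc };
	int err;

	rtnl_lock();
	err = dev_addr_add(dev, addr, NETDEV_HW_ADDR_T_LAN);
	if (!err)
		err = dev_addr_del(dev, addr, NETDEV_HW_ADDR_T_LAN);
	rtnl_unlock();
	return err;
}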

/**
 * dev_addr_add_multiple - Add device addresses from another device
 * @to_dev: device to which addresses will be added
 * @from_dev: device from which addresses will be added
 * @addr_type: address type - 0 means the type will be taken from @from_dev
 *
 * Add device addresses of the one device to another.
 *
 * The caller must hold the rtnl_mutex.
 */
int dev_addr_add_multiple(struct net_device *to_dev,
			  struct net_device *from_dev,
			  unsigned char addr_type)
{
	int err;

	ASSERT_RTNL();

	if (from_dev->addr_len != to_dev->addr_len)
		return -EINVAL;
	err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
				     to_dev->addr_len, addr_type);
	if (!err)
		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
	return err;
}
EXPORT_SYMBOL(dev_addr_add_multiple);

/**
 * dev_addr_del_multiple - Delete device addresses by another device
 * @to_dev: device where the addresses will be deleted
 * @from_dev: device supplying the addresses to be deleted
 * @addr_type: address type - 0 means the type will be taken from @from_dev
 *
 * Deletes the addresses in @to_dev that are listed in @from_dev.
 *
 * The caller must hold the rtnl_mutex.
 */
int dev_addr_del_multiple(struct net_device *to_dev,
			  struct net_device *from_dev,
			  unsigned char addr_type)
{
	ASSERT_RTNL();

	if (from_dev->addr_len != to_dev->addr_len)
		return -EINVAL;
	__hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
			       to_dev->addr_len, addr_type);
	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
	return 0;
}
EXPORT_SYMBOL(dev_addr_del_multiple);

/* multicast addresses handling functions */

int __dev_addr_delete(struct dev_addr_list **list, int *count,
		      void *addr, int alen, int glbl)
{
	struct dev_addr_list *da;

	for (; (da = *list) != NULL; list = &da->next) {
		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
		    alen == da->da_addrlen) {
			if (glbl) {
				int old_glbl = da->da_gusers;
				da->da_gusers = 0;
				if (old_glbl == 0)
					break;
			}
			if (--da->da_users)
				return 0;

			*list = da->next;
			kfree(da);
			(*count)--;
			return 0;
		}
	}
	return -ENOENT;
}

int __dev_addr_add(struct dev_addr_list **list, int *count,
		   void *addr, int alen, int glbl)
{
	struct dev_addr_list *da;

	for (da = *list; da != NULL; da = da->next) {
		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
		    da->da_addrlen == alen) {
			if (glbl) {
				int old_glbl = da->da_gusers;
				da->da_gusers = 1;
				if (old_glbl)
					return 0;
			}
			da->da_users++;
			return 0;
		}
	}

	da = kzalloc(sizeof(*da), GFP_ATOMIC);
	if (da == NULL)
		return -ENOMEM;
	memcpy(da->da_addr, addr, alen);
	da->da_addrlen = alen;
	da->da_users = 1;
	da->da_gusers = glbl ? 1 : 0;
	da->next = *list;
	*list = da;
	(*count)++;
	return 0;
}

/**
 * dev_unicast_delete - Release secondary unicast address.
 * @dev: device
 * @addr: address to delete
 *
 * Release reference to a secondary unicast address and remove it
 * from the device if the reference count drops to zero.
 *
 * The caller must hold the rtnl_mutex.
 */
int dev_unicast_delete(struct net_device *dev, void *addr)
{
	int err;

	ASSERT_RTNL();

	netif_addr_lock_bh(dev);
	err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
			    NETDEV_HW_ADDR_T_UNICAST);
	if (!err)
		__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
	return err;
}
EXPORT_SYMBOL(dev_unicast_delete);

/**
 * dev_unicast_add - add a secondary unicast address
 * @dev: device
 * @addr: address to add
 *
 * Add a secondary unicast address to the device or increase
 * the reference count if it already exists.
 *
 * The caller must hold the rtnl_mutex.
 */
int dev_unicast_add(struct net_device *dev, void *addr)
{
	int err;

	ASSERT_RTNL();

	netif_addr_lock_bh(dev);
	err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
			    NETDEV_HW_ADDR_T_UNICAST);
	if (!err)
		__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
	return err;
}
EXPORT_SYMBOL(dev_unicast_add);
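
/*
 * Example (editor's illustration, not part of dev.c): a driver listening
 * on one extra station address, e.g. for a protocol offload. The function
 * name is hypothetical; the device is put in promiscuous mode by
 * __dev_set_rx_mode() if it cannot filter unicast addresses itself.
 */
static int example_listen_extra_addr(struct net_device *dev,
				     unsigned char *addr)
{
	int err;

	rtnl_lock();
	err = dev_unicast_add(dev, addr);	/* refcounted */
	rtnl_unlock();
	return err;
}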

int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
		    struct dev_addr_list **from, int *from_count)
{
	struct dev_addr_list *da, *next;
	int err = 0;

	da = *from;
	while (da != NULL) {
		next = da->next;
		if (!da->da_synced) {
			err = __dev_addr_add(to, to_count,
					     da->da_addr, da->da_addrlen, 0);
			if (err < 0)
				break;
			da->da_synced = 1;
			da->da_users++;
		} else if (da->da_users == 1) {
			__dev_addr_delete(to, to_count,
					  da->da_addr, da->da_addrlen, 0);
			__dev_addr_delete(from, from_count,
					  da->da_addr, da->da_addrlen, 0);
		}
		da = next;
	}
	return err;
}
EXPORT_SYMBOL_GPL(__dev_addr_sync);

void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
		       struct dev_addr_list **from, int *from_count)
{
	struct dev_addr_list *da, *next;

	da = *from;
	while (da != NULL) {
		next = da->next;
		if (da->da_synced) {
			__dev_addr_delete(to, to_count,
					  da->da_addr, da->da_addrlen, 0);
			da->da_synced = 0;
			__dev_addr_delete(from, from_count,
					  da->da_addr, da->da_addrlen, 0);
		}
		da = next;
	}
}
EXPORT_SYMBOL_GPL(__dev_addr_unsync);

/**
 * dev_unicast_sync - Synchronize device's unicast list to another device
 * @to: destination device
 * @from: source device
 *
 * Add newly added addresses to the destination device and release
 * addresses that have no users left. The source device must be
 * locked by netif_addr_lock_bh.
 *
 * This function is intended to be called from the dev->set_rx_mode
 * function of layered software devices.
 */
int dev_unicast_sync(struct net_device *to, struct net_device *from)
{
	int err = 0;

	if (to->addr_len != from->addr_len)
		return -EINVAL;

	netif_addr_lock_bh(to);
	err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
	if (!err)
		__dev_set_rx_mode(to);
	netif_addr_unlock_bh(to);
	return err;
}
EXPORT_SYMBOL(dev_unicast_sync);

/**
 * dev_unicast_unsync - Remove synchronized addresses from the destination device
 * @to: destination device
 * @from: source device
 *
 * Remove all addresses that were added to the destination device by
 * dev_unicast_sync(). This function is intended to be called from the
 * dev->stop function of layered software devices.
 */
void dev_unicast_unsync(struct net_device *to, struct net_device *from)
{
	if (to->addr_len != from->addr_len)
		return;

	netif_addr_lock_bh(from);
	netif_addr_lock(to);
	__hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
	__dev_set_rx_mode(to);
	netif_addr_unlock(to);
	netif_addr_unlock_bh(from);
}
EXPORT_SYMBOL(dev_unicast_unsync);
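
/*
 * Example (editor's illustration, not part of dev.c): how a layered
 * device such as a VLAN driver would propagate its unicast list to the
 * underlying device. Both function names are hypothetical; the lower
 * device pointer is passed in explicitly for the sake of the sketch.
 */
static void example_vlan_set_rx_mode(struct net_device *vlan,
				     struct net_device *real_dev)
{
	/* runs from vlan's ndo_set_rx_mode, i.e. with vlan's
	 * netif_addr_lock_bh already held, as required above */
	dev_unicast_sync(real_dev, vlan);
}

static void example_vlan_stop(struct net_device *vlan,
			      struct net_device *real_dev)
{
	dev_unicast_unsync(real_dev, vlan);	/* drop every synced address */
}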

static void dev_unicast_flush(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__hw_addr_flush(&dev->uc);
	netif_addr_unlock_bh(dev);
}

static void dev_unicast_init(struct net_device *dev)
{
	__hw_addr_init(&dev->uc);
}


static void __dev_addr_discard(struct dev_addr_list **list)
{
	struct dev_addr_list *tmp;

	while (*list != NULL) {
		tmp = *list;
		*list = tmp->next;
		if (tmp->da_users > tmp->da_gusers)
			printk("__dev_addr_discard: address leakage! "
			       "da_users=%d\n", tmp->da_users);
		kfree(tmp);
	}
}

static void dev_addr_discard(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_addr_discard(&dev->mc_list);
	netdev_mc_count(dev) = 0;

	netif_addr_unlock_bh(dev);
}

/**
 * dev_get_flags - get flags reported to userspace
 * @dev: device
 *
 * Get the combination of flag bits exported through APIs to userspace.
 */
unsigned dev_get_flags(const struct net_device *dev)
{
	unsigned flags;

	flags = (dev->flags & ~(IFF_PROMISC |
				IFF_ALLMULTI |
				IFF_RUNNING |
				IFF_LOWER_UP |
				IFF_DORMANT)) |
		(dev->gflags & (IFF_PROMISC |
				IFF_ALLMULTI));

	if (netif_running(dev)) {
		if (netif_oper_up(dev))
			flags |= IFF_RUNNING;
		if (netif_carrier_ok(dev))
			flags |= IFF_LOWER_UP;
		if (netif_dormant(dev))
			flags |= IFF_DORMANT;
	}

	return flags;
}
EXPORT_SYMBOL(dev_get_flags);

int __dev_change_flags(struct net_device *dev, unsigned int flags)
{
	int old_flags = dev->flags;
	int ret;

	ASSERT_RTNL();

	/*
	 * Set the flags on our device.
	 */

	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
			       IFF_AUTOMEDIA)) |
		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
				    IFF_ALLMULTI));

	/*
	 * Load in the correct multicast list now the flags have changed.
	 */

	if ((old_flags ^ flags) & IFF_MULTICAST)
		dev_change_rx_flags(dev, IFF_MULTICAST);

	dev_set_rx_mode(dev);

	/*
	 * Have we downed the interface? We handle IFF_UP ourselves
	 * according to user attempts to set it, rather than blindly
	 * setting it.
	 */

	ret = 0;
	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different ? */
		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);

		if (!ret)
			dev_set_rx_mode(dev);
	}

	if ((flags ^ dev->gflags) & IFF_PROMISC) {
		int inc = (flags & IFF_PROMISC) ? 1 : -1;

		dev->gflags ^= IFF_PROMISC;
		dev_set_promiscuity(dev, inc);
	}

	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
	   is important. Some (broken) drivers set IFF_PROMISC when
	   IFF_ALLMULTI is requested, without asking us and without
	   reporting it.
	 */
	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;

		dev->gflags ^= IFF_ALLMULTI;
		dev_set_allmulti(dev, inc);
	}

	return ret;
}

void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
{
	unsigned int changes = dev->flags ^ old_flags;

	if (changes & IFF_UP) {
		if (dev->flags & IFF_UP)
			call_netdevice_notifiers(NETDEV_UP, dev);
		else
			call_netdevice_notifiers(NETDEV_DOWN, dev);
	}

	if (dev->flags & IFF_UP &&
	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
		call_netdevice_notifiers(NETDEV_CHANGE, dev);
}

/**
 * dev_change_flags - change device settings
 * @dev: device
 * @flags: device state flags
 *
 * Change settings on device based on state flags. The flags are
 * in the userspace exported format.
 */
int dev_change_flags(struct net_device *dev, unsigned flags)
{
	int ret, changes;
	int old_flags = dev->flags;

	ret = __dev_change_flags(dev, flags);
	if (ret < 0)
		return ret;

	changes = old_flags ^ dev->flags;
	if (changes)
		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);

	__dev_notify_flags(dev, old_flags);
	return ret;
}
EXPORT_SYMBOL(dev_change_flags);
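
/*
 * Example (editor's illustration, not part of dev.c): bringing an
 * interface administratively up from kernel code, the same way the
 * SIOCSIFFLAGS ioctl path does it. The helper name is hypothetical.
 */
static int example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev->flags | IFF_UP);
	rtnl_unlock();
	return err;
}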

/**
 * dev_set_mtu - Change maximum transfer unit
 * @dev: device
 * @new_mtu: new transfer unit
 *
 * Change the maximum transfer size of the network device.
 */
int dev_set_mtu(struct net_device *dev, int new_mtu)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int err;

	if (new_mtu == dev->mtu)
		return 0;

	/* MTU must be positive. */
	if (new_mtu < 0)
		return -EINVAL;

	if (!netif_device_present(dev))
		return -ENODEV;

	err = 0;
	if (ops->ndo_change_mtu)
		err = ops->ndo_change_mtu(dev, new_mtu);
	else
		dev->mtu = new_mtu;

	if (!err && dev->flags & IFF_UP)
		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
	return err;
}
EXPORT_SYMBOL(dev_set_mtu);
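
/*
 * Example (editor's illustration, not part of dev.c): shrinking the MTU
 * of a tunnel-like upper device to leave room for an encapsulation
 * header. The 20-byte overhead is an arbitrary placeholder.
 */
static int example_fit_encap(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, dev->mtu - 20);	/* NETDEV_CHANGEMTU fires if up */
	rtnl_unlock();
	return err;
}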

/**
 * dev_set_mac_address - Change Media Access Control Address
 * @dev: device
 * @sa: new address
 *
 * Change the hardware (MAC) address of the device.
 */
int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int err;

	if (!ops->ndo_set_mac_address)
		return -EOPNOTSUPP;
	if (sa->sa_family != dev->type)
		return -EINVAL;
	if (!netif_device_present(dev))
		return -ENODEV;
	err = ops->ndo_set_mac_address(dev, sa);
	if (!err)
		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
	return err;
}
EXPORT_SYMBOL(dev_set_mac_address);
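
/*
 * Example (editor's illustration, not part of dev.c): setting a new MAC
 * address. Note that sa_family must carry the device type (ARPHRD_ETHER
 * for Ethernet) or the call above fails with -EINVAL. The address value
 * is made up.
 */
static int example_set_mac(struct net_device *dev)
{
	unsigned char mac[ETH_ALEN] = { 0x02, 0x11, 0x22, 0x33, 0x44, 0x55 };
	struct sockaddr sa;
	int err;

	memset(&sa, 0, sizeof(sa));
	sa.sa_family = dev->type;
	memcpy(sa.sa_data, mac, ETH_ALEN);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();
	return err;
}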

/*
 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
 */
static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
{
	int err;
	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);

	if (!dev)
		return -ENODEV;

	switch (cmd) {
	case SIOCGIFFLAGS:	/* Get interface flags */
		ifr->ifr_flags = (short) dev_get_flags(dev);
		return 0;

	case SIOCGIFMETRIC:	/* Get the metric on the interface
				   (currently unused) */
		ifr->ifr_metric = 0;
		return 0;

	case SIOCGIFMTU:	/* Get the MTU of a device */
		ifr->ifr_mtu = dev->mtu;
		return 0;

	case SIOCGIFHWADDR:
		if (!dev->addr_len)
			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
		else
			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
		ifr->ifr_hwaddr.sa_family = dev->type;
		return 0;

	case SIOCGIFSLAVE:
		err = -EINVAL;
		break;

	case SIOCGIFMAP:
		ifr->ifr_map.mem_start = dev->mem_start;
		ifr->ifr_map.mem_end = dev->mem_end;
		ifr->ifr_map.base_addr = dev->base_addr;
		ifr->ifr_map.irq = dev->irq;
		ifr->ifr_map.dma = dev->dma;
		ifr->ifr_map.port = dev->if_port;
		return 0;

	case SIOCGIFINDEX:
		ifr->ifr_ifindex = dev->ifindex;
		return 0;

	case SIOCGIFTXQLEN:
		ifr->ifr_qlen = dev->tx_queue_len;
		return 0;

	default:
		/* dev_ioctl() should ensure this case
		 * is never reached
		 */
		WARN_ON(1);
		err = -EINVAL;
		break;

	}
	return err;
}

/*
 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
 */
static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
{
	int err;
	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
	const struct net_device_ops *ops;

	if (!dev)
		return -ENODEV;

	ops = dev->netdev_ops;

	switch (cmd) {
	case SIOCSIFFLAGS:	/* Set interface flags */
		return dev_change_flags(dev, ifr->ifr_flags);

	case SIOCSIFMETRIC:	/* Set the metric on the interface
				   (currently unused) */
		return -EOPNOTSUPP;

	case SIOCSIFMTU:	/* Set the MTU of a device */
		return dev_set_mtu(dev, ifr->ifr_mtu);

	case SIOCSIFHWADDR:
		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);

	case SIOCSIFHWBROADCAST:
		if (ifr->ifr_hwaddr.sa_family != dev->type)
			return -EINVAL;
		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
		return 0;

	case SIOCSIFMAP:
		if (ops->ndo_set_config) {
			if (!netif_device_present(dev))
				return -ENODEV;
			return ops->ndo_set_config(dev, &ifr->ifr_map);
		}
		return -EOPNOTSUPP;

	case SIOCADDMULTI:
		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
			return -EINVAL;
		if (!netif_device_present(dev))
			return -ENODEV;
		return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
				  dev->addr_len, 1);

	case SIOCDELMULTI:
		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
			return -EINVAL;
		if (!netif_device_present(dev))
			return -ENODEV;
		return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
				     dev->addr_len, 1);

	case SIOCSIFTXQLEN:
		if (ifr->ifr_qlen < 0)
			return -EINVAL;
		dev->tx_queue_len = ifr->ifr_qlen;
		return 0;

	case SIOCSIFNAME:
		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
		return dev_change_name(dev, ifr->ifr_newname);

	/*
	 * Unknown or private ioctl
	 */
	default:
		if ((cmd >= SIOCDEVPRIVATE &&
		     cmd <= SIOCDEVPRIVATE + 15) ||
		    cmd == SIOCBONDENSLAVE ||
		    cmd == SIOCBONDRELEASE ||
		    cmd == SIOCBONDSETHWADDR ||
		    cmd == SIOCBONDSLAVEINFOQUERY ||
		    cmd == SIOCBONDINFOQUERY ||
		    cmd == SIOCBONDCHANGEACTIVE ||
		    cmd == SIOCGMIIPHY ||
		    cmd == SIOCGMIIREG ||
		    cmd == SIOCSMIIREG ||
		    cmd == SIOCBRADDIF ||
		    cmd == SIOCBRDELIF ||
		    cmd == SIOCSHWTSTAMP ||
		    cmd == SIOCWANDEV) {
			err = -EOPNOTSUPP;
			if (ops->ndo_do_ioctl) {
				if (netif_device_present(dev))
					err = ops->ndo_do_ioctl(dev, ifr, cmd);
				else
					err = -ENODEV;
			}
		} else
			err = -EINVAL;

	}
	return err;
}

/*
 * This function handles all "interface"-type I/O control requests. The actual
 * 'doing' part of this is dev_ifsioc above.
 */

/**
 * dev_ioctl - network device ioctl
 * @net: the applicable net namespace
 * @cmd: command to issue
 * @arg: pointer to a struct ifreq in user space
 *
 * Issue ioctl functions to devices. This is normally called by the
 * user space syscall interfaces but can sometimes be useful for
 * other purposes. The return value is the return from the syscall if
 * positive or a negative errno code on error.
 */

int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct ifreq ifr;
	int ret;
	char *colon;

	/* One special case: SIOCGIFCONF takes ifconf argument
	   and requires shared lock, because it sleeps writing
	   to user space.
	 */

	if (cmd == SIOCGIFCONF) {
		rtnl_lock();
		ret = dev_ifconf(net, (char __user *) arg);
		rtnl_unlock();
		return ret;
	}
	if (cmd == SIOCGIFNAME)
		return dev_ifname(net, (struct ifreq __user *)arg);

	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
		return -EFAULT;

	ifr.ifr_name[IFNAMSIZ-1] = 0;

	colon = strchr(ifr.ifr_name, ':');
	if (colon)
		*colon = 0;

	/*
	 * See which interface the caller is talking about.
	 */

	switch (cmd) {
	/*
	 * These ioctl calls:
	 * - can be done by all.
	 * - atomic and do not require locking.
	 * - return a value
	 */
	case SIOCGIFFLAGS:
	case SIOCGIFMETRIC:
	case SIOCGIFMTU:
	case SIOCGIFHWADDR:
	case SIOCGIFSLAVE:
	case SIOCGIFMAP:
	case SIOCGIFINDEX:
	case SIOCGIFTXQLEN:
		dev_load(net, ifr.ifr_name);
		rcu_read_lock();
		ret = dev_ifsioc_locked(net, &ifr, cmd);
		rcu_read_unlock();
		if (!ret) {
			if (colon)
				*colon = ':';
			if (copy_to_user(arg, &ifr,
					 sizeof(struct ifreq)))
				ret = -EFAULT;
		}
		return ret;

	case SIOCETHTOOL:
		dev_load(net, ifr.ifr_name);
		rtnl_lock();
		ret = dev_ethtool(net, &ifr);
		rtnl_unlock();
		if (!ret) {
			if (colon)
				*colon = ':';
			if (copy_to_user(arg, &ifr,
					 sizeof(struct ifreq)))
				ret = -EFAULT;
		}
		return ret;

	/*
	 * These ioctl calls:
	 * - require superuser power.
	 * - require strict serialization.
	 * - return a value
	 */
	case SIOCGMIIPHY:
	case SIOCGMIIREG:
	case SIOCSIFNAME:
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
		dev_load(net, ifr.ifr_name);
		rtnl_lock();
		ret = dev_ifsioc(net, &ifr, cmd);
		rtnl_unlock();
		if (!ret) {
			if (colon)
				*colon = ':';
			if (copy_to_user(arg, &ifr,
					 sizeof(struct ifreq)))
				ret = -EFAULT;
		}
		return ret;

	/*
	 * These ioctl calls:
	 * - require superuser power.
	 * - require strict serialization.
	 * - do not return a value
	 */
	case SIOCSIFFLAGS:
	case SIOCSIFMETRIC:
	case SIOCSIFMTU:
	case SIOCSIFMAP:
	case SIOCSIFHWADDR:
	case SIOCSIFSLAVE:
	case SIOCADDMULTI:
	case SIOCDELMULTI:
	case SIOCSIFHWBROADCAST:
	case SIOCSIFTXQLEN:
	case SIOCSMIIREG:
	case SIOCBONDENSLAVE:
	case SIOCBONDRELEASE:
	case SIOCBONDSETHWADDR:
	case SIOCBONDCHANGEACTIVE:
	case SIOCBRADDIF:
	case SIOCBRDELIF:
	case SIOCSHWTSTAMP:
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
		/* fall through */
	case SIOCBONDSLAVEINFOQUERY:
	case SIOCBONDINFOQUERY:
		dev_load(net, ifr.ifr_name);
		rtnl_lock();
		ret = dev_ifsioc(net, &ifr, cmd);
		rtnl_unlock();
		return ret;

	case SIOCGIFMEM:
		/* Get the per device memory space. We can add this but
		 * currently do not support it */
	case SIOCSIFMEM:
		/* Set the per device memory buffer space.
		 * Not applicable in our case */
	case SIOCSIFLINK:
		return -EINVAL;

	/*
	 * Unknown or private ioctl.
	 */
	default:
		if (cmd == SIOCWANDEV ||
		    (cmd >= SIOCDEVPRIVATE &&
		     cmd <= SIOCDEVPRIVATE + 15)) {
			dev_load(net, ifr.ifr_name);
			rtnl_lock();
			ret = dev_ifsioc(net, &ifr, cmd);
			rtnl_unlock();
			if (!ret && copy_to_user(arg, &ifr,
						 sizeof(struct ifreq)))
				ret = -EFAULT;
			return ret;
		}
		/* Take care of Wireless Extensions */
		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
			return wext_handle_ioctl(net, &ifr, cmd, arg);
		return -EINVAL;
	}
}
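
/*
 * Example (editor's illustration, not part of dev.c): the userspace view
 * of the ioctl path handled above - reading an interface's MTU with
 * SIOCGIFMTU. This is ordinary application C, so it is shown here only
 * as a comment:
 *
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <net/if.h>
 *
 *	int get_mtu(const char *ifname)
 *	{
 *		struct ifreq ifr;
 *		int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *		if (fd < 0)
 *			return -1;
 *		memset(&ifr, 0, sizeof(ifr));
 *		strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
 *		if (ioctl(fd, SIOCGIFMTU, &ifr) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *		close(fd);
 *		return ifr.ifr_mtu;
 *	}
 */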


/**
 * dev_new_index - allocate an ifindex
 * @net: the applicable net namespace
 *
 * Returns a suitable unique value for a new device interface
 * number. The caller must hold the rtnl semaphore or the
 * dev_base_lock to be sure it remains unique.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;
	for (;;) {
		if (++ifindex <= 0)
			ifindex = 1;
		if (!__dev_get_by_index(net, ifindex))
			return ifindex;
	}
}

/* Delayed registration/unregistration */
static LIST_HEAD(net_todo_list);

static void net_set_todo(struct net_device *dev)
{
	list_add_tail(&dev->todo_list, &net_todo_list);
}

static void rollback_registered_many(struct list_head *head)
{
	struct net_device *dev, *tmp;

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
		/* Some devices call without registering
		 * for initialization unwind. Remove those
		 * devices and proceed with the remaining.
		 */
		if (dev->reg_state == NETREG_UNINITIALIZED) {
			pr_debug("unregister_netdevice: device %s/%p never "
				 "was registered\n", dev->name, dev);

			WARN_ON(1);
			list_del(&dev->unreg_list);
			continue;
		}

		BUG_ON(dev->reg_state != NETREG_REGISTERED);

		/* If device is running, close it first. */
		dev_close(dev);

		/* And unlink it from device chain. */
		unlist_netdevice(dev);

		dev->reg_state = NETREG_UNREGISTERING;
	}

	synchronize_net();

	list_for_each_entry(dev, head, unreg_list) {
		/* Shutdown queueing discipline. */
		dev_shutdown(dev);

		/* Notify protocols that we are about to destroy
		 * this device. They should clean up all their state.
		 */
		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

		if (!dev->rtnl_link_ops ||
		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);

		/*
		 * Flush the unicast and multicast chains
		 */
		dev_unicast_flush(dev);
		dev_addr_discard(dev);

		if (dev->netdev_ops->ndo_uninit)
			dev->netdev_ops->ndo_uninit(dev);

		/* Notifier chain MUST detach us from master device. */
		WARN_ON(dev->master);

		/* Remove entries from kobject tree */
		netdev_unregister_kobject(dev);
	}

	/* Process any work delayed until the end of the batch */
	dev = list_first_entry(head, struct net_device, unreg_list);
	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);

	synchronize_net();

	list_for_each_entry(dev, head, unreg_list)
		dev_put(dev);
}

static void rollback_registered(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->unreg_list, &single);
	rollback_registered_many(&single);
}

static void __netdev_init_queue_locks_one(struct net_device *dev,
					  struct netdev_queue *dev_queue,
					  void *_unused)
{
	spin_lock_init(&dev_queue->_xmit_lock);
	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
	dev_queue->xmit_lock_owner = -1;
}

static void netdev_init_queue_locks(struct net_device *dev)
{
	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
}

unsigned long netdev_fix_features(unsigned long features, const char *name)
{
	/* Fix illegal SG+CSUM combinations. */
	if ((features & NETIF_F_SG) &&
	    !(features & NETIF_F_ALL_CSUM)) {
		if (name)
			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
			       "checksum feature.\n", name);
		features &= ~NETIF_F_SG;
	}

	/* TSO requires that SG is present as well. */
	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
		if (name)
			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
			       "SG feature.\n", name);
		features &= ~NETIF_F_TSO;
	}

	if (features & NETIF_F_UFO) {
		if (!(features & NETIF_F_GEN_CSUM)) {
			if (name)
				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
				       "since no NETIF_F_HW_CSUM feature.\n",
				       name);
			features &= ~NETIF_F_UFO;
		}

		if (!(features & NETIF_F_SG)) {
			if (name)
				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
				       "since no NETIF_F_SG feature.\n", name);
			features &= ~NETIF_F_UFO;
		}
	}

	return features;
}
EXPORT_SYMBOL(netdev_fix_features);

/**
 * netif_stacked_transfer_operstate - transfer operstate
 * @rootdev: the root or lower level device to transfer state from
 * @dev: the device to transfer operstate to
 *
 * Transfer operational state from root to device. This is normally
 * called when a stacking relationship exists between the root
 * device and the device (a leaf device).
 */
void netif_stacked_transfer_operstate(const struct net_device *rootdev,
				      struct net_device *dev)
{
	if (rootdev->operstate == IF_OPER_DORMANT)
		netif_dormant_on(dev);
	else
		netif_dormant_off(dev);

	if (netif_carrier_ok(rootdev)) {
		if (!netif_carrier_ok(dev))
			netif_carrier_on(dev);
	} else {
		if (netif_carrier_ok(dev))
			netif_carrier_off(dev);
	}
}
EXPORT_SYMBOL(netif_stacked_transfer_operstate);

/**
 * register_netdevice - register a network device
 * @dev: device to register
 *
 * Take a completed network device structure and add it to the kernel
 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 * chain. 0 is returned on success. A negative errno code is returned
 * on a failure to set up the device, or if the name is a duplicate.
 *
 * Callers must hold the rtnl semaphore. You may want
 * register_netdev() instead of this.
 *
 * BUGS:
 * The locking appears insufficient to guarantee two parallel registers
 * will not get the same name.
 */

int register_netdevice(struct net_device *dev)
{
	int ret;
	struct net *net = dev_net(dev);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	might_sleep();

	/* When net_device's are persistent, this will be fatal. */
	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
	BUG_ON(!net);

	spin_lock_init(&dev->addr_list_lock);
	netdev_set_addr_lockdep_class(dev);
	netdev_init_queue_locks(dev);

	dev->iflink = -1;

	/* Init, if this function is available */
	if (dev->netdev_ops->ndo_init) {
		ret = dev->netdev_ops->ndo_init(dev);
		if (ret) {
			if (ret > 0)
				ret = -EIO;
			goto out;
		}
	}

	ret = dev_get_valid_name(net, dev->name, dev->name, 0);
	if (ret)
		goto err_uninit;

	dev->ifindex = dev_new_index(net);
	if (dev->iflink == -1)
		dev->iflink = dev->ifindex;

	/* Fix illegal checksum combinations */
	if ((dev->features & NETIF_F_HW_CSUM) &&
	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
		       dev->name);
		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
	}

	if ((dev->features & NETIF_F_NO_CSUM) &&
	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
		       dev->name);
		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
	}

	dev->features = netdev_fix_features(dev->features, dev->name);

	/* Enable software GSO if SG is supported. */
	if (dev->features & NETIF_F_SG)
		dev->features |= NETIF_F_GSO;

	netdev_initialize_kobject(dev);

	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		goto err_uninit;

	ret = netdev_register_kobject(dev);
	if (ret)
		goto err_uninit;
	dev->reg_state = NETREG_REGISTERED;

	/*
	 * Default initial state at registry is that the
	 * device is present.
	 */

	set_bit(__LINK_STATE_PRESENT, &dev->state);

	dev_init_scheduler(dev);
	dev_hold(dev);
	list_netdevice(dev);

	/* Notify protocols that a new device appeared. */
	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
	ret = notifier_to_errno(ret);
	if (ret) {
		rollback_registered(dev);
		dev->reg_state = NETREG_UNREGISTERED;
	}
	/*
	 * Prevent userspace races by waiting until the network
	 * device is fully set up before sending notifications.
	 */
	if (!dev->rtnl_link_ops ||
	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);

out:
	return ret;

err_uninit:
	if (dev->netdev_ops->ndo_uninit)
		dev->netdev_ops->ndo_uninit(dev);
	goto out;
}
EXPORT_SYMBOL(register_netdevice);

/**
 * init_dummy_netdev - init a dummy network device for NAPI
 * @dev: device to init
 *
 * This takes a network device structure and initializes the minimum
 * set of fields so it can be used to schedule NAPI polls without
 * registering a full blown interface. This is to be used by drivers
 * that need to tie several hardware interfaces to a single NAPI
 * poll scheduler due to HW limitations.
 */
int init_dummy_netdev(struct net_device *dev)
{
	/* Clear everything. Note we don't initialize spinlocks
	 * as they aren't supposed to be taken by any of the
	 * NAPI code and this dummy netdev is supposed to be
	 * only ever used for NAPI polls
	 */
	memset(dev, 0, sizeof(struct net_device));

	/* make sure we BUG if trying to hit standard
	 * register/unregister code path
	 */
	dev->reg_state = NETREG_DUMMY;

	/* initialize the ref count */
	atomic_set(&dev->refcnt, 1);

	/* NAPI wants this */
	INIT_LIST_HEAD(&dev->napi_list);

	/* a dummy interface is started by default */
	set_bit(__LINK_STATE_PRESENT, &dev->state);
	set_bit(__LINK_STATE_START, &dev->state);

	return 0;
}
EXPORT_SYMBOL_GPL(init_dummy_netdev);
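
/*
 * Example (editor's illustration, not part of dev.c): a driver that must
 * hang a NAPI context off a dummy netdev because its hardware interfaces
 * share one interrupt. The priv layout and names are hypothetical.
 */
struct example_priv {
	struct net_device napi_dev;	/* never registered */
	struct napi_struct napi;
};

static int example_probe_napi(struct example_priv *priv,
			      int (*my_poll)(struct napi_struct *, int))
{
	init_dummy_netdev(&priv->napi_dev);
	netif_napi_add(&priv->napi_dev, &priv->napi, my_poll, 64);
	napi_enable(&priv->napi);
	return 0;
}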


/**
 * register_netdev - register a network device
 * @dev: device to register
 *
 * Take a completed network device structure and add it to the kernel
 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 * chain. 0 is returned on success. A negative errno code is returned
 * on a failure to set up the device, or if the name is a duplicate.
 *
 * This is a wrapper around register_netdevice that takes the rtnl semaphore
 * and expands the device name if you passed a format string to
 * alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int err;

	rtnl_lock();

	/*
	 * If the name is a format string the caller wants us to do a
	 * name allocation.
	 */
	if (strchr(dev->name, '%')) {
		err = dev_alloc_name(dev, dev->name);
		if (err < 0)
			goto out;
	}

	err = register_netdevice(dev);
out:
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_netdev);

/*
 * netdev_wait_allrefs - wait until all references are gone.
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long rebroadcast_time, warning_time;

	linkwatch_forget_dev(dev);

	rebroadcast_time = warning_time = jiffies;
	while (atomic_read(&dev->refcnt) != 0) {
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
			 * should have already handled it the first time */

			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device.
				 */
				linkwatch_run_queue();
			}

			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		msleep(250);

		if (time_after(jiffies, warning_time + 10 * HZ)) {
			printk(KERN_EMERG "unregister_netdevice: "
			       "waiting for %s to become free. Usage "
			       "count = %d\n",
			       dev->name, atomic_read(&dev->refcnt));
			warning_time = jiffies;
		}
	}
}

/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *	...
 *	rtnl_unlock();
 *	free_netdev(y1);
 *	free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 */
void netdev_run_todo(void)
{
	struct list_head list;

	/* Snapshot list, allow later requests */
	list_replace_init(&net_todo_list, &list);

	__rtnl_unlock();

	while (!list_empty(&list)) {
		struct net_device *dev
			= list_first_entry(&list, struct net_device, todo_list);
		list_del(&dev->todo_list);

		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			printk(KERN_ERR "network todo '%s' but state %d\n",
			       dev->name, dev->reg_state);
			dump_stack();
			continue;
		}

		dev->reg_state = NETREG_UNREGISTERED;

		on_each_cpu(flush_backlog, dev, 1);

		netdev_wait_allrefs(dev);

		/* paranoia */
		BUG_ON(atomic_read(&dev->refcnt));
		WARN_ON(dev->ip_ptr);
		WARN_ON(dev->ip6_ptr);
		WARN_ON(dev->dn_ptr);

		if (dev->destructor)
			dev->destructor(dev);

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}
}

/**
 * dev_txq_stats_fold - fold tx_queues stats
 * @dev: device to get statistics from
 * @stats: struct net_device_stats to hold results
 */
void dev_txq_stats_fold(const struct net_device *dev,
			struct net_device_stats *stats)
{
	unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
	unsigned int i;
	struct netdev_queue *txq;

	for (i = 0; i < dev->num_tx_queues; i++) {
		txq = netdev_get_tx_queue(dev, i);
		tx_bytes += txq->tx_bytes;
		tx_packets += txq->tx_packets;
		tx_dropped += txq->tx_dropped;
	}
	if (tx_bytes || tx_packets || tx_dropped) {
		stats->tx_bytes = tx_bytes;
		stats->tx_packets = tx_packets;
		stats->tx_dropped = tx_dropped;
	}
}
EXPORT_SYMBOL(dev_txq_stats_fold);

/**
 * dev_get_stats - get network device statistics
 * @dev: device to get statistics from
 *
 * Get network statistics from device. The device driver may provide
 * its own method by setting dev->netdev_ops->ndo_get_stats; otherwise
 * the internal statistics structure is used.
 */
const struct net_device_stats *dev_get_stats(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_get_stats)
		return ops->ndo_get_stats(dev);

	dev_txq_stats_fold(dev, &dev->stats);
	return &dev->stats;
}
EXPORT_SYMBOL(dev_get_stats);
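
/*
 * Example (editor's illustration, not part of dev.c): a driver overriding
 * the default path above by supplying its own ndo_get_stats, e.g. to
 * merge counters kept in hardware. Names other than the ndo hook are
 * hypothetical.
 */
static struct net_device_stats *example_get_stats(struct net_device *dev)
{
	/* fold the software per-queue tx counters first */
	dev_txq_stats_fold(dev, &dev->stats);
	/* ... device-specific counters would be added to dev->stats here ... */
	return &dev->stats;
}

static const struct net_device_ops example_netdev_ops = {
	.ndo_get_stats = example_get_stats,
};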

static void netdev_init_one_queue(struct net_device *dev,
				  struct netdev_queue *queue,
				  void *_unused)
{
	queue->dev = dev;
}

static void netdev_init_queues(struct net_device *dev)
{
	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
	spin_lock_init(&dev->tx_global_lock);
}

1da177e4 5419/**
f25f4e44 5420 * alloc_netdev_mq - allocate network device
1da177e4
LT
5421 * @sizeof_priv: size of private data to allocate space for
5422 * @name: device name format string
5423 * @setup: callback to initialize device
f25f4e44 5424 * @queue_count: the number of subqueues to allocate
1da177e4
LT
5425 *
5426 * Allocates a struct net_device with private data area for driver use
f25f4e44
PWJ
5427 * and performs basic initialization. Also allocates subquue structs
5428 * for each queue on the device at the end of the netdevice.
1da177e4 5429 */
f25f4e44
PWJ
5430struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5431 void (*setup)(struct net_device *), unsigned int queue_count)
1da177e4 5432{
e8a0464c 5433 struct netdev_queue *tx;
1da177e4 5434 struct net_device *dev;
7943986c 5435 size_t alloc_size;
1ce8e7b5 5436 struct net_device *p;
1da177e4 5437
b6fe17d6
SH
5438 BUG_ON(strlen(name) >= sizeof(dev->name));
5439
fd2ea0a7 5440 alloc_size = sizeof(struct net_device);
d1643d24
AD
5441 if (sizeof_priv) {
5442 /* ensure 32-byte alignment of private area */
1ce8e7b5 5443 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
d1643d24
AD
5444 alloc_size += sizeof_priv;
5445 }
5446 /* ensure 32-byte alignment of whole construct */
1ce8e7b5 5447 alloc_size += NETDEV_ALIGN - 1;
1da177e4 5448
31380de9 5449 p = kzalloc(alloc_size, GFP_KERNEL);
1da177e4 5450 if (!p) {
b6fe17d6 5451 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
1da177e4
LT
5452 return NULL;
5453 }
1da177e4 5454
7943986c 5455 tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
e8a0464c
DM
5456 if (!tx) {
5457 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5458 "tx qdiscs.\n");
ab9c73cc 5459 goto free_p;
e8a0464c
DM
5460 }
5461
1ce8e7b5 5462 dev = PTR_ALIGN(p, NETDEV_ALIGN);
1da177e4 5463 dev->padded = (char *)dev - (char *)p;
ab9c73cc
JP
5464
5465 if (dev_addr_init(dev))
5466 goto free_tx;
5467
ccffad25
JP
5468 dev_unicast_init(dev);
5469
c346dca1 5470 dev_net_set(dev, &init_net);
1da177e4 5471
e8a0464c
DM
5472 dev->_tx = tx;
5473 dev->num_tx_queues = queue_count;
fd2ea0a7 5474 dev->real_num_tx_queues = queue_count;
e8a0464c 5475
82cc1a7a 5476 dev->gso_max_size = GSO_MAX_SIZE;
1da177e4 5477
bb949fbd
DM
5478 netdev_init_queues(dev);
5479
15682bc4
PWJ
5480 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5481 dev->ethtool_ntuple_list.count = 0;
d565b0a1 5482 INIT_LIST_HEAD(&dev->napi_list);
9fdce099 5483 INIT_LIST_HEAD(&dev->unreg_list);
e014debe 5484 INIT_LIST_HEAD(&dev->link_watch_list);
93f154b5 5485 dev->priv_flags = IFF_XMIT_DST_RELEASE;
1da177e4
LT
5486 setup(dev);
5487 strcpy(dev->name, name);
5488 return dev;
ab9c73cc
JP
5489
5490free_tx:
5491 kfree(tx);
5492
5493free_p:
5494 kfree(p);
5495 return NULL;
1da177e4 5496}
f25f4e44 5497EXPORT_SYMBOL(alloc_netdev_mq);
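/*
 * Example (sketch): a hypothetical Ethernet driver "foo" allocating a
 * device with a private area and four transmit subqueues.  ether_setup()
 * is the usual setup callback for Ethernet-class devices; struct
 * foo_priv is illustrative.
 */
#if 0
struct net_device *dev;

dev = alloc_netdev_mq(sizeof(struct foo_priv), "foo%d",
		      ether_setup, 4);
if (!dev)
	return -ENOMEM;
#endif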
1da177e4
LT
5498
5499/**
5500 * free_netdev - free network device
5501 * @dev: device
5502 *
4ec93edb
YH
5503 * This function does the last stage of destroying an allocated device
5504 * interface. The reference to the device object is released.
1da177e4
LT
5505 * If this is the last reference then it will be freed.
5506 */
5507void free_netdev(struct net_device *dev)
5508{
d565b0a1
HX
5509 struct napi_struct *p, *n;
5510
f3005d7f
DL
5511 release_net(dev_net(dev));
5512
e8a0464c
DM
5513 kfree(dev->_tx);
5514
f001fde5
JP
5515 /* Flush device addresses */
5516 dev_addr_flush(dev);
5517
15682bc4
PWJ
5518 /* Clear ethtool n-tuple list */
5519 ethtool_ntuple_flush(dev);
5520
d565b0a1
HX
5521 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5522 netif_napi_del(p);
5523
3041a069 5524 /* Compatibility with error handling in drivers */
1da177e4
LT
5525 if (dev->reg_state == NETREG_UNINITIALIZED) {
5526 kfree((char *)dev - dev->padded);
5527 return;
5528 }
5529
5530 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5531 dev->reg_state = NETREG_RELEASED;
5532
43cb76d9
GKH
5533 /* will free via device release */
5534 put_device(&dev->dev);
1da177e4 5535}
d1b19dff 5536EXPORT_SYMBOL(free_netdev);
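/*
 * Example (sketch): the common driver probe error path.  If
 * register_netdev() fails the device is still NETREG_UNINITIALIZED,
 * so free_netdev() frees the memory immediately instead of deferring
 * to the device release path.
 */
#if 0
err = register_netdev(dev);
if (err) {
	free_netdev(dev);
	return err;
}
#endif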
4ec93edb 5537
f0db275a
SH
5538/**
5539 * synchronize_net - Synchronize with packet receive processing
5540 *
5541 * Wait for packets currently being received to be done.
5542 * Does not block later packets from starting.
5543 */
4ec93edb 5544void synchronize_net(void)
1da177e4
LT
5545{
5546 might_sleep();
fbd568a3 5547 synchronize_rcu();
1da177e4 5548}
d1b19dff 5549EXPORT_SYMBOL(synchronize_net);
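/*
 * Example (sketch): the usual unpublish-then-free pattern.  foo_hook
 * is a hypothetical RCU-protected pointer dereferenced on the receive
 * path; after synchronize_net() no receiver can still see the old
 * value, so its backing state may be freed.
 */
#if 0
rcu_assign_pointer(foo_hook, NULL);
synchronize_net();
kfree(old_hook);
#endif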
1da177e4
LT
5550
5551/**
44a0873d 5552 * unregister_netdevice_queue - remove device from the kernel
1da177e4 5553 * @dev: device
44a0873d 5554 * @head: list
6ebfbc06 5555 *
1da177e4 5556 * This function shuts down a device interface and removes it
d59b54b1 5557 * from the kernel tables.
44a0873d 5558 * If head is not NULL, the device is queued to be unregistered later.
1da177e4
LT
5559 *
5560 * Callers must hold the rtnl semaphore. You may want
5561 * unregister_netdev() instead of this.
5562 */
5563
44a0873d 5564void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
1da177e4 5565{
a6620712
HX
5566 ASSERT_RTNL();
5567
44a0873d 5568 if (head) {
9fdce099 5569 list_move_tail(&dev->unreg_list, head);
44a0873d
ED
5570 } else {
5571 rollback_registered(dev);
5572 /* Finish processing unregister after unlock */
5573 net_set_todo(dev);
5574 }
1da177e4 5575}
44a0873d 5576EXPORT_SYMBOL(unregister_netdevice_queue);
1da177e4 5577
9b5e383c
ED
5578/**
5579 * unregister_netdevice_many - unregister many devices
5580 * @head: list of devices
9b5e383c
ED
5581 */
5582void unregister_netdevice_many(struct list_head *head)
5583{
5584 struct net_device *dev;
5585
5586 if (!list_empty(head)) {
5587 rollback_registered_many(head);
5588 list_for_each_entry(dev, head, unreg_list)
5589 net_set_todo(dev);
5590 }
5591}
63c8099d 5592EXPORT_SYMBOL(unregister_netdevice_many);
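/*
 * Example (sketch): queueing several devices (dev1/dev2 are
 * hypothetical) so the expensive synchronization in the unregister
 * path is paid once for the whole batch rather than per device.
 * The caller must hold the rtnl semaphore.
 */
#if 0
LIST_HEAD(kill_list);

unregister_netdevice_queue(dev1, &kill_list);
unregister_netdevice_queue(dev2, &kill_list);
unregister_netdevice_many(&kill_list);
#endif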
9b5e383c 5593
1da177e4
LT
5594/**
5595 * unregister_netdev - remove device from the kernel
5596 * @dev: device
5597 *
5598 * This function shuts down a device interface and removes it
d59b54b1 5599 * from the kernel tables.
1da177e4
LT
5600 *
5601 * This is just a wrapper for unregister_netdevice that takes
5602 * the rtnl semaphore. In general you want to use this and not
5603 * unregister_netdevice.
5604 */
5605void unregister_netdev(struct net_device *dev)
5606{
5607 rtnl_lock();
5608 unregister_netdevice(dev);
5609 rtnl_unlock();
5610}
1da177e4
LT
5611EXPORT_SYMBOL(unregister_netdev);
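/*
 * Example (sketch): typical module teardown for a driver that
 * registered a single device; foo_dev is a hypothetical global.
 * No RTNL handling is needed in the caller.
 */
#if 0
static void __exit foo_exit(void)
{
	unregister_netdev(foo_dev);
	free_netdev(foo_dev);
}
#endif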
5612
ce286d32
EB
5613/**
5614 * dev_change_net_namespace - move device to different network namespace
5615 * @dev: device
5616 * @net: network namespace
5617 * @pat: If not NULL, name pattern to try if the current device name
5618 * is already taken in the destination network namespace.
5619 *
5620 * This function shuts down a device interface and moves it
5621 * to a new network namespace. On success 0 is returned, on
5622 * a failure a negative errno code is returned.
5623 *
5624 * Callers must hold the rtnl semaphore.
5625 */
5626
5627int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5628{
ce286d32
EB
5629 int err;
5630
5631 ASSERT_RTNL();
5632
5633 /* Don't allow namespace local devices to be moved. */
5634 err = -EINVAL;
5635 if (dev->features & NETIF_F_NETNS_LOCAL)
5636 goto out;
5637
3891845e
EB
5638#ifdef CONFIG_SYSFS
5639 /* Don't allow real devices to be moved when sysfs
5640 * is enabled.
5641 */
5642 err = -EINVAL;
5643 if (dev->dev.parent)
5644 goto out;
5645#endif
5646
ce286d32
EB
5647 /* Ensure the device has been registered */
5648 err = -EINVAL;
5649 if (dev->reg_state != NETREG_REGISTERED)
5650 goto out;
5651
5652 /* Get out if there is nothing to do */
5653 err = 0;
878628fb 5654 if (net_eq(dev_net(dev), net))
ce286d32
EB
5655 goto out;
5656
5657 /* Pick the destination device name, and ensure
5658 * we can use it in the destination network namespace.
5659 */
5660 err = -EEXIST;
d9031024 5661 if (__dev_get_by_name(net, dev->name)) {
ce286d32
EB
5662 /* We get here if we can't use the current device name */
5663 if (!pat)
5664 goto out;
d9031024 5665 if (dev_get_valid_name(net, pat, dev->name, 1))
ce286d32
EB
5666 goto out;
5667 }
5668
5669 /*
5670 * And now do a mini version of register_netdevice() and unregister_netdevice().
5671 */
5672
5673 /* If device is running close it first. */
9b772652 5674 dev_close(dev);
ce286d32
EB
5675
5676 /* And unlink it from device chain */
5677 err = -ENODEV;
5678 unlist_netdevice(dev);
5679
5680 synchronize_net();
5681
5682 /* Shutdown queueing discipline. */
5683 dev_shutdown(dev);
5684
5685 /* Notify protocols that we are about to destroy
5686 this device. They should clean up all their state.
5687 */
5688 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
a5ee1551 5689 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
ce286d32
EB
5690
5691 /*
5692 * Flush the unicast and multicast chains
5693 */
ccffad25 5694 dev_unicast_flush(dev);
ce286d32
EB
5695 dev_addr_discard(dev);
5696
3891845e
EB
5697 netdev_unregister_kobject(dev);
5698
ce286d32 5699 /* Actually switch the network namespace */
c346dca1 5700 dev_net_set(dev, net);
ce286d32 5701
ce286d32
EB
5702 /* If there is an ifindex conflict assign a new one */
5703 if (__dev_get_by_index(net, dev->ifindex)) {
5704 int iflink = (dev->iflink == dev->ifindex);
5705 dev->ifindex = dev_new_index(net);
5706 if (iflink)
5707 dev->iflink = dev->ifindex;
5708 }
5709
8b41d188 5710 /* Fixup kobjects */
aaf8cdc3 5711 err = netdev_register_kobject(dev);
8b41d188 5712 WARN_ON(err);
ce286d32
EB
5713
5714 /* Add the device back in the hashes */
5715 list_netdevice(dev);
5716
5717 /* Notify protocols that a new device appeared. */
5718 call_netdevice_notifiers(NETDEV_REGISTER, dev);
5719
d90a909e
EB
5720 /*
5721 * Prevent userspace races by waiting until the network
5722 * device is fully set up before sending notifications.
5723 */
5724 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5725
ce286d32
EB
5726 synchronize_net();
5727 err = 0;
5728out:
5729 return err;
5730}
463d0183 5731EXPORT_SYMBOL_GPL(dev_change_net_namespace);
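/*
 * Example (sketch): moving a device into another namespace under
 * RTNL, with "eth%d" as the fallback pattern in case dev->name is
 * already taken there; target_net is a hypothetical struct net *.
 */
#if 0
rtnl_lock();
err = dev_change_net_namespace(dev, target_net, "eth%d");
rtnl_unlock();
#endif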
ce286d32 5732
1da177e4
LT
5733static int dev_cpu_callback(struct notifier_block *nfb,
5734 unsigned long action,
5735 void *ocpu)
5736{
5737 struct sk_buff **list_skb;
37437bb2 5738 struct Qdisc **list_net;
1da177e4
LT
5739 struct sk_buff *skb;
5740 unsigned int cpu, oldcpu = (unsigned long)ocpu;
5741 struct softnet_data *sd, *oldsd;
5742
8bb78442 5743 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1da177e4
LT
5744 return NOTIFY_OK;
5745
5746 local_irq_disable();
5747 cpu = smp_processor_id();
5748 sd = &per_cpu(softnet_data, cpu);
5749 oldsd = &per_cpu(softnet_data, oldcpu);
5750
5751 /* Find end of our completion_queue. */
5752 list_skb = &sd->completion_queue;
5753 while (*list_skb)
5754 list_skb = &(*list_skb)->next;
5755 /* Append completion queue from offline CPU. */
5756 *list_skb = oldsd->completion_queue;
5757 oldsd->completion_queue = NULL;
5758
5759 /* Find end of our output_queue. */
5760 list_net = &sd->output_queue;
5761 while (*list_net)
5762 list_net = &(*list_net)->next_sched;
5763 /* Append output queue from offline CPU. */
5764 *list_net = oldsd->output_queue;
5765 oldsd->output_queue = NULL;
5766
5767 raise_softirq_irqoff(NET_TX_SOFTIRQ);
5768 local_irq_enable();
5769
5770 /* Process offline CPU's input_pkt_queue */
5771 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
5772 netif_rx(skb);
5773
5774 return NOTIFY_OK;
5775}
1da177e4
LT
5776
5777
7f353bf2 5778/**
b63365a2
HX
5779 * netdev_increment_features - increment feature set by one
5780 * @all: current feature set
5781 * @one: new feature set
5782 * @mask: mask feature set
7f353bf2
HX
5783 *
5784 * Computes a new feature set after adding a device with feature set
b63365a2
HX
5785 * @one to the master device with current feature set @all. Will not
5786 * enable anything that is off in @mask. Returns the new feature set.
7f353bf2 5787 */
b63365a2
HX
5788unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5789 unsigned long mask)
5790{
5791 /* If device needs checksumming, downgrade to it. */
d1b19dff 5792 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
b63365a2
HX
5793 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5794 else if (mask & NETIF_F_ALL_CSUM) {
5795 /* If one device supports v4/v6 checksumming, set for all. */
5796 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5797 !(all & NETIF_F_GEN_CSUM)) {
5798 all &= ~NETIF_F_ALL_CSUM;
5799 all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5800 }
e2a6b852 5801
b63365a2
HX
5802 /* If one device supports hw checksumming, set for all. */
5803 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5804 all &= ~NETIF_F_ALL_CSUM;
5805 all |= NETIF_F_HW_CSUM;
5806 }
5807 }
7f353bf2 5808
b63365a2 5809 one |= NETIF_F_ALL_CSUM;
7f353bf2 5810
b63365a2 5811 one |= all & NETIF_F_ONE_FOR_ALL;
d9f5950f 5812 all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
b63365a2 5813 all |= one & mask & NETIF_F_ONE_FOR_ALL;
7f353bf2
HX
5814
5815 return all;
5816}
b63365a2 5817EXPORT_SYMBOL(netdev_increment_features);
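/*
 * Example (sketch): how a master device such as bonding could fold
 * each slave's feature set into its own with this helper.  The slave
 * list, names and the starting value are illustrative, not the actual
 * bonding code.
 */
#if 0
unsigned long features = NETIF_F_ONE_FOR_ALL;	/* illustrative start value */
struct foo_slave *slave;

list_for_each_entry(slave, &master->slaves, list)
	features = netdev_increment_features(features,
					     slave->dev->features,
					     NETIF_F_ALL_CSUM);
master_dev->features = features;
#endif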
7f353bf2 5818
30d97d35
PE
5819static struct hlist_head *netdev_create_hash(void)
5820{
5821 int i;
5822 struct hlist_head *hash;
5823
5824 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5825 if (hash != NULL)
5826 for (i = 0; i < NETDEV_HASHENTRIES; i++)
5827 INIT_HLIST_HEAD(&hash[i]);
5828
5829 return hash;
5830}
5831
881d966b 5832/* Initialize per network namespace state */
4665079c 5833static int __net_init netdev_init(struct net *net)
881d966b 5834{
881d966b 5835 INIT_LIST_HEAD(&net->dev_base_head);
881d966b 5836
30d97d35
PE
5837 net->dev_name_head = netdev_create_hash();
5838 if (net->dev_name_head == NULL)
5839 goto err_name;
881d966b 5840
30d97d35
PE
5841 net->dev_index_head = netdev_create_hash();
5842 if (net->dev_index_head == NULL)
5843 goto err_idx;
881d966b
EB
5844
5845 return 0;
30d97d35
PE
5846
5847err_idx:
5848 kfree(net->dev_name_head);
5849err_name:
5850 return -ENOMEM;
881d966b
EB
5851}
5852
f0db275a
SH
5853/**
5854 * netdev_drivername - network driver for the device
5855 * @dev: network device
5856 * @buffer: buffer for resulting name
5857 * @len: size of buffer
5858 *
5859 * Determine network driver for device.
5860 */
cf04a4c7 5861char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6579e57b 5862{
cf04a4c7
SH
5863 const struct device_driver *driver;
5864 const struct device *parent;
6579e57b
AV
5865
5866 if (len <= 0 || !buffer)
5867 return buffer;
5868 buffer[0] = 0;
5869
5870 parent = dev->dev.parent;
5871
5872 if (!parent)
5873 return buffer;
5874
5875 driver = parent->driver;
5876 if (driver && driver->name)
5877 strlcpy(buffer, driver->name, len);
5878 return buffer;
5879}
5880
4665079c 5881static void __net_exit netdev_exit(struct net *net)
881d966b
EB
5882{
5883 kfree(net->dev_name_head);
5884 kfree(net->dev_index_head);
5885}
5886
022cbae6 5887static struct pernet_operations __net_initdata netdev_net_ops = {
881d966b
EB
5888 .init = netdev_init,
5889 .exit = netdev_exit,
5890};
5891
4665079c 5892static void __net_exit default_device_exit(struct net *net)
ce286d32 5893{
e008b5fc 5894 struct net_device *dev, *aux;
ce286d32 5895 /*
e008b5fc 5896 * Push all migratable network devices back to the
ce286d32
EB
5897 * initial network namespace
5898 */
5899 rtnl_lock();
e008b5fc 5900 for_each_netdev_safe(net, dev, aux) {
ce286d32 5901 int err;
aca51397 5902 char fb_name[IFNAMSIZ];
ce286d32
EB
5903
5904 /* Ignore unmovable devices (e.g. loopback) */
5905 if (dev->features & NETIF_F_NETNS_LOCAL)
5906 continue;
5907
e008b5fc
EB
5908 /* Leave virtual devices for the generic cleanup */
5909 if (dev->rtnl_link_ops)
5910 continue;
d0c082ce 5911
ce286d32 5912 /* Push remaining network devices to init_net */
aca51397
PE
5913 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5914 err = dev_change_net_namespace(dev, &init_net, fb_name);
ce286d32 5915 if (err) {
aca51397 5916 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
ce286d32 5917 __func__, dev->name, err);
aca51397 5918 BUG();
ce286d32
EB
5919 }
5920 }
5921 rtnl_unlock();
5922}
5923
04dc7f6b
EB
5924static void __net_exit default_device_exit_batch(struct list_head *net_list)
5925{
5926 /* At exit, all network devices must be removed from a network
5927 * namespace. Do this in the reverse order of registration.
5928 * Do this across as many network namespaces as possible to
5929 * improve batching efficiency.
5930 */
5931 struct net_device *dev;
5932 struct net *net;
5933 LIST_HEAD(dev_kill_list);
5934
5935 rtnl_lock();
5936 list_for_each_entry(net, net_list, exit_list) {
5937 for_each_netdev_reverse(net, dev) {
5938 if (dev->rtnl_link_ops)
5939 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
5940 else
5941 unregister_netdevice_queue(dev, &dev_kill_list);
5942 }
5943 }
5944 unregister_netdevice_many(&dev_kill_list);
5945 rtnl_unlock();
5946}
5947
022cbae6 5948static struct pernet_operations __net_initdata default_device_ops = {
ce286d32 5949 .exit = default_device_exit,
04dc7f6b 5950 .exit_batch = default_device_exit_batch,
ce286d32
EB
5951};
5952
1da177e4
LT
5953/*
5954 * Initialize the DEV module. At boot time this walks the device list and
5955 * unhooks any devices that fail to initialise (normally hardware not
5956 * present) and leaves us with a valid list of present and active devices.
5957 *
5958 */
5959
5960/*
5961 * This is called single threaded during boot, so no need
5962 * to take the rtnl semaphore.
5963 */
5964static int __init net_dev_init(void)
5965{
5966 int i, rc = -ENOMEM;
5967
5968 BUG_ON(!dev_boot_phase);
5969
1da177e4
LT
5970 if (dev_proc_init())
5971 goto out;
5972
8b41d188 5973 if (netdev_kobject_init())
1da177e4
LT
5974 goto out;
5975
5976 INIT_LIST_HEAD(&ptype_all);
82d8a867 5977 for (i = 0; i < PTYPE_HASH_SIZE; i++)
1da177e4
LT
5978 INIT_LIST_HEAD(&ptype_base[i]);
5979
881d966b
EB
5980 if (register_pernet_subsys(&netdev_net_ops))
5981 goto out;
1da177e4
LT
5982
5983 /*
5984 * Initialise the packet receive queues.
5985 */
5986
6f912042 5987 for_each_possible_cpu(i) {
1da177e4
LT
5988 struct softnet_data *queue;
5989
5990 queue = &per_cpu(softnet_data, i);
5991 skb_queue_head_init(&queue->input_pkt_queue);
1da177e4
LT
5992 queue->completion_queue = NULL;
5993 INIT_LIST_HEAD(&queue->poll_list);
bea3348e
SH
5994
5995 queue->backlog.poll = process_backlog;
5996 queue->backlog.weight = weight_p;
d565b0a1 5997 queue->backlog.gro_list = NULL;
4ae5544f 5998 queue->backlog.gro_count = 0;
1da177e4
LT
5999 }
6000
1da177e4
LT
6001 dev_boot_phase = 0;
6002
505d4f73
EB
6003 /* The loopback device is special: if any other network device
6004 * is present in a network namespace, the loopback device must
6005 * be present too. Since we now dynamically allocate and free the
6006 * loopback device, ensure this invariant is maintained by
6007 * keeping the loopback device as the first device on the
6008 * list of network devices, so that the loopback device
6009 * is the first device that appears and the last network device
6010 * that disappears.
6011 */
6012 if (register_pernet_device(&loopback_net_ops))
6013 goto out;
6014
6015 if (register_pernet_device(&default_device_ops))
6016 goto out;
6017
962cf36c
CM
6018 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6019 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
1da177e4
LT
6020
6021 hotcpu_notifier(dev_cpu_callback, 0);
6022 dst_init();
6023 dev_mcast_init();
6024 rc = 0;
6025out:
6026 return rc;
6027}
6028
6029subsys_initcall(net_dev_init);
6030
e88721f8
KK
6031static int __init initialize_hashrnd(void)
6032{
6033 get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
6034 return 0;
6035}
6036
6037late_initcall_sync(initialize_hashrnd);
6038