]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/core/dev.c
aoe: Fix OOPS after SKB queue changes.
[net-next-2.6.git] / net / core / dev.c
CommitLineData
1da177e4
LT
1/*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
02c30a84 10 * Authors: Ross Biro
1da177e4
LT
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75#include <asm/uaccess.h>
76#include <asm/system.h>
77#include <linux/bitops.h>
4fc268d2 78#include <linux/capability.h>
1da177e4
LT
79#include <linux/cpu.h>
80#include <linux/types.h>
81#include <linux/kernel.h>
82#include <linux/sched.h>
4a3e2f71 83#include <linux/mutex.h>
1da177e4
LT
84#include <linux/string.h>
85#include <linux/mm.h>
86#include <linux/socket.h>
87#include <linux/sockios.h>
88#include <linux/errno.h>
89#include <linux/interrupt.h>
90#include <linux/if_ether.h>
91#include <linux/netdevice.h>
92#include <linux/etherdevice.h>
0187bdfb 93#include <linux/ethtool.h>
1da177e4
LT
94#include <linux/notifier.h>
95#include <linux/skbuff.h>
457c4cbc 96#include <net/net_namespace.h>
1da177e4
LT
97#include <net/sock.h>
98#include <linux/rtnetlink.h>
99#include <linux/proc_fs.h>
100#include <linux/seq_file.h>
101#include <linux/stat.h>
102#include <linux/if_bridge.h>
b863ceb7 103#include <linux/if_macvlan.h>
1da177e4
LT
104#include <net/dst.h>
105#include <net/pkt_sched.h>
106#include <net/checksum.h>
107#include <linux/highmem.h>
108#include <linux/init.h>
109#include <linux/kmod.h>
110#include <linux/module.h>
111#include <linux/kallsyms.h>
112#include <linux/netpoll.h>
113#include <linux/rcupdate.h>
114#include <linux/delay.h>
295f4a1f 115#include <net/wext.h>
1da177e4 116#include <net/iw_handler.h>
1da177e4 117#include <asm/current.h>
5bdb9886 118#include <linux/audit.h>
db217334 119#include <linux/dmaengine.h>
f6a78bfc 120#include <linux/err.h>
c7fa9d18 121#include <linux/ctype.h>
723e98b7 122#include <linux/if_arp.h>
6de329e2 123#include <linux/if_vlan.h>
8f0f2223
DM
124#include <linux/ip.h>
125#include <linux/ipv6.h>
126#include <linux/in.h>
b6b2fed1
DM
127#include <linux/jhash.h>
128#include <linux/random.h>
1da177e4 129
342709ef
PE
130#include "net-sysfs.h"
131
1da177e4
LT
132/*
133 * The list of packet types we will receive (as opposed to discard)
134 * and the routines to invoke.
135 *
136 * Why 16. Because with 16 the only overlap we get on a hash of the
137 * low nibble of the protocol value is RARP/SNAP/X.25.
138 *
139 * NOTE: That is no longer true with the addition of VLAN tags. Not
140 * sure which should go first, but I bet it won't make much
141 * difference if we are running VLANs. The good news is that
142 * this protocol won't be in the list unless compiled in, so
3041a069 143 * the average user (w/out VLANs) will not be adversely affected.
1da177e4
LT
144 * --BLG
145 *
146 * 0800 IP
147 * 8100 802.1Q VLAN
148 * 0001 802.3
149 * 0002 AX.25
150 * 0004 802.2
151 * 8035 RARP
152 * 0005 SNAP
153 * 0805 X.25
154 * 0806 ARP
155 * 8137 IPX
156 * 0009 Localtalk
157 * 86DD IPv6
158 */
159
82d8a867
PE
160#define PTYPE_HASH_SIZE (16)
161#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
162
1da177e4 163static DEFINE_SPINLOCK(ptype_lock);
82d8a867 164static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
6b2bedc3 165static struct list_head ptype_all __read_mostly; /* Taps */
1da177e4 166
db217334 167#ifdef CONFIG_NET_DMA
d379b01e
DW
168struct net_dma {
169 struct dma_client client;
170 spinlock_t lock;
171 cpumask_t channel_mask;
0c0b0aca 172 struct dma_chan **channels;
d379b01e
DW
173};
174
175static enum dma_state_client
176netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
177 enum dma_state state);
178
179static struct net_dma net_dma = {
180 .client = {
181 .event_callback = netdev_dma_event,
182 },
183};
db217334
CL
184#endif
185
1da177e4 186/*
7562f876 187 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
1da177e4
LT
188 * semaphore.
189 *
190 * Pure readers hold dev_base_lock for reading.
191 *
192 * Writers must hold the rtnl semaphore while they loop through the
7562f876 193 * dev_base_head list, and hold dev_base_lock for writing when they do the
1da177e4
LT
194 * actual updates. This allows pure readers to access the list even
195 * while a writer is preparing to update it.
196 *
197 * To put it another way, dev_base_lock is held for writing only to
198 * protect against pure readers; the rtnl semaphore provides the
199 * protection against other writers.
200 *
201 * See, for example usages, register_netdevice() and
202 * unregister_netdevice(), which must be called with the rtnl
203 * semaphore held.
204 */
1da177e4
LT
205DEFINE_RWLOCK(dev_base_lock);
206
1da177e4
LT
207EXPORT_SYMBOL(dev_base_lock);
208
209#define NETDEV_HASHBITS 8
881d966b 210#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
1da177e4 211
881d966b 212static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
1da177e4
LT
213{
214 unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
881d966b 215 return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
1da177e4
LT
216}
217
881d966b 218static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
1da177e4 219{
881d966b 220 return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
1da177e4
LT
221}
222
ce286d32
EB
223/* Device list insertion */
224static int list_netdevice(struct net_device *dev)
225{
c346dca1 226 struct net *net = dev_net(dev);
ce286d32
EB
227
228 ASSERT_RTNL();
229
230 write_lock_bh(&dev_base_lock);
231 list_add_tail(&dev->dev_list, &net->dev_base_head);
232 hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
233 hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
234 write_unlock_bh(&dev_base_lock);
235 return 0;
236}
237
238/* Device list removal */
239static void unlist_netdevice(struct net_device *dev)
240{
241 ASSERT_RTNL();
242
243 /* Unlink dev from the device chain */
244 write_lock_bh(&dev_base_lock);
245 list_del(&dev->dev_list);
246 hlist_del(&dev->name_hlist);
247 hlist_del(&dev->index_hlist);
248 write_unlock_bh(&dev_base_lock);
249}
250
1da177e4
LT
251/*
252 * Our notifier list
253 */
254
f07d5b94 255static RAW_NOTIFIER_HEAD(netdev_chain);
1da177e4
LT
256
257/*
258 * Device drivers call our routines to queue packets here. We empty the
259 * queue in the local softnet handler.
260 */
bea3348e
SH
261
262DEFINE_PER_CPU(struct softnet_data, softnet_data);
1da177e4 263
cf508b12 264#ifdef CONFIG_LOCKDEP
723e98b7 265/*
c773e847 266 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
723e98b7
JP
267 * according to dev->type
268 */
269static const unsigned short netdev_lock_type[] =
270 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
271 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
272 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
273 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
274 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
275 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
276 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
277 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
278 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
279 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
280 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
281 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
282 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
283 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_VOID,
284 ARPHRD_NONE};
285
286static const char *netdev_lock_name[] =
287 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
288 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
289 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
290 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
291 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
292 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
293 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
294 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
295 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
296 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
297 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
298 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
299 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
300 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_VOID",
301 "_xmit_NONE"};
302
303static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
cf508b12 304static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
723e98b7
JP
305
306static inline unsigned short netdev_lock_pos(unsigned short dev_type)
307{
308 int i;
309
310 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
311 if (netdev_lock_type[i] == dev_type)
312 return i;
313 /* the last key is used by default */
314 return ARRAY_SIZE(netdev_lock_type) - 1;
315}
316
cf508b12
DM
317static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
318 unsigned short dev_type)
723e98b7
JP
319{
320 int i;
321
322 i = netdev_lock_pos(dev_type);
323 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
324 netdev_lock_name[i]);
325}
cf508b12
DM
326
327static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
328{
329 int i;
330
331 i = netdev_lock_pos(dev->type);
332 lockdep_set_class_and_name(&dev->addr_list_lock,
333 &netdev_addr_lock_key[i],
334 netdev_lock_name[i]);
335}
723e98b7 336#else
cf508b12
DM
337static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
338 unsigned short dev_type)
339{
340}
341static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
723e98b7
JP
342{
343}
344#endif
1da177e4
LT
345
346/*******************************************************************************
347
348 Protocol management and registration routines
349
350*******************************************************************************/
351
1da177e4
LT
352/*
353 * Add a protocol ID to the list. Now that the input handler is
354 * smarter we can dispense with all the messy stuff that used to be
355 * here.
356 *
357 * BEWARE!!! Protocol handlers, mangling input packets,
358 * MUST BE last in hash buckets and checking protocol handlers
359 * MUST start from promiscuous ptype_all chain in net_bh.
360 * It is true now, do not change it.
361 * Explanation follows: if protocol handler, mangling packet, will
362 * be the first on list, it is not able to sense, that packet
363 * is cloned and should be copied-on-write, so that it will
364 * change it and subsequent readers will get broken packet.
365 * --ANK (980803)
366 */
367
368/**
369 * dev_add_pack - add packet handler
370 * @pt: packet type declaration
371 *
372 * Add a protocol handler to the networking stack. The passed &packet_type
373 * is linked into kernel lists and may not be freed until it has been
374 * removed from the kernel lists.
375 *
4ec93edb 376 * This call does not sleep therefore it can not
1da177e4
LT
377 * guarantee all CPU's that are in middle of receiving packets
378 * will see the new packet type (until the next received packet).
379 */
380
381void dev_add_pack(struct packet_type *pt)
382{
383 int hash;
384
385 spin_lock_bh(&ptype_lock);
9be9a6b9 386 if (pt->type == htons(ETH_P_ALL))
1da177e4 387 list_add_rcu(&pt->list, &ptype_all);
9be9a6b9 388 else {
82d8a867 389 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
1da177e4
LT
390 list_add_rcu(&pt->list, &ptype_base[hash]);
391 }
392 spin_unlock_bh(&ptype_lock);
393}
394
1da177e4
LT
395/**
396 * __dev_remove_pack - remove packet handler
397 * @pt: packet type declaration
398 *
399 * Remove a protocol handler that was previously added to the kernel
400 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
401 * from the kernel lists and can be freed or reused once this function
4ec93edb 402 * returns.
1da177e4
LT
403 *
404 * The packet type might still be in use by receivers
405 * and must not be freed until after all the CPU's have gone
406 * through a quiescent state.
407 */
408void __dev_remove_pack(struct packet_type *pt)
409{
410 struct list_head *head;
411 struct packet_type *pt1;
412
413 spin_lock_bh(&ptype_lock);
414
9be9a6b9 415 if (pt->type == htons(ETH_P_ALL))
1da177e4 416 head = &ptype_all;
9be9a6b9 417 else
82d8a867 418 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
1da177e4
LT
419
420 list_for_each_entry(pt1, head, list) {
421 if (pt == pt1) {
422 list_del_rcu(&pt->list);
423 goto out;
424 }
425 }
426
427 printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
428out:
429 spin_unlock_bh(&ptype_lock);
430}
431/**
432 * dev_remove_pack - remove packet handler
433 * @pt: packet type declaration
434 *
435 * Remove a protocol handler that was previously added to the kernel
436 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
437 * from the kernel lists and can be freed or reused once this function
438 * returns.
439 *
440 * This call sleeps to guarantee that no CPU is looking at the packet
441 * type after return.
442 */
443void dev_remove_pack(struct packet_type *pt)
444{
445 __dev_remove_pack(pt);
4ec93edb 446
1da177e4
LT
447 synchronize_net();
448}
449
450/******************************************************************************
451
452 Device Boot-time Settings Routines
453
454*******************************************************************************/
455
456/* Boot time configuration table */
457static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
458
459/**
460 * netdev_boot_setup_add - add new setup entry
461 * @name: name of the device
462 * @map: configured settings for the device
463 *
464 * Adds new setup entry to the dev_boot_setup list. The function
465 * returns 0 on error and 1 on success. This is a generic routine to
466 * all netdevices.
467 */
468static int netdev_boot_setup_add(char *name, struct ifmap *map)
469{
470 struct netdev_boot_setup *s;
471 int i;
472
473 s = dev_boot_setup;
474 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
475 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
476 memset(s[i].name, 0, sizeof(s[i].name));
93b3cff9 477 strlcpy(s[i].name, name, IFNAMSIZ);
1da177e4
LT
478 memcpy(&s[i].map, map, sizeof(s[i].map));
479 break;
480 }
481 }
482
483 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
484}
485
486/**
487 * netdev_boot_setup_check - check boot time settings
488 * @dev: the netdevice
489 *
490 * Check boot time settings for the device.
491 * The found settings are set for the device to be used
492 * later in the device probing.
493 * Returns 0 if no settings found, 1 if they are.
494 */
495int netdev_boot_setup_check(struct net_device *dev)
496{
497 struct netdev_boot_setup *s = dev_boot_setup;
498 int i;
499
500 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
501 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
93b3cff9 502 !strcmp(dev->name, s[i].name)) {
1da177e4
LT
503 dev->irq = s[i].map.irq;
504 dev->base_addr = s[i].map.base_addr;
505 dev->mem_start = s[i].map.mem_start;
506 dev->mem_end = s[i].map.mem_end;
507 return 1;
508 }
509 }
510 return 0;
511}
512
513
514/**
515 * netdev_boot_base - get address from boot time settings
516 * @prefix: prefix for network device
517 * @unit: id for network device
518 *
519 * Check boot time settings for the base address of device.
520 * The found settings are set for the device to be used
521 * later in the device probing.
522 * Returns 0 if no settings found.
523 */
524unsigned long netdev_boot_base(const char *prefix, int unit)
525{
526 const struct netdev_boot_setup *s = dev_boot_setup;
527 char name[IFNAMSIZ];
528 int i;
529
530 sprintf(name, "%s%d", prefix, unit);
531
532 /*
533 * If device already registered then return base of 1
534 * to indicate not to probe for this interface
535 */
881d966b 536 if (__dev_get_by_name(&init_net, name))
1da177e4
LT
537 return 1;
538
539 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
540 if (!strcmp(name, s[i].name))
541 return s[i].map.base_addr;
542 return 0;
543}
544
545/*
546 * Saves at boot time configured settings for any netdevice.
547 */
548int __init netdev_boot_setup(char *str)
549{
550 int ints[5];
551 struct ifmap map;
552
553 str = get_options(str, ARRAY_SIZE(ints), ints);
554 if (!str || !*str)
555 return 0;
556
557 /* Save settings */
558 memset(&map, 0, sizeof(map));
559 if (ints[0] > 0)
560 map.irq = ints[1];
561 if (ints[0] > 1)
562 map.base_addr = ints[2];
563 if (ints[0] > 2)
564 map.mem_start = ints[3];
565 if (ints[0] > 3)
566 map.mem_end = ints[4];
567
568 /* Add new entry to the list */
569 return netdev_boot_setup_add(str, &map);
570}
571
572__setup("netdev=", netdev_boot_setup);
573
574/*******************************************************************************
575
576 Device Interface Subroutines
577
578*******************************************************************************/
579
580/**
581 * __dev_get_by_name - find a device by its name
c4ea43c5 582 * @net: the applicable net namespace
1da177e4
LT
583 * @name: name to find
584 *
585 * Find an interface by name. Must be called under RTNL semaphore
586 * or @dev_base_lock. If the name is found a pointer to the device
587 * is returned. If the name is not found then %NULL is returned. The
588 * reference counters are not incremented so the caller must be
589 * careful with locks.
590 */
591
881d966b 592struct net_device *__dev_get_by_name(struct net *net, const char *name)
1da177e4
LT
593{
594 struct hlist_node *p;
595
881d966b 596 hlist_for_each(p, dev_name_hash(net, name)) {
1da177e4
LT
597 struct net_device *dev
598 = hlist_entry(p, struct net_device, name_hlist);
599 if (!strncmp(dev->name, name, IFNAMSIZ))
600 return dev;
601 }
602 return NULL;
603}
604
605/**
606 * dev_get_by_name - find a device by its name
c4ea43c5 607 * @net: the applicable net namespace
1da177e4
LT
608 * @name: name to find
609 *
610 * Find an interface by name. This can be called from any
611 * context and does its own locking. The returned handle has
612 * the usage count incremented and the caller must use dev_put() to
613 * release it when it is no longer needed. %NULL is returned if no
614 * matching device is found.
615 */
616
881d966b 617struct net_device *dev_get_by_name(struct net *net, const char *name)
1da177e4
LT
618{
619 struct net_device *dev;
620
621 read_lock(&dev_base_lock);
881d966b 622 dev = __dev_get_by_name(net, name);
1da177e4
LT
623 if (dev)
624 dev_hold(dev);
625 read_unlock(&dev_base_lock);
626 return dev;
627}
628
629/**
630 * __dev_get_by_index - find a device by its ifindex
c4ea43c5 631 * @net: the applicable net namespace
1da177e4
LT
632 * @ifindex: index of device
633 *
634 * Search for an interface by index. Returns %NULL if the device
635 * is not found or a pointer to the device. The device has not
636 * had its reference counter increased so the caller must be careful
637 * about locking. The caller must hold either the RTNL semaphore
638 * or @dev_base_lock.
639 */
640
881d966b 641struct net_device *__dev_get_by_index(struct net *net, int ifindex)
1da177e4
LT
642{
643 struct hlist_node *p;
644
881d966b 645 hlist_for_each(p, dev_index_hash(net, ifindex)) {
1da177e4
LT
646 struct net_device *dev
647 = hlist_entry(p, struct net_device, index_hlist);
648 if (dev->ifindex == ifindex)
649 return dev;
650 }
651 return NULL;
652}
653
654
655/**
656 * dev_get_by_index - find a device by its ifindex
c4ea43c5 657 * @net: the applicable net namespace
1da177e4
LT
658 * @ifindex: index of device
659 *
660 * Search for an interface by index. Returns NULL if the device
661 * is not found or a pointer to the device. The device returned has
662 * had a reference added and the pointer is safe until the user calls
663 * dev_put to indicate they have finished with it.
664 */
665
881d966b 666struct net_device *dev_get_by_index(struct net *net, int ifindex)
1da177e4
LT
667{
668 struct net_device *dev;
669
670 read_lock(&dev_base_lock);
881d966b 671 dev = __dev_get_by_index(net, ifindex);
1da177e4
LT
672 if (dev)
673 dev_hold(dev);
674 read_unlock(&dev_base_lock);
675 return dev;
676}
677
678/**
679 * dev_getbyhwaddr - find a device by its hardware address
c4ea43c5 680 * @net: the applicable net namespace
1da177e4
LT
681 * @type: media type of device
682 * @ha: hardware address
683 *
684 * Search for an interface by MAC address. Returns NULL if the device
685 * is not found or a pointer to the device. The caller must hold the
686 * rtnl semaphore. The returned device has not had its ref count increased
687 * and the caller must therefore be careful about locking
688 *
689 * BUGS:
690 * If the API was consistent this would be __dev_get_by_hwaddr
691 */
692
881d966b 693struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
1da177e4
LT
694{
695 struct net_device *dev;
696
697 ASSERT_RTNL();
698
81103a52 699 for_each_netdev(net, dev)
1da177e4
LT
700 if (dev->type == type &&
701 !memcmp(dev->dev_addr, ha, dev->addr_len))
7562f876
PE
702 return dev;
703
704 return NULL;
1da177e4
LT
705}
706
cf309e3f
JF
707EXPORT_SYMBOL(dev_getbyhwaddr);
708
881d966b 709struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
1da177e4
LT
710{
711 struct net_device *dev;
712
4e9cac2b 713 ASSERT_RTNL();
881d966b 714 for_each_netdev(net, dev)
4e9cac2b 715 if (dev->type == type)
7562f876
PE
716 return dev;
717
718 return NULL;
4e9cac2b
PM
719}
720
721EXPORT_SYMBOL(__dev_getfirstbyhwtype);
722
881d966b 723struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
4e9cac2b
PM
724{
725 struct net_device *dev;
726
727 rtnl_lock();
881d966b 728 dev = __dev_getfirstbyhwtype(net, type);
4e9cac2b
PM
729 if (dev)
730 dev_hold(dev);
1da177e4
LT
731 rtnl_unlock();
732 return dev;
733}
734
735EXPORT_SYMBOL(dev_getfirstbyhwtype);
736
737/**
738 * dev_get_by_flags - find any device with given flags
c4ea43c5 739 * @net: the applicable net namespace
1da177e4
LT
740 * @if_flags: IFF_* values
741 * @mask: bitmask of bits in if_flags to check
742 *
743 * Search for any interface with the given flags. Returns NULL if a device
4ec93edb 744 * is not found or a pointer to the device. The device returned has
1da177e4
LT
745 * had a reference added and the pointer is safe until the user calls
746 * dev_put to indicate they have finished with it.
747 */
748
881d966b 749struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
1da177e4 750{
7562f876 751 struct net_device *dev, *ret;
1da177e4 752
7562f876 753 ret = NULL;
1da177e4 754 read_lock(&dev_base_lock);
881d966b 755 for_each_netdev(net, dev) {
1da177e4
LT
756 if (((dev->flags ^ if_flags) & mask) == 0) {
757 dev_hold(dev);
7562f876 758 ret = dev;
1da177e4
LT
759 break;
760 }
761 }
762 read_unlock(&dev_base_lock);
7562f876 763 return ret;
1da177e4
LT
764}
765
766/**
767 * dev_valid_name - check if name is okay for network device
768 * @name: name string
769 *
770 * Network device names need to be valid file names to
c7fa9d18
DM
771 * to allow sysfs to work. We also disallow any kind of
772 * whitespace.
1da177e4 773 */
c2373ee9 774int dev_valid_name(const char *name)
1da177e4 775{
c7fa9d18
DM
776 if (*name == '\0')
777 return 0;
b6fe17d6
SH
778 if (strlen(name) >= IFNAMSIZ)
779 return 0;
c7fa9d18
DM
780 if (!strcmp(name, ".") || !strcmp(name, ".."))
781 return 0;
782
783 while (*name) {
784 if (*name == '/' || isspace(*name))
785 return 0;
786 name++;
787 }
788 return 1;
1da177e4
LT
789}
790
791/**
b267b179
EB
792 * __dev_alloc_name - allocate a name for a device
793 * @net: network namespace to allocate the device name in
1da177e4 794 * @name: name format string
b267b179 795 * @buf: scratch buffer and result name string
1da177e4
LT
796 *
797 * Passed a format string - eg "lt%d" it will try and find a suitable
3041a069
SH
798 * id. It scans list of devices to build up a free map, then chooses
799 * the first empty slot. The caller must hold the dev_base or rtnl lock
800 * while allocating the name and adding the device in order to avoid
801 * duplicates.
802 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
803 * Returns the number of the unit assigned or a negative errno code.
1da177e4
LT
804 */
805
b267b179 806static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1da177e4
LT
807{
808 int i = 0;
1da177e4
LT
809 const char *p;
810 const int max_netdevices = 8*PAGE_SIZE;
cfcabdcc 811 unsigned long *inuse;
1da177e4
LT
812 struct net_device *d;
813
814 p = strnchr(name, IFNAMSIZ-1, '%');
815 if (p) {
816 /*
817 * Verify the string as this thing may have come from
818 * the user. There must be either one "%d" and no other "%"
819 * characters.
820 */
821 if (p[1] != 'd' || strchr(p + 2, '%'))
822 return -EINVAL;
823
824 /* Use one page as a bit array of possible slots */
cfcabdcc 825 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1da177e4
LT
826 if (!inuse)
827 return -ENOMEM;
828
881d966b 829 for_each_netdev(net, d) {
1da177e4
LT
830 if (!sscanf(d->name, name, &i))
831 continue;
832 if (i < 0 || i >= max_netdevices)
833 continue;
834
835 /* avoid cases where sscanf is not exact inverse of printf */
b267b179 836 snprintf(buf, IFNAMSIZ, name, i);
1da177e4
LT
837 if (!strncmp(buf, d->name, IFNAMSIZ))
838 set_bit(i, inuse);
839 }
840
841 i = find_first_zero_bit(inuse, max_netdevices);
842 free_page((unsigned long) inuse);
843 }
844
b267b179
EB
845 snprintf(buf, IFNAMSIZ, name, i);
846 if (!__dev_get_by_name(net, buf))
1da177e4 847 return i;
1da177e4
LT
848
849 /* It is possible to run out of possible slots
850 * when the name is long and there isn't enough space left
851 * for the digits, or if all bits are used.
852 */
853 return -ENFILE;
854}
855
b267b179
EB
856/**
857 * dev_alloc_name - allocate a name for a device
858 * @dev: device
859 * @name: name format string
860 *
861 * Passed a format string - eg "lt%d" it will try and find a suitable
862 * id. It scans list of devices to build up a free map, then chooses
863 * the first empty slot. The caller must hold the dev_base or rtnl lock
864 * while allocating the name and adding the device in order to avoid
865 * duplicates.
866 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
867 * Returns the number of the unit assigned or a negative errno code.
868 */
869
870int dev_alloc_name(struct net_device *dev, const char *name)
871{
872 char buf[IFNAMSIZ];
873 struct net *net;
874 int ret;
875
c346dca1
YH
876 BUG_ON(!dev_net(dev));
877 net = dev_net(dev);
b267b179
EB
878 ret = __dev_alloc_name(net, name, buf);
879 if (ret >= 0)
880 strlcpy(dev->name, buf, IFNAMSIZ);
881 return ret;
882}
883
1da177e4
LT
884
885/**
886 * dev_change_name - change name of a device
887 * @dev: device
888 * @newname: name (or format string) must be at least IFNAMSIZ
889 *
890 * Change name of a device, can pass format strings "eth%d".
891 * for wildcarding.
892 */
893int dev_change_name(struct net_device *dev, char *newname)
894{
fcc5a03a 895 char oldname[IFNAMSIZ];
1da177e4 896 int err = 0;
fcc5a03a 897 int ret;
881d966b 898 struct net *net;
1da177e4
LT
899
900 ASSERT_RTNL();
c346dca1 901 BUG_ON(!dev_net(dev));
1da177e4 902
c346dca1 903 net = dev_net(dev);
1da177e4
LT
904 if (dev->flags & IFF_UP)
905 return -EBUSY;
906
907 if (!dev_valid_name(newname))
908 return -EINVAL;
909
c8d90dca
SH
910 if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
911 return 0;
912
fcc5a03a
HX
913 memcpy(oldname, dev->name, IFNAMSIZ);
914
1da177e4
LT
915 if (strchr(newname, '%')) {
916 err = dev_alloc_name(dev, newname);
917 if (err < 0)
918 return err;
919 strcpy(newname, dev->name);
920 }
881d966b 921 else if (__dev_get_by_name(net, newname))
1da177e4
LT
922 return -EEXIST;
923 else
924 strlcpy(dev->name, newname, IFNAMSIZ);
925
fcc5a03a 926rollback:
dcc99773
SH
927 err = device_rename(&dev->dev, dev->name);
928 if (err) {
929 memcpy(dev->name, oldname, IFNAMSIZ);
930 return err;
931 }
7f988eab
HX
932
933 write_lock_bh(&dev_base_lock);
92749821 934 hlist_del(&dev->name_hlist);
881d966b 935 hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
7f988eab
HX
936 write_unlock_bh(&dev_base_lock);
937
056925ab 938 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
fcc5a03a
HX
939 ret = notifier_to_errno(ret);
940
941 if (ret) {
942 if (err) {
943 printk(KERN_ERR
944 "%s: name change rollback failed: %d.\n",
945 dev->name, ret);
946 } else {
947 err = ret;
948 memcpy(dev->name, oldname, IFNAMSIZ);
949 goto rollback;
950 }
951 }
1da177e4
LT
952
953 return err;
954}
955
0b815a1a
SH
956/**
957 * dev_set_alias - change ifalias of a device
958 * @dev: device
959 * @alias: name up to IFALIASZ
960 *
961 * Set ifalias for a device,
962 */
963int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
964{
965 ASSERT_RTNL();
966
967 if (len >= IFALIASZ)
968 return -EINVAL;
969
970 dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL);
971 if (!dev->ifalias)
972 return -ENOMEM;
973
974 strlcpy(dev->ifalias, alias, len+1);
975 return len;
976}
977
978
d8a33ac4 979/**
3041a069 980 * netdev_features_change - device changes features
d8a33ac4
SH
981 * @dev: device to cause notification
982 *
983 * Called to indicate a device has changed features.
984 */
985void netdev_features_change(struct net_device *dev)
986{
056925ab 987 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
d8a33ac4
SH
988}
989EXPORT_SYMBOL(netdev_features_change);
990
1da177e4
LT
991/**
992 * netdev_state_change - device changes state
993 * @dev: device to cause notification
994 *
995 * Called to indicate a device has changed state. This function calls
996 * the notifier chains for netdev_chain and sends a NEWLINK message
997 * to the routing socket.
998 */
999void netdev_state_change(struct net_device *dev)
1000{
1001 if (dev->flags & IFF_UP) {
056925ab 1002 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1da177e4
LT
1003 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1004 }
1005}
1006
c1da4ac7
OG
1007void netdev_bonding_change(struct net_device *dev)
1008{
1009 call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev);
1010}
1011EXPORT_SYMBOL(netdev_bonding_change);
1012
1da177e4
LT
1013/**
1014 * dev_load - load a network module
c4ea43c5 1015 * @net: the applicable net namespace
1da177e4
LT
1016 * @name: name of interface
1017 *
1018 * If a network interface is not present and the process has suitable
1019 * privileges this function loads the module. If module loading is not
1020 * available in this kernel then it becomes a nop.
1021 */
1022
881d966b 1023void dev_load(struct net *net, const char *name)
1da177e4 1024{
4ec93edb 1025 struct net_device *dev;
1da177e4
LT
1026
1027 read_lock(&dev_base_lock);
881d966b 1028 dev = __dev_get_by_name(net, name);
1da177e4
LT
1029 read_unlock(&dev_base_lock);
1030
1031 if (!dev && capable(CAP_SYS_MODULE))
1032 request_module("%s", name);
1033}
1034
1da177e4
LT
1035/**
1036 * dev_open - prepare an interface for use.
1037 * @dev: device to open
1038 *
1039 * Takes a device from down to up state. The device's private open
1040 * function is invoked and then the multicast lists are loaded. Finally
1041 * the device is moved into the up state and a %NETDEV_UP message is
1042 * sent to the netdev notifier chain.
1043 *
1044 * Calling this function on an active interface is a nop. On a failure
1045 * a negative errno code is returned.
1046 */
1047int dev_open(struct net_device *dev)
1048{
1049 int ret = 0;
1050
e46b66bc
BH
1051 ASSERT_RTNL();
1052
1da177e4
LT
1053 /*
1054 * Is it already up?
1055 */
1056
1057 if (dev->flags & IFF_UP)
1058 return 0;
1059
1060 /*
1061 * Is it even present?
1062 */
1063 if (!netif_device_present(dev))
1064 return -ENODEV;
1065
1066 /*
1067 * Call device private open method
1068 */
1069 set_bit(__LINK_STATE_START, &dev->state);
bada339b
JG
1070
1071 if (dev->validate_addr)
1072 ret = dev->validate_addr(dev);
1073
1074 if (!ret && dev->open)
1da177e4 1075 ret = dev->open(dev);
1da177e4 1076
4ec93edb 1077 /*
1da177e4
LT
1078 * If it went open OK then:
1079 */
1080
bada339b
JG
1081 if (ret)
1082 clear_bit(__LINK_STATE_START, &dev->state);
1083 else {
1da177e4
LT
1084 /*
1085 * Set the flags.
1086 */
1087 dev->flags |= IFF_UP;
1088
1089 /*
1090 * Initialize multicasting status
1091 */
4417da66 1092 dev_set_rx_mode(dev);
1da177e4
LT
1093
1094 /*
1095 * Wakeup transmit queue engine
1096 */
1097 dev_activate(dev);
1098
1099 /*
1100 * ... and announce new interface.
1101 */
056925ab 1102 call_netdevice_notifiers(NETDEV_UP, dev);
1da177e4 1103 }
bada339b 1104
1da177e4
LT
1105 return ret;
1106}
1107
1108/**
1109 * dev_close - shutdown an interface.
1110 * @dev: device to shutdown
1111 *
1112 * This function moves an active device into down state. A
1113 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1114 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1115 * chain.
1116 */
1117int dev_close(struct net_device *dev)
1118{
e46b66bc
BH
1119 ASSERT_RTNL();
1120
9d5010db
DM
1121 might_sleep();
1122
1da177e4
LT
1123 if (!(dev->flags & IFF_UP))
1124 return 0;
1125
1126 /*
1127 * Tell people we are going down, so that they can
1128 * prepare to death, when device is still operating.
1129 */
056925ab 1130 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1da177e4 1131
1da177e4
LT
1132 clear_bit(__LINK_STATE_START, &dev->state);
1133
1134 /* Synchronize to scheduled poll. We cannot touch poll list,
bea3348e
SH
1135 * it can be even on different cpu. So just clear netif_running().
1136 *
1137 * dev->stop() will invoke napi_disable() on all of it's
1138 * napi_struct instances on this device.
1139 */
1da177e4 1140 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1da177e4 1141
d8b2a4d2
ML
1142 dev_deactivate(dev);
1143
1da177e4
LT
1144 /*
1145 * Call the device specific close. This cannot fail.
1146 * Only if device is UP
1147 *
1148 * We allow it to be called even after a DETACH hot-plug
1149 * event.
1150 */
1151 if (dev->stop)
1152 dev->stop(dev);
1153
1154 /*
1155 * Device is now down.
1156 */
1157
1158 dev->flags &= ~IFF_UP;
1159
1160 /*
1161 * Tell people we are down
1162 */
056925ab 1163 call_netdevice_notifiers(NETDEV_DOWN, dev);
1da177e4
LT
1164
1165 return 0;
1166}
1167
1168
0187bdfb
BH
1169/**
1170 * dev_disable_lro - disable Large Receive Offload on a device
1171 * @dev: device
1172 *
1173 * Disable Large Receive Offload (LRO) on a net device. Must be
1174 * called under RTNL. This is needed if received packets may be
1175 * forwarded to another interface.
1176 */
1177void dev_disable_lro(struct net_device *dev)
1178{
1179 if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1180 dev->ethtool_ops->set_flags) {
1181 u32 flags = dev->ethtool_ops->get_flags(dev);
1182 if (flags & ETH_FLAG_LRO) {
1183 flags &= ~ETH_FLAG_LRO;
1184 dev->ethtool_ops->set_flags(dev, flags);
1185 }
1186 }
1187 WARN_ON(dev->features & NETIF_F_LRO);
1188}
1189EXPORT_SYMBOL(dev_disable_lro);
1190
1191
881d966b
EB
1192static int dev_boot_phase = 1;
1193
1da177e4
LT
1194/*
1195 * Device change register/unregister. These are not inline or static
1196 * as we export them to the world.
1197 */
1198
1199/**
1200 * register_netdevice_notifier - register a network notifier block
1201 * @nb: notifier
1202 *
1203 * Register a notifier to be called when network device events occur.
1204 * The notifier passed is linked into the kernel structures and must
1205 * not be reused until it has been unregistered. A negative errno code
1206 * is returned on a failure.
1207 *
1208 * When registered all registration and up events are replayed
4ec93edb 1209 * to the new notifier to allow device to have a race free
1da177e4
LT
1210 * view of the network device list.
1211 */
1212
1213int register_netdevice_notifier(struct notifier_block *nb)
1214{
1215 struct net_device *dev;
fcc5a03a 1216 struct net_device *last;
881d966b 1217 struct net *net;
1da177e4
LT
1218 int err;
1219
1220 rtnl_lock();
f07d5b94 1221 err = raw_notifier_chain_register(&netdev_chain, nb);
fcc5a03a
HX
1222 if (err)
1223 goto unlock;
881d966b
EB
1224 if (dev_boot_phase)
1225 goto unlock;
1226 for_each_net(net) {
1227 for_each_netdev(net, dev) {
1228 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1229 err = notifier_to_errno(err);
1230 if (err)
1231 goto rollback;
1232
1233 if (!(dev->flags & IFF_UP))
1234 continue;
1da177e4 1235
881d966b
EB
1236 nb->notifier_call(nb, NETDEV_UP, dev);
1237 }
1da177e4 1238 }
fcc5a03a
HX
1239
1240unlock:
1da177e4
LT
1241 rtnl_unlock();
1242 return err;
fcc5a03a
HX
1243
1244rollback:
1245 last = dev;
881d966b
EB
1246 for_each_net(net) {
1247 for_each_netdev(net, dev) {
1248 if (dev == last)
1249 break;
fcc5a03a 1250
881d966b
EB
1251 if (dev->flags & IFF_UP) {
1252 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1253 nb->notifier_call(nb, NETDEV_DOWN, dev);
1254 }
1255 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
fcc5a03a 1256 }
fcc5a03a 1257 }
c67625a1
PE
1258
1259 raw_notifier_chain_unregister(&netdev_chain, nb);
fcc5a03a 1260 goto unlock;
1da177e4
LT
1261}
1262
1263/**
1264 * unregister_netdevice_notifier - unregister a network notifier block
1265 * @nb: notifier
1266 *
1267 * Unregister a notifier previously registered by
1268 * register_netdevice_notifier(). The notifier is unlinked into the
1269 * kernel structures and may then be reused. A negative errno code
1270 * is returned on a failure.
1271 */
1272
1273int unregister_netdevice_notifier(struct notifier_block *nb)
1274{
9f514950
HX
1275 int err;
1276
1277 rtnl_lock();
f07d5b94 1278 err = raw_notifier_chain_unregister(&netdev_chain, nb);
9f514950
HX
1279 rtnl_unlock();
1280 return err;
1da177e4
LT
1281}
1282
1283/**
1284 * call_netdevice_notifiers - call all network notifier blocks
1285 * @val: value passed unmodified to notifier function
c4ea43c5 1286 * @dev: net_device pointer passed unmodified to notifier function
1da177e4
LT
1287 *
1288 * Call all network notifier blocks. Parameters and return value
f07d5b94 1289 * are as for raw_notifier_call_chain().
1da177e4
LT
1290 */
1291
ad7379d4 1292int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1da177e4 1293{
ad7379d4 1294 return raw_notifier_call_chain(&netdev_chain, val, dev);
1da177e4
LT
1295}
1296
1297/* When > 0 there are consumers of rx skb time stamps */
1298static atomic_t netstamp_needed = ATOMIC_INIT(0);
1299
1300void net_enable_timestamp(void)
1301{
1302 atomic_inc(&netstamp_needed);
1303}
1304
1305void net_disable_timestamp(void)
1306{
1307 atomic_dec(&netstamp_needed);
1308}
1309
a61bbcf2 1310static inline void net_timestamp(struct sk_buff *skb)
1da177e4
LT
1311{
1312 if (atomic_read(&netstamp_needed))
a61bbcf2 1313 __net_timestamp(skb);
b7aa0bf7
ED
1314 else
1315 skb->tstamp.tv64 = 0;
1da177e4
LT
1316}
1317
1318/*
1319 * Support routine. Sends outgoing frames to any network
1320 * taps currently in use.
1321 */
1322
f6a78bfc 1323static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1da177e4
LT
1324{
1325 struct packet_type *ptype;
a61bbcf2
PM
1326
1327 net_timestamp(skb);
1da177e4
LT
1328
1329 rcu_read_lock();
1330 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1331 /* Never send packets back to the socket
1332 * they originated from - MvS (miquels@drinkel.ow.org)
1333 */
1334 if ((ptype->dev == dev || !ptype->dev) &&
1335 (ptype->af_packet_priv == NULL ||
1336 (struct sock *)ptype->af_packet_priv != skb->sk)) {
1337 struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1338 if (!skb2)
1339 break;
1340
1341 /* skb->nh should be correctly
1342 set by sender, so that the second statement is
1343 just protection against buggy protocols.
1344 */
459a98ed 1345 skb_reset_mac_header(skb2);
1da177e4 1346
d56f90a7 1347 if (skb_network_header(skb2) < skb2->data ||
27a884dc 1348 skb2->network_header > skb2->tail) {
1da177e4
LT
1349 if (net_ratelimit())
1350 printk(KERN_CRIT "protocol %04x is "
1351 "buggy, dev %s\n",
1352 skb2->protocol, dev->name);
c1d2bbe1 1353 skb_reset_network_header(skb2);
1da177e4
LT
1354 }
1355
b0e380b1 1356 skb2->transport_header = skb2->network_header;
1da177e4 1357 skb2->pkt_type = PACKET_OUTGOING;
f2ccd8fa 1358 ptype->func(skb2, skb->dev, ptype, skb->dev);
1da177e4
LT
1359 }
1360 }
1361 rcu_read_unlock();
1362}
1363
56079431 1364
def82a1d 1365static inline void __netif_reschedule(struct Qdisc *q)
56079431 1366{
def82a1d
JP
1367 struct softnet_data *sd;
1368 unsigned long flags;
56079431 1369
def82a1d
JP
1370 local_irq_save(flags);
1371 sd = &__get_cpu_var(softnet_data);
1372 q->next_sched = sd->output_queue;
1373 sd->output_queue = q;
1374 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1375 local_irq_restore(flags);
1376}
1377
1378void __netif_schedule(struct Qdisc *q)
1379{
1380 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1381 __netif_reschedule(q);
56079431
DV
1382}
1383EXPORT_SYMBOL(__netif_schedule);
1384
bea3348e 1385void dev_kfree_skb_irq(struct sk_buff *skb)
56079431 1386{
bea3348e
SH
1387 if (atomic_dec_and_test(&skb->users)) {
1388 struct softnet_data *sd;
1389 unsigned long flags;
56079431 1390
bea3348e
SH
1391 local_irq_save(flags);
1392 sd = &__get_cpu_var(softnet_data);
1393 skb->next = sd->completion_queue;
1394 sd->completion_queue = skb;
1395 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1396 local_irq_restore(flags);
1397 }
56079431 1398}
bea3348e 1399EXPORT_SYMBOL(dev_kfree_skb_irq);
56079431
DV
1400
1401void dev_kfree_skb_any(struct sk_buff *skb)
1402{
1403 if (in_irq() || irqs_disabled())
1404 dev_kfree_skb_irq(skb);
1405 else
1406 dev_kfree_skb(skb);
1407}
1408EXPORT_SYMBOL(dev_kfree_skb_any);
1409
1410
bea3348e
SH
1411/**
1412 * netif_device_detach - mark device as removed
1413 * @dev: network device
1414 *
1415 * Mark device as removed from system and therefore no longer available.
1416 */
56079431
DV
1417void netif_device_detach(struct net_device *dev)
1418{
1419 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1420 netif_running(dev)) {
1421 netif_stop_queue(dev);
1422 }
1423}
1424EXPORT_SYMBOL(netif_device_detach);
1425
bea3348e
SH
1426/**
1427 * netif_device_attach - mark device as attached
1428 * @dev: network device
1429 *
1430 * Mark device as attached from system and restart if needed.
1431 */
56079431
DV
1432void netif_device_attach(struct net_device *dev)
1433{
1434 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1435 netif_running(dev)) {
1436 netif_wake_queue(dev);
4ec93edb 1437 __netdev_watchdog_up(dev);
56079431
DV
1438 }
1439}
1440EXPORT_SYMBOL(netif_device_attach);
1441
6de329e2
BH
1442static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1443{
1444 return ((features & NETIF_F_GEN_CSUM) ||
1445 ((features & NETIF_F_IP_CSUM) &&
1446 protocol == htons(ETH_P_IP)) ||
1447 ((features & NETIF_F_IPV6_CSUM) &&
1448 protocol == htons(ETH_P_IPV6)));
1449}
1450
1451static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1452{
1453 if (can_checksum_protocol(dev->features, skb->protocol))
1454 return true;
1455
1456 if (skb->protocol == htons(ETH_P_8021Q)) {
1457 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1458 if (can_checksum_protocol(dev->features & dev->vlan_features,
1459 veh->h_vlan_encapsulated_proto))
1460 return true;
1461 }
1462
1463 return false;
1464}
56079431 1465
1da177e4
LT
1466/*
1467 * Invalidate hardware checksum when packet is to be mangled, and
1468 * complete checksum manually on outgoing path.
1469 */
84fa7933 1470int skb_checksum_help(struct sk_buff *skb)
1da177e4 1471{
d3bc23e7 1472 __wsum csum;
663ead3b 1473 int ret = 0, offset;
1da177e4 1474
84fa7933 1475 if (skb->ip_summed == CHECKSUM_COMPLETE)
a430a43d
HX
1476 goto out_set_summed;
1477
1478 if (unlikely(skb_shinfo(skb)->gso_size)) {
a430a43d
HX
1479 /* Let GSO fix up the checksum. */
1480 goto out_set_summed;
1da177e4
LT
1481 }
1482
a030847e
HX
1483 offset = skb->csum_start - skb_headroom(skb);
1484 BUG_ON(offset >= skb_headlen(skb));
1485 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1486
1487 offset += skb->csum_offset;
1488 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1489
1490 if (skb_cloned(skb) &&
1491 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1da177e4
LT
1492 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1493 if (ret)
1494 goto out;
1495 }
1496
a030847e 1497 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
a430a43d 1498out_set_summed:
1da177e4 1499 skb->ip_summed = CHECKSUM_NONE;
4ec93edb 1500out:
1da177e4
LT
1501 return ret;
1502}
1503
f6a78bfc
HX
1504/**
1505 * skb_gso_segment - Perform segmentation on skb.
1506 * @skb: buffer to segment
576a30eb 1507 * @features: features for the output path (see dev->features)
f6a78bfc
HX
1508 *
1509 * This function segments the given skb and returns a list of segments.
576a30eb
HX
1510 *
1511 * It may return NULL if the skb requires no segmentation. This is
1512 * only possible when GSO is used for verifying header integrity.
f6a78bfc 1513 */
576a30eb 1514struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
f6a78bfc
HX
1515{
1516 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1517 struct packet_type *ptype;
252e3346 1518 __be16 type = skb->protocol;
a430a43d 1519 int err;
f6a78bfc
HX
1520
1521 BUG_ON(skb_shinfo(skb)->frag_list);
f6a78bfc 1522
459a98ed 1523 skb_reset_mac_header(skb);
b0e380b1 1524 skb->mac_len = skb->network_header - skb->mac_header;
f6a78bfc
HX
1525 __skb_pull(skb, skb->mac_len);
1526
f9d106a6 1527 if (WARN_ON(skb->ip_summed != CHECKSUM_PARTIAL)) {
a430a43d
HX
1528 if (skb_header_cloned(skb) &&
1529 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1530 return ERR_PTR(err);
1531 }
1532
f6a78bfc 1533 rcu_read_lock();
82d8a867
PE
1534 list_for_each_entry_rcu(ptype,
1535 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
f6a78bfc 1536 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
84fa7933 1537 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
a430a43d
HX
1538 err = ptype->gso_send_check(skb);
1539 segs = ERR_PTR(err);
1540 if (err || skb_gso_ok(skb, features))
1541 break;
d56f90a7
ACM
1542 __skb_push(skb, (skb->data -
1543 skb_network_header(skb)));
a430a43d 1544 }
576a30eb 1545 segs = ptype->gso_segment(skb, features);
f6a78bfc
HX
1546 break;
1547 }
1548 }
1549 rcu_read_unlock();
1550
98e399f8 1551 __skb_push(skb, skb->data - skb_mac_header(skb));
576a30eb 1552
f6a78bfc
HX
1553 return segs;
1554}
1555
1556EXPORT_SYMBOL(skb_gso_segment);
1557
fb286bb2
HX
1558/* Take action when hardware reception checksum errors are detected. */
1559#ifdef CONFIG_BUG
1560void netdev_rx_csum_fault(struct net_device *dev)
1561{
1562 if (net_ratelimit()) {
4ec93edb 1563 printk(KERN_ERR "%s: hw csum failure.\n",
246a4212 1564 dev ? dev->name : "<unknown>");
fb286bb2
HX
1565 dump_stack();
1566 }
1567}
1568EXPORT_SYMBOL(netdev_rx_csum_fault);
1569#endif
1570
1da177e4
LT
1571/* Actually, we should eliminate this check as soon as we know, that:
1572 * 1. IOMMU is present and allows to map all the memory.
1573 * 2. No high memory really exists on this machine.
1574 */
1575
1576static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1577{
3d3a8533 1578#ifdef CONFIG_HIGHMEM
1da177e4
LT
1579 int i;
1580
1581 if (dev->features & NETIF_F_HIGHDMA)
1582 return 0;
1583
1584 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1585 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1586 return 1;
1587
3d3a8533 1588#endif
1da177e4
LT
1589 return 0;
1590}
1da177e4 1591
f6a78bfc
HX
1592struct dev_gso_cb {
1593 void (*destructor)(struct sk_buff *skb);
1594};
1595
1596#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1597
1598static void dev_gso_skb_destructor(struct sk_buff *skb)
1599{
1600 struct dev_gso_cb *cb;
1601
1602 do {
1603 struct sk_buff *nskb = skb->next;
1604
1605 skb->next = nskb->next;
1606 nskb->next = NULL;
1607 kfree_skb(nskb);
1608 } while (skb->next);
1609
1610 cb = DEV_GSO_CB(skb);
1611 if (cb->destructor)
1612 cb->destructor(skb);
1613}
1614
1615/**
1616 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1617 * @skb: buffer to segment
1618 *
1619 * This function segments the given skb and stores the list of segments
1620 * in skb->next.
1621 */
1622static int dev_gso_segment(struct sk_buff *skb)
1623{
1624 struct net_device *dev = skb->dev;
1625 struct sk_buff *segs;
576a30eb
HX
1626 int features = dev->features & ~(illegal_highdma(dev, skb) ?
1627 NETIF_F_SG : 0);
1628
1629 segs = skb_gso_segment(skb, features);
1630
1631 /* Verifying header integrity only. */
1632 if (!segs)
1633 return 0;
f6a78bfc 1634
801678c5 1635 if (IS_ERR(segs))
f6a78bfc
HX
1636 return PTR_ERR(segs);
1637
1638 skb->next = segs;
1639 DEV_GSO_CB(skb)->destructor = skb->destructor;
1640 skb->destructor = dev_gso_skb_destructor;
1641
1642 return 0;
1643}
1644
fd2ea0a7
DM
1645int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1646 struct netdev_queue *txq)
f6a78bfc
HX
1647{
1648 if (likely(!skb->next)) {
9be9a6b9 1649 if (!list_empty(&ptype_all))
f6a78bfc
HX
1650 dev_queue_xmit_nit(skb, dev);
1651
576a30eb
HX
1652 if (netif_needs_gso(dev, skb)) {
1653 if (unlikely(dev_gso_segment(skb)))
1654 goto out_kfree_skb;
1655 if (skb->next)
1656 goto gso;
1657 }
f6a78bfc 1658
576a30eb 1659 return dev->hard_start_xmit(skb, dev);
f6a78bfc
HX
1660 }
1661
576a30eb 1662gso:
f6a78bfc
HX
1663 do {
1664 struct sk_buff *nskb = skb->next;
1665 int rc;
1666
1667 skb->next = nskb->next;
1668 nskb->next = NULL;
1669 rc = dev->hard_start_xmit(nskb, dev);
1670 if (unlikely(rc)) {
f54d9e8d 1671 nskb->next = skb->next;
f6a78bfc
HX
1672 skb->next = nskb;
1673 return rc;
1674 }
fd2ea0a7 1675 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
f54d9e8d 1676 return NETDEV_TX_BUSY;
f6a78bfc 1677 } while (skb->next);
4ec93edb 1678
f6a78bfc
HX
1679 skb->destructor = DEV_GSO_CB(skb)->destructor;
1680
1681out_kfree_skb:
1682 kfree_skb(skb);
1683 return 0;
1684}
1685
b6b2fed1
DM
1686static u32 simple_tx_hashrnd;
1687static int simple_tx_hashrnd_initialized = 0;
1688
8f0f2223
DM
1689static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb)
1690{
b6b2fed1
DM
1691 u32 addr1, addr2, ports;
1692 u32 hash, ihl;
8f0f2223 1693 u8 ip_proto;
b6b2fed1
DM
1694
1695 if (unlikely(!simple_tx_hashrnd_initialized)) {
1696 get_random_bytes(&simple_tx_hashrnd, 4);
1697 simple_tx_hashrnd_initialized = 1;
1698 }
8f0f2223
DM
1699
1700 switch (skb->protocol) {
60678040 1701 case htons(ETH_P_IP):
8f0f2223 1702 ip_proto = ip_hdr(skb)->protocol;
b6b2fed1
DM
1703 addr1 = ip_hdr(skb)->saddr;
1704 addr2 = ip_hdr(skb)->daddr;
8f0f2223 1705 ihl = ip_hdr(skb)->ihl;
8f0f2223 1706 break;
60678040 1707 case htons(ETH_P_IPV6):
8f0f2223 1708 ip_proto = ipv6_hdr(skb)->nexthdr;
b6b2fed1
DM
1709 addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
1710 addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
8f0f2223 1711 ihl = (40 >> 2);
8f0f2223
DM
1712 break;
1713 default:
1714 return 0;
1715 }
1716
8f0f2223
DM
1717
1718 switch (ip_proto) {
1719 case IPPROTO_TCP:
1720 case IPPROTO_UDP:
1721 case IPPROTO_DCCP:
1722 case IPPROTO_ESP:
1723 case IPPROTO_AH:
1724 case IPPROTO_SCTP:
1725 case IPPROTO_UDPLITE:
b6b2fed1 1726 ports = *((u32 *) (skb_network_header(skb) + (ihl * 4)));
8f0f2223
DM
1727 break;
1728
1729 default:
b6b2fed1 1730 ports = 0;
8f0f2223
DM
1731 break;
1732 }
1733
b6b2fed1
DM
1734 hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd);
1735
1736 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
8f0f2223
DM
1737}
1738
e8a0464c
DM
1739static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1740 struct sk_buff *skb)
1741{
fd2ea0a7
DM
1742 u16 queue_index = 0;
1743
eae792b7
DM
1744 if (dev->select_queue)
1745 queue_index = dev->select_queue(dev, skb);
8f0f2223
DM
1746 else if (dev->real_num_tx_queues > 1)
1747 queue_index = simple_tx_hash(dev, skb);
eae792b7 1748
fd2ea0a7
DM
1749 skb_set_queue_mapping(skb, queue_index);
1750 return netdev_get_tx_queue(dev, queue_index);
e8a0464c
DM
1751}
1752
d29f749e
DJ
1753/**
1754 * dev_queue_xmit - transmit a buffer
1755 * @skb: buffer to transmit
1756 *
1757 * Queue a buffer for transmission to a network device. The caller must
1758 * have set the device and priority and built the buffer before calling
1759 * this function. The function can be called from an interrupt.
1760 *
1761 * A negative errno code is returned on a failure. A success does not
1762 * guarantee the frame will be transmitted as it may be dropped due
1763 * to congestion or traffic shaping.
1764 *
1765 * -----------------------------------------------------------------------------------
1766 * I notice this method can also return errors from the queue disciplines,
1767 * including NET_XMIT_DROP, which is a positive value. So, errors can also
1768 * be positive.
1769 *
1770 * Regardless of the return value, the skb is consumed, so it is currently
1771 * difficult to retry a send to this method. (You can bump the ref count
1772 * before sending to hold a reference for retry if you are careful.)
1773 *
1774 * When calling this method, interrupts MUST be enabled. This is because
1775 * the BH enable code must have IRQs enabled so that it will not deadlock.
1776 * --BLG
1777 */
1da177e4
LT
1778int dev_queue_xmit(struct sk_buff *skb)
1779{
1780 struct net_device *dev = skb->dev;
dc2b4847 1781 struct netdev_queue *txq;
1da177e4
LT
1782 struct Qdisc *q;
1783 int rc = -ENOMEM;
1784
f6a78bfc
HX
1785 /* GSO will handle the following emulations directly. */
1786 if (netif_needs_gso(dev, skb))
1787 goto gso;
1788
1da177e4
LT
1789 if (skb_shinfo(skb)->frag_list &&
1790 !(dev->features & NETIF_F_FRAGLIST) &&
364c6bad 1791 __skb_linearize(skb))
1da177e4
LT
1792 goto out_kfree_skb;
1793
1794 /* Fragmented skb is linearized if device does not support SG,
1795 * or if at least one of fragments is in highmem and device
1796 * does not support DMA from it.
1797 */
1798 if (skb_shinfo(skb)->nr_frags &&
1799 (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
364c6bad 1800 __skb_linearize(skb))
1da177e4
LT
1801 goto out_kfree_skb;
1802
1803 /* If packet is not checksummed and device does not support
1804 * checksumming for this protocol, complete checksumming here.
1805 */
663ead3b
HX
1806 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1807 skb_set_transport_header(skb, skb->csum_start -
1808 skb_headroom(skb));
6de329e2
BH
1809 if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
1810 goto out_kfree_skb;
663ead3b 1811 }
1da177e4 1812
f6a78bfc 1813gso:
4ec93edb
YH
1814 /* Disable soft irqs for various locks below. Also
1815 * stops preemption for RCU.
1da177e4 1816 */
4ec93edb 1817 rcu_read_lock_bh();
1da177e4 1818
eae792b7 1819 txq = dev_pick_tx(dev, skb);
b0e1e646 1820 q = rcu_dereference(txq->qdisc);
37437bb2 1821
1da177e4
LT
1822#ifdef CONFIG_NET_CLS_ACT
1823 skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1824#endif
1825 if (q->enqueue) {
5fb66229 1826 spinlock_t *root_lock = qdisc_lock(q);
37437bb2
DM
1827
1828 spin_lock(root_lock);
1829
a9312ae8 1830 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
96d20316 1831 kfree_skb(skb);
a9312ae8 1832 rc = NET_XMIT_DROP;
96d20316
DM
1833 } else {
1834 rc = qdisc_enqueue_root(skb, q);
1835 qdisc_run(q);
a9312ae8 1836 }
37437bb2
DM
1837 spin_unlock(root_lock);
1838
37437bb2 1839 goto out;
1da177e4
LT
1840 }
1841
1842 /* The device has no queue. Common case for software devices:
1843 loopback, all the sorts of tunnels...
1844
932ff279
HX
 1845 Really, it is unlikely that netif_tx_lock protection is necessary
 1846 here. (e.g. loopback and IP tunnels are clean, ignoring statistics
1da177e4
LT
1847 counters.)
 1848 However, it is possible that they rely on the protection
 1849 we take here.
1850
 1851 Check this and shoot the lock. It is not prone to deadlocks.
 1852 Either shoot the noqueue qdisc, it is even simpler 8)
1853 */
1854 if (dev->flags & IFF_UP) {
1855 int cpu = smp_processor_id(); /* ok because BHs are off */
1856
c773e847 1857 if (txq->xmit_lock_owner != cpu) {
1da177e4 1858
c773e847 1859 HARD_TX_LOCK(dev, txq, cpu);
1da177e4 1860
fd2ea0a7 1861 if (!netif_tx_queue_stopped(txq)) {
1da177e4 1862 rc = 0;
fd2ea0a7 1863 if (!dev_hard_start_xmit(skb, dev, txq)) {
c773e847 1864 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
1865 goto out;
1866 }
1867 }
c773e847 1868 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
1869 if (net_ratelimit())
1870 printk(KERN_CRIT "Virtual device %s asks to "
1871 "queue packet!\n", dev->name);
1872 } else {
1873 /* Recursion is detected! It is possible,
1874 * unfortunately */
1875 if (net_ratelimit())
1876 printk(KERN_CRIT "Dead loop on virtual device "
1877 "%s, fix it urgently!\n", dev->name);
1878 }
1879 }
1880
1881 rc = -ENETDOWN;
d4828d85 1882 rcu_read_unlock_bh();
1da177e4
LT
1883
1884out_kfree_skb:
1885 kfree_skb(skb);
1886 return rc;
1887out:
d4828d85 1888 rcu_read_unlock_bh();
1da177e4
LT
1889 return rc;
1890}
1891
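/*
 * Illustrative sketch (hypothetical caller, not part of dev.c): as the
 * comment above notes, the caller must set skb->dev and build the frame
 * before calling dev_queue_xmit(), and the skb is consumed regardless of
 * the return value, so it must not be touched afterwards.
 */
static int example_xmit(struct net_device *dev, struct sk_buff *skb)
{
	int rc;

	skb->dev = dev;
	rc = dev_queue_xmit(skb);	/* may return <0 or a NET_XMIT_* code */
	if (rc < 0)
		return rc;		/* skb has already been freed by the stack */
	return 0;
}
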
1892
1893/*=======================================================================
1894 Receiver routines
1895 =======================================================================*/
1896
6b2bedc3
SH
1897int netdev_max_backlog __read_mostly = 1000;
1898int netdev_budget __read_mostly = 300;
1899int weight_p __read_mostly = 64; /* old backlog weight */
1da177e4
LT
1900
1901DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1902
1903
1da177e4
LT
1904/**
1905 * netif_rx - post buffer to the network code
1906 * @skb: buffer to post
1907 *
1908 * This function receives a packet from a device driver and queues it for
1909 * the upper (protocol) levels to process. It always succeeds. The buffer
1910 * may be dropped during processing for congestion control or by the
1911 * protocol layers.
1912 *
1913 * return values:
1914 * NET_RX_SUCCESS (no congestion)
1da177e4
LT
1915 * NET_RX_DROP (packet was dropped)
1916 *
1917 */
1918
1919int netif_rx(struct sk_buff *skb)
1920{
1da177e4
LT
1921 struct softnet_data *queue;
1922 unsigned long flags;
1923
1924 /* if netpoll wants it, pretend we never saw it */
1925 if (netpoll_rx(skb))
1926 return NET_RX_DROP;
1927
b7aa0bf7 1928 if (!skb->tstamp.tv64)
a61bbcf2 1929 net_timestamp(skb);
1da177e4
LT
1930
1931 /*
 1932 * The code is arranged so that the path is shortest when the
 1933 * CPU is congested but still operating.
1934 */
1935 local_irq_save(flags);
1da177e4
LT
1936 queue = &__get_cpu_var(softnet_data);
1937
1938 __get_cpu_var(netdev_rx_stat).total++;
1939 if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1940 if (queue->input_pkt_queue.qlen) {
1da177e4 1941enqueue:
1da177e4 1942 __skb_queue_tail(&queue->input_pkt_queue, skb);
1da177e4 1943 local_irq_restore(flags);
34008d8c 1944 return NET_RX_SUCCESS;
1da177e4
LT
1945 }
1946
bea3348e 1947 napi_schedule(&queue->backlog);
1da177e4
LT
1948 goto enqueue;
1949 }
1950
1da177e4
LT
1951 __get_cpu_var(netdev_rx_stat).dropped++;
1952 local_irq_restore(flags);
1953
1954 kfree_skb(skb);
1955 return NET_RX_DROP;
1956}
1957
1958int netif_rx_ni(struct sk_buff *skb)
1959{
1960 int err;
1961
1962 preempt_disable();
1963 err = netif_rx(skb);
1964 if (local_softirq_pending())
1965 do_softirq();
1966 preempt_enable();
1967
1968 return err;
1969}
1970
1971EXPORT_SYMBOL(netif_rx_ni);
1972
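/*
 * Illustrative sketch (hypothetical driver receive path, not part of
 * dev.c): a non-NAPI driver hands each received frame to netif_rx() from
 * its interrupt handler; code running in process context (a tunnel or a
 * test harness, say) would use netif_rx_ni() instead so that the raised
 * softirq gets a chance to run.
 */
static void example_rx(struct net_device *dev, const void *data,
		       unsigned int len)
{
	struct sk_buff *skb = dev_alloc_skb(len + 2);

	if (!skb)
		return;				/* drop on allocation failure */
	skb_reserve(skb, 2);			/* align the IP header */
	memcpy(skb_put(skb, len), data, len);
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);				/* queue for the backlog NAPI poll */
}
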
1da177e4
LT
1973static void net_tx_action(struct softirq_action *h)
1974{
1975 struct softnet_data *sd = &__get_cpu_var(softnet_data);
1976
1977 if (sd->completion_queue) {
1978 struct sk_buff *clist;
1979
1980 local_irq_disable();
1981 clist = sd->completion_queue;
1982 sd->completion_queue = NULL;
1983 local_irq_enable();
1984
1985 while (clist) {
1986 struct sk_buff *skb = clist;
1987 clist = clist->next;
1988
547b792c 1989 WARN_ON(atomic_read(&skb->users));
1da177e4
LT
1990 __kfree_skb(skb);
1991 }
1992 }
1993
1994 if (sd->output_queue) {
37437bb2 1995 struct Qdisc *head;
1da177e4
LT
1996
1997 local_irq_disable();
1998 head = sd->output_queue;
1999 sd->output_queue = NULL;
2000 local_irq_enable();
2001
2002 while (head) {
37437bb2
DM
2003 struct Qdisc *q = head;
2004 spinlock_t *root_lock;
2005
1da177e4
LT
2006 head = head->next_sched;
2007
5fb66229 2008 root_lock = qdisc_lock(q);
37437bb2 2009 if (spin_trylock(root_lock)) {
def82a1d
JP
2010 smp_mb__before_clear_bit();
2011 clear_bit(__QDISC_STATE_SCHED,
2012 &q->state);
37437bb2
DM
2013 qdisc_run(q);
2014 spin_unlock(root_lock);
1da177e4 2015 } else {
195648bb 2016 if (!test_bit(__QDISC_STATE_DEACTIVATED,
e8a83e10 2017 &q->state)) {
195648bb 2018 __netif_reschedule(q);
e8a83e10
JP
2019 } else {
2020 smp_mb__before_clear_bit();
2021 clear_bit(__QDISC_STATE_SCHED,
2022 &q->state);
2023 }
1da177e4
LT
2024 }
2025 }
2026 }
2027}
2028
6f05f629
SH
2029static inline int deliver_skb(struct sk_buff *skb,
2030 struct packet_type *pt_prev,
2031 struct net_device *orig_dev)
1da177e4
LT
2032{
2033 atomic_inc(&skb->users);
f2ccd8fa 2034 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1da177e4
LT
2035}
2036
2037#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
6229e362 2038 /* These hooks are defined here for ATM. */
1da177e4
LT
2039struct net_bridge;
2040struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
2041 unsigned char *addr);
6229e362 2042void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;
1da177e4 2043
6229e362
SH
2044/*
 2045 * If the bridge module is loaded, call the bridging hook.
 2046 * Returns NULL if the packet was consumed.
2047 */
2048struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2049 struct sk_buff *skb) __read_mostly;
2050static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2051 struct packet_type **pt_prev, int *ret,
2052 struct net_device *orig_dev)
1da177e4
LT
2053{
2054 struct net_bridge_port *port;
2055
6229e362
SH
2056 if (skb->pkt_type == PACKET_LOOPBACK ||
2057 (port = rcu_dereference(skb->dev->br_port)) == NULL)
2058 return skb;
1da177e4
LT
2059
2060 if (*pt_prev) {
6229e362 2061 *ret = deliver_skb(skb, *pt_prev, orig_dev);
1da177e4 2062 *pt_prev = NULL;
4ec93edb
YH
2063 }
2064
6229e362 2065 return br_handle_frame_hook(port, skb);
1da177e4
LT
2066}
2067#else
6229e362 2068#define handle_bridge(skb, pt_prev, ret, orig_dev) (skb)
1da177e4
LT
2069#endif
2070
b863ceb7
PM
2071#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2072struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
2073EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2074
2075static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2076 struct packet_type **pt_prev,
2077 int *ret,
2078 struct net_device *orig_dev)
2079{
2080 if (skb->dev->macvlan_port == NULL)
2081 return skb;
2082
2083 if (*pt_prev) {
2084 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2085 *pt_prev = NULL;
2086 }
2087 return macvlan_handle_frame_hook(skb);
2088}
2089#else
2090#define handle_macvlan(skb, pt_prev, ret, orig_dev) (skb)
2091#endif
2092
1da177e4
LT
2093#ifdef CONFIG_NET_CLS_ACT
 2094 /* TODO: Maybe we should just force sch_ingress to be compiled in
 2095 * whenever CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
 2096 * instructions (a compare and two extra stores) when we don't have it
 2097 * on but do have CONFIG_NET_CLS_ACT.
4ec93edb 2098 * NOTE: This doesn't stop any functionality; if you don't have
1da177e4
LT
 2099 * the ingress scheduler, you just can't add policies on ingress.
2100 *
2101 */
4ec93edb 2102static int ing_filter(struct sk_buff *skb)
1da177e4 2103{
1da177e4 2104 struct net_device *dev = skb->dev;
f697c3e8 2105 u32 ttl = G_TC_RTTL(skb->tc_verd);
555353cf
DM
2106 struct netdev_queue *rxq;
2107 int result = TC_ACT_OK;
2108 struct Qdisc *q;
4ec93edb 2109
f697c3e8
HX
2110 if (MAX_RED_LOOP < ttl++) {
2111 printk(KERN_WARNING
2112 "Redir loop detected Dropping packet (%d->%d)\n",
2113 skb->iif, dev->ifindex);
2114 return TC_ACT_SHOT;
2115 }
1da177e4 2116
f697c3e8
HX
2117 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2118 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
1da177e4 2119
555353cf
DM
2120 rxq = &dev->rx_queue;
2121
83874000 2122 q = rxq->qdisc;
8d50b53d 2123 if (q != &noop_qdisc) {
83874000 2124 spin_lock(qdisc_lock(q));
a9312ae8
DM
2125 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2126 result = qdisc_enqueue_root(skb, q);
83874000
DM
2127 spin_unlock(qdisc_lock(q));
2128 }
f697c3e8
HX
2129
2130 return result;
2131}
86e65da9 2132
f697c3e8
HX
2133static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2134 struct packet_type **pt_prev,
2135 int *ret, struct net_device *orig_dev)
2136{
8d50b53d 2137 if (skb->dev->rx_queue.qdisc == &noop_qdisc)
f697c3e8 2138 goto out;
1da177e4 2139
f697c3e8
HX
2140 if (*pt_prev) {
2141 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2142 *pt_prev = NULL;
2143 } else {
2144 /* Huh? Why does turning on AF_PACKET affect this? */
2145 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
1da177e4
LT
2146 }
2147
f697c3e8
HX
2148 switch (ing_filter(skb)) {
2149 case TC_ACT_SHOT:
2150 case TC_ACT_STOLEN:
2151 kfree_skb(skb);
2152 return NULL;
2153 }
2154
2155out:
2156 skb->tc_verd = 0;
2157 return skb;
1da177e4
LT
2158}
2159#endif
2160
bc1d0411
PM
2161/*
2162 * netif_nit_deliver - deliver received packets to network taps
2163 * @skb: buffer
2164 *
2165 * This function is used to deliver incoming packets to network
2166 * taps. It should be used when the normal netif_receive_skb path
2167 * is bypassed, for example because of VLAN acceleration.
2168 */
2169void netif_nit_deliver(struct sk_buff *skb)
2170{
2171 struct packet_type *ptype;
2172
2173 if (list_empty(&ptype_all))
2174 return;
2175
2176 skb_reset_network_header(skb);
2177 skb_reset_transport_header(skb);
2178 skb->mac_len = skb->network_header - skb->mac_header;
2179
2180 rcu_read_lock();
2181 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2182 if (!ptype->dev || ptype->dev == skb->dev)
2183 deliver_skb(skb, ptype, skb->dev);
2184 }
2185 rcu_read_unlock();
2186}
2187
3b582cc1
SH
2188/**
2189 * netif_receive_skb - process receive buffer from network
2190 * @skb: buffer to process
2191 *
2192 * netif_receive_skb() is the main receive data processing function.
2193 * It always succeeds. The buffer may be dropped during processing
2194 * for congestion control or by the protocol layers.
2195 *
2196 * This function may only be called from softirq context and interrupts
2197 * should be enabled.
2198 *
2199 * Return values (usually ignored):
2200 * NET_RX_SUCCESS: no congestion
2201 * NET_RX_DROP: packet was dropped
2202 */
1da177e4
LT
2203int netif_receive_skb(struct sk_buff *skb)
2204{
2205 struct packet_type *ptype, *pt_prev;
f2ccd8fa 2206 struct net_device *orig_dev;
0d7a3681 2207 struct net_device *null_or_orig;
1da177e4 2208 int ret = NET_RX_DROP;
252e3346 2209 __be16 type;
1da177e4
LT
2210
2211 /* if we've gotten here through NAPI, check netpoll */
bea3348e 2212 if (netpoll_receive_skb(skb))
1da177e4
LT
2213 return NET_RX_DROP;
2214
b7aa0bf7 2215 if (!skb->tstamp.tv64)
a61bbcf2 2216 net_timestamp(skb);
1da177e4 2217
c01003c2
PM
2218 if (!skb->iif)
2219 skb->iif = skb->dev->ifindex;
86e65da9 2220
0d7a3681 2221 null_or_orig = NULL;
cc9bd5ce
JE
2222 orig_dev = skb->dev;
2223 if (orig_dev->master) {
0d7a3681
JE
2224 if (skb_bond_should_drop(skb))
2225 null_or_orig = orig_dev; /* deliver only exact match */
2226 else
2227 skb->dev = orig_dev->master;
cc9bd5ce 2228 }
8f903c70 2229
1da177e4
LT
2230 __get_cpu_var(netdev_rx_stat).total++;
2231
c1d2bbe1 2232 skb_reset_network_header(skb);
badff6d0 2233 skb_reset_transport_header(skb);
b0e380b1 2234 skb->mac_len = skb->network_header - skb->mac_header;
1da177e4
LT
2235
2236 pt_prev = NULL;
2237
2238 rcu_read_lock();
2239
b9f75f45
EB
2240 /* Don't receive packets in an exiting network namespace */
2241 if (!net_alive(dev_net(skb->dev)))
2242 goto out;
2243
1da177e4
LT
2244#ifdef CONFIG_NET_CLS_ACT
2245 if (skb->tc_verd & TC_NCLS) {
2246 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2247 goto ncls;
2248 }
2249#endif
2250
2251 list_for_each_entry_rcu(ptype, &ptype_all, list) {
f982307f
JE
2252 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2253 ptype->dev == orig_dev) {
4ec93edb 2254 if (pt_prev)
f2ccd8fa 2255 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
2256 pt_prev = ptype;
2257 }
2258 }
2259
2260#ifdef CONFIG_NET_CLS_ACT
f697c3e8
HX
2261 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2262 if (!skb)
1da177e4 2263 goto out;
1da177e4
LT
2264ncls:
2265#endif
2266
6229e362 2267 skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
b863ceb7
PM
2268 if (!skb)
2269 goto out;
2270 skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
6229e362 2271 if (!skb)
1da177e4
LT
2272 goto out;
2273
2274 type = skb->protocol;
82d8a867
PE
2275 list_for_each_entry_rcu(ptype,
2276 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1da177e4 2277 if (ptype->type == type &&
f982307f
JE
2278 (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2279 ptype->dev == orig_dev)) {
4ec93edb 2280 if (pt_prev)
f2ccd8fa 2281 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
2282 pt_prev = ptype;
2283 }
2284 }
2285
2286 if (pt_prev) {
f2ccd8fa 2287 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1da177e4
LT
2288 } else {
2289 kfree_skb(skb);
 2290 /* Jamal, now you will not be able to escape explaining
 2291 * to me how you were going to use this. :-)
2292 */
2293 ret = NET_RX_DROP;
2294 }
2295
2296out:
2297 rcu_read_unlock();
2298 return ret;
2299}
2300
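/*
 * Illustrative sketch (hypothetical NAPI driver, not part of dev.c):
 * netif_receive_skb() is meant to be called from a driver's ->poll()
 * handler, i.e. from softirq context, one skb at a time, up to the budget
 * granted by net_rx_action() below.  example_hw_next_frame() is a made-up
 * helper standing in for the driver's RX-ring handling.
 */
static int example_poll(struct napi_struct *napi, int budget)
{
	int work = 0;
	struct sk_buff *skb;

	while (work < budget &&
	       (skb = example_hw_next_frame(napi)) != NULL) {
		skb->protocol = eth_type_trans(skb, skb->dev);
		netif_receive_skb(skb);
		work++;
	}

	/* If the ring drained before the budget ran out, a real driver
	 * would complete NAPI and re-enable its RX interrupt here. */
	return work;
}
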
6e583ce5
SH
2301/* Network device is going away, flush any packets still pending */
2302static void flush_backlog(void *arg)
2303{
2304 struct net_device *dev = arg;
2305 struct softnet_data *queue = &__get_cpu_var(softnet_data);
2306 struct sk_buff *skb, *tmp;
2307
2308 skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2309 if (skb->dev == dev) {
2310 __skb_unlink(skb, &queue->input_pkt_queue);
2311 kfree_skb(skb);
2312 }
2313}
2314
bea3348e 2315static int process_backlog(struct napi_struct *napi, int quota)
1da177e4
LT
2316{
2317 int work = 0;
1da177e4
LT
2318 struct softnet_data *queue = &__get_cpu_var(softnet_data);
2319 unsigned long start_time = jiffies;
2320
bea3348e
SH
2321 napi->weight = weight_p;
2322 do {
1da177e4 2323 struct sk_buff *skb;
1da177e4
LT
2324
2325 local_irq_disable();
2326 skb = __skb_dequeue(&queue->input_pkt_queue);
bea3348e
SH
2327 if (!skb) {
2328 __napi_complete(napi);
2329 local_irq_enable();
2330 break;
2331 }
1da177e4
LT
2332 local_irq_enable();
2333
1da177e4 2334 netif_receive_skb(skb);
bea3348e 2335 } while (++work < quota && jiffies == start_time);
1da177e4 2336
bea3348e
SH
2337 return work;
2338}
1da177e4 2339
bea3348e
SH
2340/**
2341 * __napi_schedule - schedule for receive
c4ea43c5 2342 * @n: entry to schedule
bea3348e
SH
2343 *
2344 * The entry's receive function will be scheduled to run
2345 */
b5606c2d 2346void __napi_schedule(struct napi_struct *n)
bea3348e
SH
2347{
2348 unsigned long flags;
1da177e4 2349
bea3348e
SH
2350 local_irq_save(flags);
2351 list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2352 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2353 local_irq_restore(flags);
1da177e4 2354}
bea3348e
SH
2355EXPORT_SYMBOL(__napi_schedule);
2356
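/*
 * Illustrative sketch (hypothetical driver interrupt handler, not part of
 * dev.c): an interrupt handler typically masks further RX interrupts and
 * calls napi_schedule(), which ends up in __napi_schedule() above and
 * raises NET_RX_SOFTIRQ; the packet work then happens later in ->poll().
 * struct example_priv and example_hw_mask_rx_irq() are made up.
 */
struct example_priv {
	struct napi_struct napi;
};

static irqreturn_t example_interrupt(int irq, void *dev_id)
{
	struct example_priv *priv = dev_id;

	example_hw_mask_rx_irq(priv);		/* hypothetical hardware helper */
	napi_schedule(&priv->napi);
	return IRQ_HANDLED;
}
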
1da177e4
LT
2357
2358static void net_rx_action(struct softirq_action *h)
2359{
bea3348e 2360 struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
1da177e4 2361 unsigned long start_time = jiffies;
51b0bded 2362 int budget = netdev_budget;
53fb95d3
MM
2363 void *have;
2364
1da177e4
LT
2365 local_irq_disable();
2366
bea3348e
SH
2367 while (!list_empty(list)) {
2368 struct napi_struct *n;
2369 int work, weight;
1da177e4 2370
bea3348e
SH
 2371 /* If the softirq window is exhausted then punt.
2372 *
2373 * Note that this is a slight policy change from the
2374 * previous NAPI code, which would allow up to 2
2375 * jiffies to pass before breaking out. The test
2376 * used to be "jiffies - start_time > 1".
2377 */
2378 if (unlikely(budget <= 0 || jiffies != start_time))
1da177e4
LT
2379 goto softnet_break;
2380
2381 local_irq_enable();
2382
bea3348e
SH
2383 /* Even though interrupts have been re-enabled, this
2384 * access is safe because interrupts can only add new
2385 * entries to the tail of this list, and only ->poll()
2386 * calls can remove this head entry from the list.
2387 */
2388 n = list_entry(list->next, struct napi_struct, poll_list);
1da177e4 2389
bea3348e
SH
2390 have = netpoll_poll_lock(n);
2391
2392 weight = n->weight;
2393
0a7606c1
DM
2394 /* This NAPI_STATE_SCHED test is for avoiding a race
2395 * with netpoll's poll_napi(). Only the entity which
2396 * obtains the lock and sees NAPI_STATE_SCHED set will
2397 * actually make the ->poll() call. Therefore we avoid
 2398 * accidentally calling ->poll() when NAPI is not scheduled.
2399 */
2400 work = 0;
2401 if (test_bit(NAPI_STATE_SCHED, &n->state))
2402 work = n->poll(n, weight);
bea3348e
SH
2403
2404 WARN_ON_ONCE(work > weight);
2405
2406 budget -= work;
2407
2408 local_irq_disable();
2409
2410 /* Drivers must not modify the NAPI state if they
2411 * consume the entire weight. In such cases this code
2412 * still "owns" the NAPI instance and therefore can
2413 * move the instance around on the list at-will.
2414 */
fed17f30
DM
2415 if (unlikely(work == weight)) {
2416 if (unlikely(napi_disable_pending(n)))
2417 __napi_complete(n);
2418 else
2419 list_move_tail(&n->poll_list, list);
2420 }
bea3348e
SH
2421
2422 netpoll_poll_unlock(have);
1da177e4
LT
2423 }
2424out:
515e06c4 2425 local_irq_enable();
bea3348e 2426
db217334
CL
2427#ifdef CONFIG_NET_DMA
2428 /*
2429 * There may not be any more sk_buffs coming right now, so push
2430 * any pending DMA copies to hardware
2431 */
d379b01e
DW
2432 if (!cpus_empty(net_dma.channel_mask)) {
2433 int chan_idx;
0e12f848 2434 for_each_cpu_mask_nr(chan_idx, net_dma.channel_mask) {
d379b01e
DW
2435 struct dma_chan *chan = net_dma.channels[chan_idx];
2436 if (chan)
2437 dma_async_memcpy_issue_pending(chan);
2438 }
db217334
CL
2439 }
2440#endif
bea3348e 2441
1da177e4
LT
2442 return;
2443
2444softnet_break:
2445 __get_cpu_var(netdev_rx_stat).time_squeeze++;
2446 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2447 goto out;
2448}
2449
2450static gifconf_func_t * gifconf_list [NPROTO];
2451
2452/**
2453 * register_gifconf - register a SIOCGIF handler
2454 * @family: Address family
2455 * @gifconf: Function handler
2456 *
2457 * Register protocol dependent address dumping routines. The handler
2458 * that is passed must not be freed or reused until it has been replaced
2459 * by another handler.
2460 */
2461int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2462{
2463 if (family >= NPROTO)
2464 return -EINVAL;
2465 gifconf_list[family] = gifconf;
2466 return 0;
2467}
2468
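/*
 * Illustrative sketch (hypothetical protocol code, not part of dev.c): an
 * address family registers a dumper so that SIOCGIFCONF can report its
 * per-device addresses.  The handler signature is inferred from the calls
 * in dev_ifconf() above: when the buffer pointer is NULL it only reports
 * how much space it would need.  AF_EXAMPLE is hypothetical.
 */
static int example_gifconf(struct net_device *dev, char __user *buf, int len)
{
	/* A real handler would copy one struct ifreq per address of this
	 * family configured on @dev and return the number of bytes used
	 * (or needed, when @buf is NULL). */
	return 0;
}
/* From the protocol's init path:
 *	register_gifconf(AF_EXAMPLE, example_gifconf);
 */
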
2469
2470/*
2471 * Map an interface index to its name (SIOCGIFNAME)
2472 */
2473
2474/*
2475 * We need this ioctl for efficient implementation of the
2476 * if_indextoname() function required by the IPv6 API. Without
2477 * it, we would have to search all the interfaces to find a
2478 * match. --pb
2479 */
2480
881d966b 2481static int dev_ifname(struct net *net, struct ifreq __user *arg)
1da177e4
LT
2482{
2483 struct net_device *dev;
2484 struct ifreq ifr;
2485
2486 /*
2487 * Fetch the caller's info block.
2488 */
2489
2490 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2491 return -EFAULT;
2492
2493 read_lock(&dev_base_lock);
881d966b 2494 dev = __dev_get_by_index(net, ifr.ifr_ifindex);
1da177e4
LT
2495 if (!dev) {
2496 read_unlock(&dev_base_lock);
2497 return -ENODEV;
2498 }
2499
2500 strcpy(ifr.ifr_name, dev->name);
2501 read_unlock(&dev_base_lock);
2502
2503 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2504 return -EFAULT;
2505 return 0;
2506}
2507
2508/*
2509 * Perform a SIOCGIFCONF call. This structure will change
2510 * size eventually, and there is nothing I can do about it.
2511 * Thus we will need a 'compatibility mode'.
2512 */
2513
881d966b 2514static int dev_ifconf(struct net *net, char __user *arg)
1da177e4
LT
2515{
2516 struct ifconf ifc;
2517 struct net_device *dev;
2518 char __user *pos;
2519 int len;
2520 int total;
2521 int i;
2522
2523 /*
2524 * Fetch the caller's info block.
2525 */
2526
2527 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2528 return -EFAULT;
2529
2530 pos = ifc.ifc_buf;
2531 len = ifc.ifc_len;
2532
2533 /*
2534 * Loop over the interfaces, and write an info block for each.
2535 */
2536
2537 total = 0;
881d966b 2538 for_each_netdev(net, dev) {
1da177e4
LT
2539 for (i = 0; i < NPROTO; i++) {
2540 if (gifconf_list[i]) {
2541 int done;
2542 if (!pos)
2543 done = gifconf_list[i](dev, NULL, 0);
2544 else
2545 done = gifconf_list[i](dev, pos + total,
2546 len - total);
2547 if (done < 0)
2548 return -EFAULT;
2549 total += done;
2550 }
2551 }
4ec93edb 2552 }
1da177e4
LT
2553
2554 /*
2555 * All done. Write the updated control block back to the caller.
2556 */
2557 ifc.ifc_len = total;
2558
2559 /*
2560 * Both BSD and Solaris return 0 here, so we do too.
2561 */
2562 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2563}
2564
2565#ifdef CONFIG_PROC_FS
2566/*
2567 * This is invoked by the /proc filesystem handler to display a device
2568 * in detail.
2569 */
7562f876 2570void *dev_seq_start(struct seq_file *seq, loff_t *pos)
9a429c49 2571 __acquires(dev_base_lock)
1da177e4 2572{
e372c414 2573 struct net *net = seq_file_net(seq);
7562f876 2574 loff_t off;
1da177e4 2575 struct net_device *dev;
1da177e4 2576
7562f876
PE
2577 read_lock(&dev_base_lock);
2578 if (!*pos)
2579 return SEQ_START_TOKEN;
1da177e4 2580
7562f876 2581 off = 1;
881d966b 2582 for_each_netdev(net, dev)
7562f876
PE
2583 if (off++ == *pos)
2584 return dev;
1da177e4 2585
7562f876 2586 return NULL;
1da177e4
LT
2587}
2588
2589void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2590{
e372c414 2591 struct net *net = seq_file_net(seq);
1da177e4 2592 ++*pos;
7562f876 2593 return v == SEQ_START_TOKEN ?
881d966b 2594 first_net_device(net) : next_net_device((struct net_device *)v);
1da177e4
LT
2595}
2596
2597void dev_seq_stop(struct seq_file *seq, void *v)
9a429c49 2598 __releases(dev_base_lock)
1da177e4
LT
2599{
2600 read_unlock(&dev_base_lock);
2601}
2602
2603static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2604{
c45d286e 2605 struct net_device_stats *stats = dev->get_stats(dev);
1da177e4 2606
5a1b5898
RR
2607 seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2608 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2609 dev->name, stats->rx_bytes, stats->rx_packets,
2610 stats->rx_errors,
2611 stats->rx_dropped + stats->rx_missed_errors,
2612 stats->rx_fifo_errors,
2613 stats->rx_length_errors + stats->rx_over_errors +
2614 stats->rx_crc_errors + stats->rx_frame_errors,
2615 stats->rx_compressed, stats->multicast,
2616 stats->tx_bytes, stats->tx_packets,
2617 stats->tx_errors, stats->tx_dropped,
2618 stats->tx_fifo_errors, stats->collisions,
2619 stats->tx_carrier_errors +
2620 stats->tx_aborted_errors +
2621 stats->tx_window_errors +
2622 stats->tx_heartbeat_errors,
2623 stats->tx_compressed);
1da177e4
LT
2624}
2625
2626/*
2627 * Called from the PROCfs module. This now uses the new arbitrary sized
2628 * /proc/net interface to create /proc/net/dev
2629 */
2630static int dev_seq_show(struct seq_file *seq, void *v)
2631{
2632 if (v == SEQ_START_TOKEN)
2633 seq_puts(seq, "Inter-| Receive "
2634 " | Transmit\n"
2635 " face |bytes packets errs drop fifo frame "
2636 "compressed multicast|bytes packets errs "
2637 "drop fifo colls carrier compressed\n");
2638 else
2639 dev_seq_printf_stats(seq, v);
2640 return 0;
2641}
2642
2643static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2644{
2645 struct netif_rx_stats *rc = NULL;
2646
0c0b0aca 2647 while (*pos < nr_cpu_ids)
4ec93edb 2648 if (cpu_online(*pos)) {
1da177e4
LT
2649 rc = &per_cpu(netdev_rx_stat, *pos);
2650 break;
2651 } else
2652 ++*pos;
2653 return rc;
2654}
2655
2656static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2657{
2658 return softnet_get_online(pos);
2659}
2660
2661static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2662{
2663 ++*pos;
2664 return softnet_get_online(pos);
2665}
2666
2667static void softnet_seq_stop(struct seq_file *seq, void *v)
2668{
2669}
2670
2671static int softnet_seq_show(struct seq_file *seq, void *v)
2672{
2673 struct netif_rx_stats *s = v;
2674
2675 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
31aa02c5 2676 s->total, s->dropped, s->time_squeeze, 0,
c1ebcdb8
SH
2677 0, 0, 0, 0, /* was fastroute */
2678 s->cpu_collision );
1da177e4
LT
2679 return 0;
2680}
2681
f690808e 2682static const struct seq_operations dev_seq_ops = {
1da177e4
LT
2683 .start = dev_seq_start,
2684 .next = dev_seq_next,
2685 .stop = dev_seq_stop,
2686 .show = dev_seq_show,
2687};
2688
2689static int dev_seq_open(struct inode *inode, struct file *file)
2690{
e372c414
DL
2691 return seq_open_net(inode, file, &dev_seq_ops,
2692 sizeof(struct seq_net_private));
1da177e4
LT
2693}
2694
9a32144e 2695static const struct file_operations dev_seq_fops = {
1da177e4
LT
2696 .owner = THIS_MODULE,
2697 .open = dev_seq_open,
2698 .read = seq_read,
2699 .llseek = seq_lseek,
e372c414 2700 .release = seq_release_net,
1da177e4
LT
2701};
2702
f690808e 2703static const struct seq_operations softnet_seq_ops = {
1da177e4
LT
2704 .start = softnet_seq_start,
2705 .next = softnet_seq_next,
2706 .stop = softnet_seq_stop,
2707 .show = softnet_seq_show,
2708};
2709
2710static int softnet_seq_open(struct inode *inode, struct file *file)
2711{
2712 return seq_open(file, &softnet_seq_ops);
2713}
2714
9a32144e 2715static const struct file_operations softnet_seq_fops = {
1da177e4
LT
2716 .owner = THIS_MODULE,
2717 .open = softnet_seq_open,
2718 .read = seq_read,
2719 .llseek = seq_lseek,
2720 .release = seq_release,
2721};
2722
0e1256ff
SH
2723static void *ptype_get_idx(loff_t pos)
2724{
2725 struct packet_type *pt = NULL;
2726 loff_t i = 0;
2727 int t;
2728
2729 list_for_each_entry_rcu(pt, &ptype_all, list) {
2730 if (i == pos)
2731 return pt;
2732 ++i;
2733 }
2734
82d8a867 2735 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
0e1256ff
SH
2736 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
2737 if (i == pos)
2738 return pt;
2739 ++i;
2740 }
2741 }
2742 return NULL;
2743}
2744
2745static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
72348a42 2746 __acquires(RCU)
0e1256ff
SH
2747{
2748 rcu_read_lock();
2749 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
2750}
2751
2752static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2753{
2754 struct packet_type *pt;
2755 struct list_head *nxt;
2756 int hash;
2757
2758 ++*pos;
2759 if (v == SEQ_START_TOKEN)
2760 return ptype_get_idx(0);
2761
2762 pt = v;
2763 nxt = pt->list.next;
2764 if (pt->type == htons(ETH_P_ALL)) {
2765 if (nxt != &ptype_all)
2766 goto found;
2767 hash = 0;
2768 nxt = ptype_base[0].next;
2769 } else
82d8a867 2770 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
0e1256ff
SH
2771
2772 while (nxt == &ptype_base[hash]) {
82d8a867 2773 if (++hash >= PTYPE_HASH_SIZE)
0e1256ff
SH
2774 return NULL;
2775 nxt = ptype_base[hash].next;
2776 }
2777found:
2778 return list_entry(nxt, struct packet_type, list);
2779}
2780
2781static void ptype_seq_stop(struct seq_file *seq, void *v)
72348a42 2782 __releases(RCU)
0e1256ff
SH
2783{
2784 rcu_read_unlock();
2785}
2786
2787static void ptype_seq_decode(struct seq_file *seq, void *sym)
2788{
2789#ifdef CONFIG_KALLSYMS
2790 unsigned long offset = 0, symsize;
2791 const char *symname;
2792 char *modname;
2793 char namebuf[128];
2794
2795 symname = kallsyms_lookup((unsigned long)sym, &symsize, &offset,
2796 &modname, namebuf);
2797
2798 if (symname) {
2799 char *delim = ":";
2800
2801 if (!modname)
2802 modname = delim = "";
2803 seq_printf(seq, "%s%s%s%s+0x%lx", delim, modname, delim,
2804 symname, offset);
2805 return;
2806 }
2807#endif
2808
2809 seq_printf(seq, "[%p]", sym);
2810}
2811
2812static int ptype_seq_show(struct seq_file *seq, void *v)
2813{
2814 struct packet_type *pt = v;
2815
2816 if (v == SEQ_START_TOKEN)
2817 seq_puts(seq, "Type Device Function\n");
c346dca1 2818 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
0e1256ff
SH
2819 if (pt->type == htons(ETH_P_ALL))
2820 seq_puts(seq, "ALL ");
2821 else
2822 seq_printf(seq, "%04x", ntohs(pt->type));
2823
2824 seq_printf(seq, " %-8s ",
2825 pt->dev ? pt->dev->name : "");
2826 ptype_seq_decode(seq, pt->func);
2827 seq_putc(seq, '\n');
2828 }
2829
2830 return 0;
2831}
2832
2833static const struct seq_operations ptype_seq_ops = {
2834 .start = ptype_seq_start,
2835 .next = ptype_seq_next,
2836 .stop = ptype_seq_stop,
2837 .show = ptype_seq_show,
2838};
2839
2840static int ptype_seq_open(struct inode *inode, struct file *file)
2841{
2feb27db
PE
2842 return seq_open_net(inode, file, &ptype_seq_ops,
2843 sizeof(struct seq_net_private));
0e1256ff
SH
2844}
2845
2846static const struct file_operations ptype_seq_fops = {
2847 .owner = THIS_MODULE,
2848 .open = ptype_seq_open,
2849 .read = seq_read,
2850 .llseek = seq_lseek,
2feb27db 2851 .release = seq_release_net,
0e1256ff
SH
2852};
2853
2854
4665079c 2855static int __net_init dev_proc_net_init(struct net *net)
1da177e4
LT
2856{
2857 int rc = -ENOMEM;
2858
881d966b 2859 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
1da177e4 2860 goto out;
881d966b 2861 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
1da177e4 2862 goto out_dev;
881d966b 2863 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
457c4cbc 2864 goto out_softnet;
0e1256ff 2865
881d966b 2866 if (wext_proc_init(net))
457c4cbc 2867 goto out_ptype;
1da177e4
LT
2868 rc = 0;
2869out:
2870 return rc;
457c4cbc 2871out_ptype:
881d966b 2872 proc_net_remove(net, "ptype");
1da177e4 2873out_softnet:
881d966b 2874 proc_net_remove(net, "softnet_stat");
1da177e4 2875out_dev:
881d966b 2876 proc_net_remove(net, "dev");
1da177e4
LT
2877 goto out;
2878}
881d966b 2879
4665079c 2880static void __net_exit dev_proc_net_exit(struct net *net)
881d966b
EB
2881{
2882 wext_proc_exit(net);
2883
2884 proc_net_remove(net, "ptype");
2885 proc_net_remove(net, "softnet_stat");
2886 proc_net_remove(net, "dev");
2887}
2888
022cbae6 2889static struct pernet_operations __net_initdata dev_proc_ops = {
881d966b
EB
2890 .init = dev_proc_net_init,
2891 .exit = dev_proc_net_exit,
2892};
2893
2894static int __init dev_proc_init(void)
2895{
2896 return register_pernet_subsys(&dev_proc_ops);
2897}
1da177e4
LT
2898#else
2899#define dev_proc_init() 0
2900#endif /* CONFIG_PROC_FS */
2901
2902
2903/**
2904 * netdev_set_master - set up master/slave pair
2905 * @slave: slave device
2906 * @master: new master device
2907 *
2908 * Changes the master device of the slave. Pass %NULL to break the
2909 * bonding. The caller must hold the RTNL semaphore. On a failure
2910 * a negative errno code is returned. On success the reference counts
2911 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2912 * function returns zero.
2913 */
2914int netdev_set_master(struct net_device *slave, struct net_device *master)
2915{
2916 struct net_device *old = slave->master;
2917
2918 ASSERT_RTNL();
2919
2920 if (master) {
2921 if (old)
2922 return -EBUSY;
2923 dev_hold(master);
2924 }
2925
2926 slave->master = master;
4ec93edb 2927
1da177e4
LT
2928 synchronize_net();
2929
2930 if (old)
2931 dev_put(old);
2932
2933 if (master)
2934 slave->flags |= IFF_SLAVE;
2935 else
2936 slave->flags &= ~IFF_SLAVE;
2937
2938 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2939 return 0;
2940}
2941
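/*
 * Illustrative sketch (hypothetical bonding-style caller, not part of
 * dev.c): netdev_set_master() must run under the RTNL lock; passing a
 * NULL master breaks the pairing again.
 */
static int example_enslave(struct net_device *master, struct net_device *slave)
{
	int err;

	rtnl_lock();
	err = netdev_set_master(slave, master);
	rtnl_unlock();
	return err;
}
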
dad9b335 2942static int __dev_set_promiscuity(struct net_device *dev, int inc)
1da177e4
LT
2943{
2944 unsigned short old_flags = dev->flags;
2945
24023451
PM
2946 ASSERT_RTNL();
2947
dad9b335
WC
2948 dev->flags |= IFF_PROMISC;
2949 dev->promiscuity += inc;
2950 if (dev->promiscuity == 0) {
2951 /*
2952 * Avoid overflow.
2953 * If inc causes overflow, untouch promisc and return error.
2954 */
2955 if (inc < 0)
2956 dev->flags &= ~IFF_PROMISC;
2957 else {
2958 dev->promiscuity -= inc;
2959 printk(KERN_WARNING "%s: promiscuity touches roof, "
2960 "set promiscuity failed, promiscuity feature "
2961 "of device might be broken.\n", dev->name);
2962 return -EOVERFLOW;
2963 }
2964 }
52609c0b 2965 if (dev->flags != old_flags) {
1da177e4
LT
2966 printk(KERN_INFO "device %s %s promiscuous mode\n",
2967 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4ec93edb 2968 "left");
7759db82
KHK
2969 if (audit_enabled)
2970 audit_log(current->audit_context, GFP_ATOMIC,
2971 AUDIT_ANOM_PROMISCUOUS,
2972 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
2973 dev->name, (dev->flags & IFF_PROMISC),
2974 (old_flags & IFF_PROMISC),
2975 audit_get_loginuid(current),
2976 current->uid, current->gid,
2977 audit_get_sessionid(current));
24023451
PM
2978
2979 if (dev->change_rx_flags)
2980 dev->change_rx_flags(dev, IFF_PROMISC);
1da177e4 2981 }
dad9b335 2982 return 0;
1da177e4
LT
2983}
2984
4417da66
PM
2985/**
2986 * dev_set_promiscuity - update promiscuity count on a device
2987 * @dev: device
2988 * @inc: modifier
2989 *
2990 * Add or remove promiscuity from a device. While the count in the device
2991 * remains above zero the interface remains promiscuous. Once it hits zero
2992 * the device reverts back to normal filtering operation. A negative inc
2993 * value is used to drop promiscuity on the device.
dad9b335 2994 * Return 0 if successful or a negative errno code on error.
4417da66 2995 */
dad9b335 2996int dev_set_promiscuity(struct net_device *dev, int inc)
4417da66
PM
2997{
2998 unsigned short old_flags = dev->flags;
dad9b335 2999 int err;
4417da66 3000
dad9b335 3001 err = __dev_set_promiscuity(dev, inc);
4b5a698e 3002 if (err < 0)
dad9b335 3003 return err;
4417da66
PM
3004 if (dev->flags != old_flags)
3005 dev_set_rx_mode(dev);
dad9b335 3006 return err;
4417da66
PM
3007}
3008
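/*
 * Illustrative sketch (hypothetical caller, not part of dev.c): a
 * packet-capture user bumps the promiscuity count while it is bound to
 * the device and drops it again on close; the interface only leaves
 * promiscuous mode once every such user has released its reference.
 * Both calls must be made with the RTNL lock held.
 */
static int example_start_capture(struct net_device *dev)
{
	return dev_set_promiscuity(dev, 1);
}

static void example_stop_capture(struct net_device *dev)
{
	dev_set_promiscuity(dev, -1);
}
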
1da177e4
LT
3009/**
3010 * dev_set_allmulti - update allmulti count on a device
3011 * @dev: device
3012 * @inc: modifier
3013 *
3014 * Add or remove reception of all multicast frames to a device. While the
3015 * count in the device remains above zero the interface remains listening
3016 * to all interfaces. Once it hits zero the device reverts back to normal
3017 * filtering operation. A negative @inc value is used to drop the counter
3018 * when releasing a resource needing all multicasts.
dad9b335 3019 * Return 0 if successful or a negative errno code on error.
1da177e4
LT
3020 */
3021
dad9b335 3022int dev_set_allmulti(struct net_device *dev, int inc)
1da177e4
LT
3023{
3024 unsigned short old_flags = dev->flags;
3025
24023451
PM
3026 ASSERT_RTNL();
3027
1da177e4 3028 dev->flags |= IFF_ALLMULTI;
dad9b335
WC
3029 dev->allmulti += inc;
3030 if (dev->allmulti == 0) {
3031 /*
3032 * Avoid overflow.
3033 * If inc causes overflow, untouch allmulti and return error.
3034 */
3035 if (inc < 0)
3036 dev->flags &= ~IFF_ALLMULTI;
3037 else {
3038 dev->allmulti -= inc;
3039 printk(KERN_WARNING "%s: allmulti touches roof, "
3040 "set allmulti failed, allmulti feature of "
3041 "device might be broken.\n", dev->name);
3042 return -EOVERFLOW;
3043 }
3044 }
24023451
PM
3045 if (dev->flags ^ old_flags) {
3046 if (dev->change_rx_flags)
3047 dev->change_rx_flags(dev, IFF_ALLMULTI);
4417da66 3048 dev_set_rx_mode(dev);
24023451 3049 }
dad9b335 3050 return 0;
4417da66
PM
3051}
3052
3053/*
3054 * Upload unicast and multicast address lists to device and
3055 * configure RX filtering. When the device doesn't support unicast
53ccaae1 3056 * filtering it is put in promiscuous mode while unicast addresses
4417da66
PM
3057 * are present.
3058 */
3059void __dev_set_rx_mode(struct net_device *dev)
3060{
3061 /* dev_open will call this function so the list will stay sane. */
3062 if (!(dev->flags&IFF_UP))
3063 return;
3064
3065 if (!netif_device_present(dev))
40b77c94 3066 return;
4417da66
PM
3067
3068 if (dev->set_rx_mode)
3069 dev->set_rx_mode(dev);
3070 else {
3071 /* Unicast addresses changes may only happen under the rtnl,
3072 * therefore calling __dev_set_promiscuity here is safe.
3073 */
3074 if (dev->uc_count > 0 && !dev->uc_promisc) {
3075 __dev_set_promiscuity(dev, 1);
3076 dev->uc_promisc = 1;
3077 } else if (dev->uc_count == 0 && dev->uc_promisc) {
3078 __dev_set_promiscuity(dev, -1);
3079 dev->uc_promisc = 0;
3080 }
3081
3082 if (dev->set_multicast_list)
3083 dev->set_multicast_list(dev);
3084 }
3085}
3086
3087void dev_set_rx_mode(struct net_device *dev)
3088{
b9e40857 3089 netif_addr_lock_bh(dev);
4417da66 3090 __dev_set_rx_mode(dev);
b9e40857 3091 netif_addr_unlock_bh(dev);
1da177e4
LT
3092}
3093
61cbc2fc
PM
3094int __dev_addr_delete(struct dev_addr_list **list, int *count,
3095 void *addr, int alen, int glbl)
bf742482
PM
3096{
3097 struct dev_addr_list *da;
3098
3099 for (; (da = *list) != NULL; list = &da->next) {
3100 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3101 alen == da->da_addrlen) {
3102 if (glbl) {
3103 int old_glbl = da->da_gusers;
3104 da->da_gusers = 0;
3105 if (old_glbl == 0)
3106 break;
3107 }
3108 if (--da->da_users)
3109 return 0;
3110
3111 *list = da->next;
3112 kfree(da);
61cbc2fc 3113 (*count)--;
bf742482
PM
3114 return 0;
3115 }
3116 }
3117 return -ENOENT;
3118}
3119
61cbc2fc
PM
3120int __dev_addr_add(struct dev_addr_list **list, int *count,
3121 void *addr, int alen, int glbl)
bf742482
PM
3122{
3123 struct dev_addr_list *da;
3124
3125 for (da = *list; da != NULL; da = da->next) {
3126 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3127 da->da_addrlen == alen) {
3128 if (glbl) {
3129 int old_glbl = da->da_gusers;
3130 da->da_gusers = 1;
3131 if (old_glbl)
3132 return 0;
3133 }
3134 da->da_users++;
3135 return 0;
3136 }
3137 }
3138
12aa343a 3139 da = kzalloc(sizeof(*da), GFP_ATOMIC);
bf742482
PM
3140 if (da == NULL)
3141 return -ENOMEM;
3142 memcpy(da->da_addr, addr, alen);
3143 da->da_addrlen = alen;
3144 da->da_users = 1;
3145 da->da_gusers = glbl ? 1 : 0;
3146 da->next = *list;
3147 *list = da;
61cbc2fc 3148 (*count)++;
bf742482
PM
3149 return 0;
3150}
3151
4417da66
PM
3152/**
3153 * dev_unicast_delete - Release secondary unicast address.
3154 * @dev: device
0ed72ec4
RD
3155 * @addr: address to delete
3156 * @alen: length of @addr
4417da66
PM
3157 *
3158 * Release reference to a secondary unicast address and remove it
0ed72ec4 3159 * from the device if the reference count drops to zero.
4417da66
PM
3160 *
3161 * The caller must hold the rtnl_mutex.
3162 */
3163int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
3164{
3165 int err;
3166
3167 ASSERT_RTNL();
3168
b9e40857 3169 netif_addr_lock_bh(dev);
61cbc2fc
PM
3170 err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3171 if (!err)
4417da66 3172 __dev_set_rx_mode(dev);
b9e40857 3173 netif_addr_unlock_bh(dev);
4417da66
PM
3174 return err;
3175}
3176EXPORT_SYMBOL(dev_unicast_delete);
3177
3178/**
3179 * dev_unicast_add - add a secondary unicast address
3180 * @dev: device
5dbaec5d 3181 * @addr: address to add
0ed72ec4 3182 * @alen: length of @addr
4417da66
PM
3183 *
3184 * Add a secondary unicast address to the device or increase
3185 * the reference count if it already exists.
3186 *
3187 * The caller must hold the rtnl_mutex.
3188 */
3189int dev_unicast_add(struct net_device *dev, void *addr, int alen)
3190{
3191 int err;
3192
3193 ASSERT_RTNL();
3194
b9e40857 3195 netif_addr_lock_bh(dev);
61cbc2fc
PM
3196 err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3197 if (!err)
4417da66 3198 __dev_set_rx_mode(dev);
b9e40857 3199 netif_addr_unlock_bh(dev);
4417da66
PM
3200 return err;
3201}
3202EXPORT_SYMBOL(dev_unicast_add);
3203
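/*
 * Illustrative sketch (hypothetical caller, not part of dev.c): a
 * virtualisation or filtering driver adds a secondary MAC address it
 * wants the hardware to accept, and removes it again with
 * dev_unicast_delete() when done.  Both calls need the rtnl_mutex, as
 * documented above; the address below is an arbitrary example.
 */
static u8 example_mac[ETH_ALEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };

static int example_listen_on_extra_mac(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_unicast_add(dev, example_mac, ETH_ALEN);
	rtnl_unlock();
	return err;
}
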
e83a2ea8
CL
3204int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3205 struct dev_addr_list **from, int *from_count)
3206{
3207 struct dev_addr_list *da, *next;
3208 int err = 0;
3209
3210 da = *from;
3211 while (da != NULL) {
3212 next = da->next;
3213 if (!da->da_synced) {
3214 err = __dev_addr_add(to, to_count,
3215 da->da_addr, da->da_addrlen, 0);
3216 if (err < 0)
3217 break;
3218 da->da_synced = 1;
3219 da->da_users++;
3220 } else if (da->da_users == 1) {
3221 __dev_addr_delete(to, to_count,
3222 da->da_addr, da->da_addrlen, 0);
3223 __dev_addr_delete(from, from_count,
3224 da->da_addr, da->da_addrlen, 0);
3225 }
3226 da = next;
3227 }
3228 return err;
3229}
3230
3231void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3232 struct dev_addr_list **from, int *from_count)
3233{
3234 struct dev_addr_list *da, *next;
3235
3236 da = *from;
3237 while (da != NULL) {
3238 next = da->next;
3239 if (da->da_synced) {
3240 __dev_addr_delete(to, to_count,
3241 da->da_addr, da->da_addrlen, 0);
3242 da->da_synced = 0;
3243 __dev_addr_delete(from, from_count,
3244 da->da_addr, da->da_addrlen, 0);
3245 }
3246 da = next;
3247 }
3248}
3249
3250/**
3251 * dev_unicast_sync - Synchronize device's unicast list to another device
3252 * @to: destination device
3253 * @from: source device
3254 *
3255 * Add newly added addresses to the destination device and release
3256 * addresses that have no users left. The source device must be
3257 * locked by netif_tx_lock_bh.
3258 *
3259 * This function is intended to be called from the dev->set_rx_mode
3260 * function of layered software devices.
3261 */
3262int dev_unicast_sync(struct net_device *to, struct net_device *from)
3263{
3264 int err = 0;
3265
b9e40857 3266 netif_addr_lock_bh(to);
e83a2ea8
CL
3267 err = __dev_addr_sync(&to->uc_list, &to->uc_count,
3268 &from->uc_list, &from->uc_count);
3269 if (!err)
3270 __dev_set_rx_mode(to);
b9e40857 3271 netif_addr_unlock_bh(to);
e83a2ea8
CL
3272 return err;
3273}
3274EXPORT_SYMBOL(dev_unicast_sync);
3275
3276/**
bc2cda1e 3277 * dev_unicast_unsync - Remove synchronized addresses from the destination device
e83a2ea8
CL
3278 * @to: destination device
3279 * @from: source device
3280 *
3281 * Remove all addresses that were added to the destination device by
3282 * dev_unicast_sync(). This function is intended to be called from the
3283 * dev->stop function of layered software devices.
3284 */
3285void dev_unicast_unsync(struct net_device *to, struct net_device *from)
3286{
b9e40857 3287 netif_addr_lock_bh(from);
e308a5d8 3288 netif_addr_lock(to);
e83a2ea8
CL
3289
3290 __dev_addr_unsync(&to->uc_list, &to->uc_count,
3291 &from->uc_list, &from->uc_count);
3292 __dev_set_rx_mode(to);
3293
e308a5d8 3294 netif_addr_unlock(to);
b9e40857 3295 netif_addr_unlock_bh(from);
e83a2ea8
CL
3296}
3297EXPORT_SYMBOL(dev_unicast_unsync);
3298
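/*
 * Illustrative sketch (hypothetical layered driver, not part of dev.c):
 * as the comments above describe, a software device stacked on a lower
 * device propagates its secondary unicast list from ->set_rx_mode and
 * tears it down from ->stop.  example_get_lowerdev() is a made-up helper
 * returning the underlying device.
 */
static void example_set_rx_mode(struct net_device *dev)
{
	dev_unicast_sync(example_get_lowerdev(dev), dev);
}

static int example_stop(struct net_device *dev)
{
	dev_unicast_unsync(example_get_lowerdev(dev), dev);
	return 0;
}
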
12972621
DC
3299static void __dev_addr_discard(struct dev_addr_list **list)
3300{
3301 struct dev_addr_list *tmp;
3302
3303 while (*list != NULL) {
3304 tmp = *list;
3305 *list = tmp->next;
3306 if (tmp->da_users > tmp->da_gusers)
3307 printk("__dev_addr_discard: address leakage! "
3308 "da_users=%d\n", tmp->da_users);
3309 kfree(tmp);
3310 }
3311}
3312
26cc2522 3313static void dev_addr_discard(struct net_device *dev)
4417da66 3314{
b9e40857 3315 netif_addr_lock_bh(dev);
26cc2522 3316
4417da66
PM
3317 __dev_addr_discard(&dev->uc_list);
3318 dev->uc_count = 0;
4417da66 3319
456ad75c
DC
3320 __dev_addr_discard(&dev->mc_list);
3321 dev->mc_count = 0;
26cc2522 3322
b9e40857 3323 netif_addr_unlock_bh(dev);
456ad75c
DC
3324}
3325
1da177e4
LT
3326unsigned dev_get_flags(const struct net_device *dev)
3327{
3328 unsigned flags;
3329
3330 flags = (dev->flags & ~(IFF_PROMISC |
3331 IFF_ALLMULTI |
b00055aa
SR
3332 IFF_RUNNING |
3333 IFF_LOWER_UP |
3334 IFF_DORMANT)) |
1da177e4
LT
3335 (dev->gflags & (IFF_PROMISC |
3336 IFF_ALLMULTI));
3337
b00055aa
SR
3338 if (netif_running(dev)) {
3339 if (netif_oper_up(dev))
3340 flags |= IFF_RUNNING;
3341 if (netif_carrier_ok(dev))
3342 flags |= IFF_LOWER_UP;
3343 if (netif_dormant(dev))
3344 flags |= IFF_DORMANT;
3345 }
1da177e4
LT
3346
3347 return flags;
3348}
3349
3350int dev_change_flags(struct net_device *dev, unsigned flags)
3351{
7c355f53 3352 int ret, changes;
1da177e4
LT
3353 int old_flags = dev->flags;
3354
24023451
PM
3355 ASSERT_RTNL();
3356
1da177e4
LT
3357 /*
3358 * Set the flags on our device.
3359 */
3360
3361 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
3362 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
3363 IFF_AUTOMEDIA)) |
3364 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
3365 IFF_ALLMULTI));
3366
3367 /*
3368 * Load in the correct multicast list now the flags have changed.
3369 */
3370
0e91796e 3371 if (dev->change_rx_flags && (old_flags ^ flags) & IFF_MULTICAST)
24023451
PM
3372 dev->change_rx_flags(dev, IFF_MULTICAST);
3373
4417da66 3374 dev_set_rx_mode(dev);
1da177e4
LT
3375
3376 /*
 3377 * Have we downed the interface? We handle IFF_UP ourselves
3378 * according to user attempts to set it, rather than blindly
3379 * setting it.
3380 */
3381
3382 ret = 0;
3383 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
3384 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
3385
3386 if (!ret)
4417da66 3387 dev_set_rx_mode(dev);
1da177e4
LT
3388 }
3389
3390 if (dev->flags & IFF_UP &&
3391 ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
3392 IFF_VOLATILE)))
056925ab 3393 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1da177e4
LT
3394
3395 if ((flags ^ dev->gflags) & IFF_PROMISC) {
3396 int inc = (flags & IFF_PROMISC) ? +1 : -1;
3397 dev->gflags ^= IFF_PROMISC;
3398 dev_set_promiscuity(dev, inc);
3399 }
3400
3401 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
 3402 is important. Some (broken) drivers set IFF_PROMISC when
 3403 IFF_ALLMULTI is requested, without asking us and without reporting it.
3404 */
3405 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
3406 int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
3407 dev->gflags ^= IFF_ALLMULTI;
3408 dev_set_allmulti(dev, inc);
3409 }
3410
7c355f53
TG
3411 /* Exclude state transition flags, already notified */
3412 changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
3413 if (changes)
3414 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
1da177e4
LT
3415
3416 return ret;
3417}
3418
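/*
 * Illustrative sketch (hypothetical caller, not part of dev.c): bringing
 * an interface administratively up, the way an SIOCSIFFLAGS request does,
 * amounts to setting IFF_UP in the user-visible flags under RTNL.
 */
static int example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
	rtnl_unlock();
	return err;
}
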
3419int dev_set_mtu(struct net_device *dev, int new_mtu)
3420{
3421 int err;
3422
3423 if (new_mtu == dev->mtu)
3424 return 0;
3425
3426 /* MTU must be positive. */
3427 if (new_mtu < 0)
3428 return -EINVAL;
3429
3430 if (!netif_device_present(dev))
3431 return -ENODEV;
3432
3433 err = 0;
3434 if (dev->change_mtu)
3435 err = dev->change_mtu(dev, new_mtu);
3436 else
3437 dev->mtu = new_mtu;
3438 if (!err && dev->flags & IFF_UP)
056925ab 3439 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
1da177e4
LT
3440 return err;
3441}
3442
3443int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3444{
3445 int err;
3446
3447 if (!dev->set_mac_address)
3448 return -EOPNOTSUPP;
3449 if (sa->sa_family != dev->type)
3450 return -EINVAL;
3451 if (!netif_device_present(dev))
3452 return -ENODEV;
3453 err = dev->set_mac_address(dev, sa);
3454 if (!err)
056925ab 3455 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
1da177e4
LT
3456 return err;
3457}
3458
3459/*
14e3e079 3460 * Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
1da177e4 3461 */
14e3e079 3462static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
1da177e4
LT
3463{
3464 int err;
881d966b 3465 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
1da177e4
LT
3466
3467 if (!dev)
3468 return -ENODEV;
3469
3470 switch (cmd) {
3471 case SIOCGIFFLAGS: /* Get interface flags */
3472 ifr->ifr_flags = dev_get_flags(dev);
3473 return 0;
3474
1da177e4
LT
3475 case SIOCGIFMETRIC: /* Get the metric on the interface
3476 (currently unused) */
3477 ifr->ifr_metric = 0;
3478 return 0;
3479
1da177e4
LT
3480 case SIOCGIFMTU: /* Get the MTU of a device */
3481 ifr->ifr_mtu = dev->mtu;
3482 return 0;
3483
1da177e4
LT
3484 case SIOCGIFHWADDR:
3485 if (!dev->addr_len)
3486 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3487 else
3488 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3489 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3490 ifr->ifr_hwaddr.sa_family = dev->type;
3491 return 0;
3492
14e3e079
JG
3493 case SIOCGIFSLAVE:
3494 err = -EINVAL;
3495 break;
3496
3497 case SIOCGIFMAP:
3498 ifr->ifr_map.mem_start = dev->mem_start;
3499 ifr->ifr_map.mem_end = dev->mem_end;
3500 ifr->ifr_map.base_addr = dev->base_addr;
3501 ifr->ifr_map.irq = dev->irq;
3502 ifr->ifr_map.dma = dev->dma;
3503 ifr->ifr_map.port = dev->if_port;
3504 return 0;
3505
3506 case SIOCGIFINDEX:
3507 ifr->ifr_ifindex = dev->ifindex;
3508 return 0;
3509
3510 case SIOCGIFTXQLEN:
3511 ifr->ifr_qlen = dev->tx_queue_len;
3512 return 0;
3513
3514 default:
3515 /* dev_ioctl() should ensure this case
3516 * is never reached
3517 */
3518 WARN_ON(1);
3519 err = -EINVAL;
3520 break;
3521
3522 }
3523 return err;
3524}
3525
3526/*
3527 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
3528 */
3529static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3530{
3531 int err;
3532 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3533
3534 if (!dev)
3535 return -ENODEV;
3536
3537 switch (cmd) {
3538 case SIOCSIFFLAGS: /* Set interface flags */
3539 return dev_change_flags(dev, ifr->ifr_flags);
3540
3541 case SIOCSIFMETRIC: /* Set the metric on the interface
3542 (currently unused) */
3543 return -EOPNOTSUPP;
3544
3545 case SIOCSIFMTU: /* Set the MTU of a device */
3546 return dev_set_mtu(dev, ifr->ifr_mtu);
3547
1da177e4
LT
3548 case SIOCSIFHWADDR:
3549 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3550
3551 case SIOCSIFHWBROADCAST:
3552 if (ifr->ifr_hwaddr.sa_family != dev->type)
3553 return -EINVAL;
3554 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3555 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
056925ab 3556 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
1da177e4
LT
3557 return 0;
3558
1da177e4
LT
3559 case SIOCSIFMAP:
3560 if (dev->set_config) {
3561 if (!netif_device_present(dev))
3562 return -ENODEV;
3563 return dev->set_config(dev, &ifr->ifr_map);
3564 }
3565 return -EOPNOTSUPP;
3566
3567 case SIOCADDMULTI:
61ee6bd4 3568 if ((!dev->set_multicast_list && !dev->set_rx_mode) ||
1da177e4
LT
3569 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3570 return -EINVAL;
3571 if (!netif_device_present(dev))
3572 return -ENODEV;
3573 return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3574 dev->addr_len, 1);
3575
3576 case SIOCDELMULTI:
61ee6bd4 3577 if ((!dev->set_multicast_list && !dev->set_rx_mode) ||
1da177e4
LT
3578 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3579 return -EINVAL;
3580 if (!netif_device_present(dev))
3581 return -ENODEV;
3582 return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3583 dev->addr_len, 1);
3584
1da177e4
LT
3585 case SIOCSIFTXQLEN:
3586 if (ifr->ifr_qlen < 0)
3587 return -EINVAL;
3588 dev->tx_queue_len = ifr->ifr_qlen;
3589 return 0;
3590
3591 case SIOCSIFNAME:
3592 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3593 return dev_change_name(dev, ifr->ifr_newname);
3594
3595 /*
3596 * Unknown or private ioctl
3597 */
3598
3599 default:
3600 if ((cmd >= SIOCDEVPRIVATE &&
3601 cmd <= SIOCDEVPRIVATE + 15) ||
3602 cmd == SIOCBONDENSLAVE ||
3603 cmd == SIOCBONDRELEASE ||
3604 cmd == SIOCBONDSETHWADDR ||
3605 cmd == SIOCBONDSLAVEINFOQUERY ||
3606 cmd == SIOCBONDINFOQUERY ||
3607 cmd == SIOCBONDCHANGEACTIVE ||
3608 cmd == SIOCGMIIPHY ||
3609 cmd == SIOCGMIIREG ||
3610 cmd == SIOCSMIIREG ||
3611 cmd == SIOCBRADDIF ||
3612 cmd == SIOCBRDELIF ||
3613 cmd == SIOCWANDEV) {
3614 err = -EOPNOTSUPP;
3615 if (dev->do_ioctl) {
3616 if (netif_device_present(dev))
3617 err = dev->do_ioctl(dev, ifr,
3618 cmd);
3619 else
3620 err = -ENODEV;
3621 }
3622 } else
3623 err = -EINVAL;
3624
3625 }
3626 return err;
3627}
3628
3629/*
3630 * This function handles all "interface"-type I/O control requests. The actual
3631 * 'doing' part of this is dev_ifsioc above.
3632 */
3633
3634/**
3635 * dev_ioctl - network device ioctl
c4ea43c5 3636 * @net: the applicable net namespace
1da177e4
LT
3637 * @cmd: command to issue
3638 * @arg: pointer to a struct ifreq in user space
3639 *
3640 * Issue ioctl functions to devices. This is normally called by the
3641 * user space syscall interfaces but can sometimes be useful for
3642 * other purposes. The return value is the return from the syscall if
3643 * positive or a negative errno code on error.
3644 */
3645
881d966b 3646int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1da177e4
LT
3647{
3648 struct ifreq ifr;
3649 int ret;
3650 char *colon;
3651
3652 /* One special case: SIOCGIFCONF takes ifconf argument
3653 and requires shared lock, because it sleeps writing
3654 to user space.
3655 */
3656
3657 if (cmd == SIOCGIFCONF) {
6756ae4b 3658 rtnl_lock();
881d966b 3659 ret = dev_ifconf(net, (char __user *) arg);
6756ae4b 3660 rtnl_unlock();
1da177e4
LT
3661 return ret;
3662 }
3663 if (cmd == SIOCGIFNAME)
881d966b 3664 return dev_ifname(net, (struct ifreq __user *)arg);
1da177e4
LT
3665
3666 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3667 return -EFAULT;
3668
3669 ifr.ifr_name[IFNAMSIZ-1] = 0;
3670
3671 colon = strchr(ifr.ifr_name, ':');
3672 if (colon)
3673 *colon = 0;
3674
3675 /*
3676 * See which interface the caller is talking about.
3677 */
3678
3679 switch (cmd) {
3680 /*
3681 * These ioctl calls:
3682 * - can be done by all.
3683 * - atomic and do not require locking.
3684 * - return a value
3685 */
3686 case SIOCGIFFLAGS:
3687 case SIOCGIFMETRIC:
3688 case SIOCGIFMTU:
3689 case SIOCGIFHWADDR:
3690 case SIOCGIFSLAVE:
3691 case SIOCGIFMAP:
3692 case SIOCGIFINDEX:
3693 case SIOCGIFTXQLEN:
881d966b 3694 dev_load(net, ifr.ifr_name);
1da177e4 3695 read_lock(&dev_base_lock);
14e3e079 3696 ret = dev_ifsioc_locked(net, &ifr, cmd);
1da177e4
LT
3697 read_unlock(&dev_base_lock);
3698 if (!ret) {
3699 if (colon)
3700 *colon = ':';
3701 if (copy_to_user(arg, &ifr,
3702 sizeof(struct ifreq)))
3703 ret = -EFAULT;
3704 }
3705 return ret;
3706
3707 case SIOCETHTOOL:
881d966b 3708 dev_load(net, ifr.ifr_name);
1da177e4 3709 rtnl_lock();
881d966b 3710 ret = dev_ethtool(net, &ifr);
1da177e4
LT
3711 rtnl_unlock();
3712 if (!ret) {
3713 if (colon)
3714 *colon = ':';
3715 if (copy_to_user(arg, &ifr,
3716 sizeof(struct ifreq)))
3717 ret = -EFAULT;
3718 }
3719 return ret;
3720
3721 /*
3722 * These ioctl calls:
3723 * - require superuser power.
3724 * - require strict serialization.
3725 * - return a value
3726 */
3727 case SIOCGMIIPHY:
3728 case SIOCGMIIREG:
3729 case SIOCSIFNAME:
3730 if (!capable(CAP_NET_ADMIN))
3731 return -EPERM;
881d966b 3732 dev_load(net, ifr.ifr_name);
1da177e4 3733 rtnl_lock();
881d966b 3734 ret = dev_ifsioc(net, &ifr, cmd);
1da177e4
LT
3735 rtnl_unlock();
3736 if (!ret) {
3737 if (colon)
3738 *colon = ':';
3739 if (copy_to_user(arg, &ifr,
3740 sizeof(struct ifreq)))
3741 ret = -EFAULT;
3742 }
3743 return ret;
3744
3745 /*
3746 * These ioctl calls:
3747 * - require superuser power.
3748 * - require strict serialization.
3749 * - do not return a value
3750 */
3751 case SIOCSIFFLAGS:
3752 case SIOCSIFMETRIC:
3753 case SIOCSIFMTU:
3754 case SIOCSIFMAP:
3755 case SIOCSIFHWADDR:
3756 case SIOCSIFSLAVE:
3757 case SIOCADDMULTI:
3758 case SIOCDELMULTI:
3759 case SIOCSIFHWBROADCAST:
3760 case SIOCSIFTXQLEN:
3761 case SIOCSMIIREG:
3762 case SIOCBONDENSLAVE:
3763 case SIOCBONDRELEASE:
3764 case SIOCBONDSETHWADDR:
1da177e4
LT
3765 case SIOCBONDCHANGEACTIVE:
3766 case SIOCBRADDIF:
3767 case SIOCBRDELIF:
3768 if (!capable(CAP_NET_ADMIN))
3769 return -EPERM;
cabcac0b
TG
3770 /* fall through */
3771 case SIOCBONDSLAVEINFOQUERY:
3772 case SIOCBONDINFOQUERY:
881d966b 3773 dev_load(net, ifr.ifr_name);
1da177e4 3774 rtnl_lock();
881d966b 3775 ret = dev_ifsioc(net, &ifr, cmd);
1da177e4
LT
3776 rtnl_unlock();
3777 return ret;
3778
3779 case SIOCGIFMEM:
3780 /* Get the per device memory space. We can add this but
3781 * currently do not support it */
3782 case SIOCSIFMEM:
3783 /* Set the per device memory buffer space.
3784 * Not applicable in our case */
3785 case SIOCSIFLINK:
3786 return -EINVAL;
3787
3788 /*
3789 * Unknown or private ioctl.
3790 */
3791 default:
3792 if (cmd == SIOCWANDEV ||
3793 (cmd >= SIOCDEVPRIVATE &&
3794 cmd <= SIOCDEVPRIVATE + 15)) {
881d966b 3795 dev_load(net, ifr.ifr_name);
1da177e4 3796 rtnl_lock();
881d966b 3797 ret = dev_ifsioc(net, &ifr, cmd);
1da177e4
LT
3798 rtnl_unlock();
3799 if (!ret && copy_to_user(arg, &ifr,
3800 sizeof(struct ifreq)))
3801 ret = -EFAULT;
3802 return ret;
3803 }
1da177e4 3804 /* Take care of Wireless Extensions */
295f4a1f 3805 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
881d966b 3806 return wext_handle_ioctl(net, &ifr, cmd, arg);
1da177e4
LT
3807 return -EINVAL;
3808 }
3809}
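/*
 * A minimal user-space sketch of the read-only ioctl path dispatched above
 * (SIOCGIFFLAGS, handled under dev_base_lock).  "eth0" is only an assumed
 * interface name; any datagram socket works as the ioctl handle.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

int main(void)
{
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);	/* any socket gives us an ioctl handle */

	if (fd < 0)
		return 1;
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);	/* dev_ioctl() NUL-terminates and strips any ':' alias */
	if (ioctl(fd, SIOCGIFFLAGS, &ifr) == 0)		/* no CAP_NET_ADMIN needed for the GIF* group */
		printf("eth0 flags: %#x\n", ifr.ifr_flags & 0xffff);
	close(fd);
	return 0;
}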
3810
3811
3812/**
3813 * dev_new_index - allocate an ifindex
c4ea43c5 3814 * @net: the applicable net namespace
1da177e4
LT
3815 *
3816 * Returns a suitable unique value for a new device interface
3817 * number. The caller must hold the rtnl semaphore or the
3818 * dev_base_lock to be sure it remains unique.
3819 */
881d966b 3820static int dev_new_index(struct net *net)
1da177e4
LT
3821{
3822 static int ifindex;
3823 for (;;) {
3824 if (++ifindex <= 0)
3825 ifindex = 1;
881d966b 3826 if (!__dev_get_by_index(net, ifindex))
1da177e4
LT
3827 return ifindex;
3828 }
3829}
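/*
 * A short user-space sketch of how the ifindex allocated above is consumed:
 * <net/if.h> maps between the index and the device name.  "eth0" is again
 * just an assumed interface name.
 */
#include <stdio.h>
#include <net/if.h>

int main(void)
{
	char name[IF_NAMESIZE];
	unsigned int idx = if_nametoindex("eth0");	/* 0 means no such device */

	if (idx && if_indextoname(idx, name))
		printf("eth0 -> ifindex %u -> %s\n", idx, name);
	return 0;
}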
3830
1da177e4
LT
3831/* Delayed registration/unregisteration */
3832static DEFINE_SPINLOCK(net_todo_list_lock);
3b5b34fd 3833static LIST_HEAD(net_todo_list);
1da177e4 3834
6f05f629 3835static void net_set_todo(struct net_device *dev)
1da177e4
LT
3836{
3837 spin_lock(&net_todo_list_lock);
3838 list_add_tail(&dev->todo_list, &net_todo_list);
3839 spin_unlock(&net_todo_list_lock);
3840}
3841
93ee31f1
DL
3842static void rollback_registered(struct net_device *dev)
3843{
3844 BUG_ON(dev_boot_phase);
3845 ASSERT_RTNL();
3846
3847 /* Some drivers call this without ever having registered, to unwind a failed initialization. */
3848 if (dev->reg_state == NETREG_UNINITIALIZED) {
3849 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
3850 "was registered\n", dev->name, dev);
3851
3852 WARN_ON(1);
3853 return;
3854 }
3855
3856 BUG_ON(dev->reg_state != NETREG_REGISTERED);
3857
3858 /* If device is running, close it first. */
3859 dev_close(dev);
3860
3861 /* And unlink it from device chain. */
3862 unlist_netdevice(dev);
3863
3864 dev->reg_state = NETREG_UNREGISTERING;
3865
3866 synchronize_net();
3867
3868 /* Shutdown queueing discipline. */
3869 dev_shutdown(dev);
3870
3871
3872 /* Notify protocols that we are about to destroy
3873 this device. They should clean all the things.
3874 */
3875 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
3876
3877 /*
3878 * Flush the unicast and multicast chains
3879 */
3880 dev_addr_discard(dev);
3881
3882 if (dev->uninit)
3883 dev->uninit(dev);
3884
3885 /* Notifier chain MUST detach us from master device. */
547b792c 3886 WARN_ON(dev->master);
93ee31f1
DL
3887
3888 /* Remove entries from kobject tree */
3889 netdev_unregister_kobject(dev);
3890
3891 synchronize_net();
3892
3893 dev_put(dev);
3894}
3895
e8a0464c
DM
3896static void __netdev_init_queue_locks_one(struct net_device *dev,
3897 struct netdev_queue *dev_queue,
3898 void *_unused)
c773e847
DM
3899{
3900 spin_lock_init(&dev_queue->_xmit_lock);
cf508b12 3901 netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
c773e847
DM
3902 dev_queue->xmit_lock_owner = -1;
3903}
3904
3905static void netdev_init_queue_locks(struct net_device *dev)
3906{
e8a0464c
DM
3907 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
3908 __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
c773e847
DM
3909}
3910
1da177e4
LT
3911/**
3912 * register_netdevice - register a network device
3913 * @dev: device to register
3914 *
3915 * Take a completed network device structure and add it to the kernel
3916 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3917 * chain. 0 is returned on success. A negative errno code is returned
3918 * on a failure to set up the device, or if the name is a duplicate.
3919 *
3920 * Callers must hold the rtnl semaphore. You may want
3921 * register_netdev() instead of this.
3922 *
3923 * BUGS:
3924 * The locking appears insufficient to guarantee two parallel registers
3925 * will not get the same name.
3926 */
3927
3928int register_netdevice(struct net_device *dev)
3929{
3930 struct hlist_head *head;
3931 struct hlist_node *p;
3932 int ret;
881d966b 3933 struct net *net;
1da177e4
LT
3934
3935 BUG_ON(dev_boot_phase);
3936 ASSERT_RTNL();
3937
b17a7c17
SH
3938 might_sleep();
3939
1da177e4
LT
3940 /* When net_device's are persistent, this will be fatal. */
3941 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
c346dca1
YH
3942 BUG_ON(!dev_net(dev));
3943 net = dev_net(dev);
1da177e4 3944
f1f28aa3 3945 spin_lock_init(&dev->addr_list_lock);
cf508b12 3946 netdev_set_addr_lockdep_class(dev);
c773e847 3947 netdev_init_queue_locks(dev);
1da177e4 3948
1da177e4
LT
3949 dev->iflink = -1;
3950
3951 /* Init, if this function is available */
3952 if (dev->init) {
3953 ret = dev->init(dev);
3954 if (ret) {
3955 if (ret > 0)
3956 ret = -EIO;
90833aa4 3957 goto out;
1da177e4
LT
3958 }
3959 }
4ec93edb 3960
1da177e4
LT
3961 if (!dev_valid_name(dev->name)) {
3962 ret = -EINVAL;
7ce1b0ed 3963 goto err_uninit;
1da177e4
LT
3964 }
3965
881d966b 3966 dev->ifindex = dev_new_index(net);
1da177e4
LT
3967 if (dev->iflink == -1)
3968 dev->iflink = dev->ifindex;
3969
3970 /* Check for existence of name */
881d966b 3971 head = dev_name_hash(net, dev->name);
1da177e4
LT
3972 hlist_for_each(p, head) {
3973 struct net_device *d
3974 = hlist_entry(p, struct net_device, name_hlist);
3975 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
3976 ret = -EEXIST;
7ce1b0ed 3977 goto err_uninit;
1da177e4 3978 }
4ec93edb 3979 }
1da177e4 3980
d212f87b
SH
3981 /* Fix illegal checksum combinations */
3982 if ((dev->features & NETIF_F_HW_CSUM) &&
3983 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
3984 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
3985 dev->name);
3986 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
3987 }
3988
3989 if ((dev->features & NETIF_F_NO_CSUM) &&
3990 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
3991 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
3992 dev->name);
3993 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
3994 }
3995
3996
1da177e4
LT
3997 /* Fix illegal SG+CSUM combinations. */
3998 if ((dev->features & NETIF_F_SG) &&
8648b305 3999 !(dev->features & NETIF_F_ALL_CSUM)) {
5a8da02b 4000 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n",
1da177e4
LT
4001 dev->name);
4002 dev->features &= ~NETIF_F_SG;
4003 }
4004
4005 /* TSO requires that SG is present as well. */
4006 if ((dev->features & NETIF_F_TSO) &&
4007 !(dev->features & NETIF_F_SG)) {
5a8da02b 4008 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n",
1da177e4
LT
4009 dev->name);
4010 dev->features &= ~NETIF_F_TSO;
4011 }
e89e9cf5
AR
4012 if (dev->features & NETIF_F_UFO) {
4013 if (!(dev->features & NETIF_F_HW_CSUM)) {
4014 printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
4015 "NETIF_F_HW_CSUM feature.\n",
4016 dev->name);
4017 dev->features &= ~NETIF_F_UFO;
4018 }
4019 if (!(dev->features & NETIF_F_SG)) {
4020 printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
4021 "NETIF_F_SG feature.\n",
4022 dev->name);
4023 dev->features &= ~NETIF_F_UFO;
4024 }
4025 }
1da177e4 4026
e5a4a72d
LB
4027 /* Enable software GSO if SG is supported. */
4028 if (dev->features & NETIF_F_SG)
4029 dev->features |= NETIF_F_GSO;
4030
aaf8cdc3 4031 netdev_initialize_kobject(dev);
8b41d188 4032 ret = netdev_register_kobject(dev);
b17a7c17 4033 if (ret)
7ce1b0ed 4034 goto err_uninit;
b17a7c17
SH
4035 dev->reg_state = NETREG_REGISTERED;
4036
1da177e4
LT
4037 /*
4038 * Default initial state at registry is that the
4039 * device is present.
4040 */
4041
4042 set_bit(__LINK_STATE_PRESENT, &dev->state);
4043
1da177e4 4044 dev_init_scheduler(dev);
1da177e4 4045 dev_hold(dev);
ce286d32 4046 list_netdevice(dev);
1da177e4
LT
4047
4048 /* Notify protocols that a new device appeared. */
056925ab 4049 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
fcc5a03a 4050 ret = notifier_to_errno(ret);
93ee31f1
DL
4051 if (ret) {
4052 rollback_registered(dev);
4053 dev->reg_state = NETREG_UNREGISTERED;
4054 }
1da177e4
LT
4055
4056out:
4057 return ret;
7ce1b0ed
HX
4058
4059err_uninit:
4060 if (dev->uninit)
4061 dev->uninit(dev);
4062 goto out;
1da177e4
LT
4063}
4064
4065/**
4066 * register_netdev - register a network device
4067 * @dev: device to register
4068 *
4069 * Take a completed network device structure and add it to the kernel
4070 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4071 * chain. 0 is returned on success. A negative errno code is returned
4072 * on a failure to set up the device, or if the name is a duplicate.
4073 *
38b4da38 4074 * This is a wrapper around register_netdevice that takes the rtnl semaphore
1da177e4
LT
4075 * and expands the device name if you passed a format string to
4076 * alloc_netdev.
4077 */
4078int register_netdev(struct net_device *dev)
4079{
4080 int err;
4081
4082 rtnl_lock();
4083
4084 /*
4085 * If the name is a format string the caller wants us to do a
4086 * name allocation.
4087 */
4088 if (strchr(dev->name, '%')) {
4089 err = dev_alloc_name(dev, dev->name);
4090 if (err < 0)
4091 goto out;
4092 }
4ec93edb 4093
1da177e4
LT
4094 err = register_netdevice(dev);
4095out:
4096 rtnl_unlock();
4097 return err;
4098}
4099EXPORT_SYMBOL(register_netdev);
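/*
 * A minimal sketch of the driver-side pairing described above, assuming a
 * hypothetical mydrv driver and "mydrv%d" name template; alloc_netdev() is
 * the single-queue wrapper around alloc_netdev_mq() further down.
 */
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>

struct mydrv_priv {
	int dummy;			/* hypothetical per-device state */
};

static int mydrv_probe(void)
{
	struct net_device *dev;
	int err;

	/* the '%d' asks register_netdev() to pick a free unit number */
	dev = alloc_netdev(sizeof(struct mydrv_priv), "mydrv%d", ether_setup);
	if (!dev)
		return -ENOMEM;

	err = register_netdev(dev);	/* takes the rtnl semaphore for us */
	if (err) {
		free_netdev(dev);	/* never registered, so a plain free is safe */
		return err;
	}
	return 0;
}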
4100
4101/*
4102 * netdev_wait_allrefs - wait until all references are gone.
4103 *
4104 * This is called when unregistering network devices.
4105 *
4106 * Any protocol or device that holds a reference should register
4107 * for netdevice notification, and cleanup and put back the
4108 * reference if they receive an UNREGISTER event.
4109 * We can get stuck here if buggy protocols don't correctly
4ec93edb 4110 * call dev_put.
1da177e4
LT
4111 */
4112static void netdev_wait_allrefs(struct net_device *dev)
4113{
4114 unsigned long rebroadcast_time, warning_time;
4115
4116 rebroadcast_time = warning_time = jiffies;
4117 while (atomic_read(&dev->refcnt) != 0) {
4118 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6756ae4b 4119 rtnl_lock();
1da177e4
LT
4120
4121 /* Rebroadcast unregister notification */
056925ab 4122 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
1da177e4
LT
4123
4124 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
4125 &dev->state)) {
4126 /* We must not have linkwatch events
4127 * pending on unregister. If this
4128 * happens, we simply run the queue
4129 * unscheduled, resulting in a noop
4130 * for this device.
4131 */
4132 linkwatch_run_queue();
4133 }
4134
6756ae4b 4135 __rtnl_unlock();
1da177e4
LT
4136
4137 rebroadcast_time = jiffies;
4138 }
4139
4140 msleep(250);
4141
4142 if (time_after(jiffies, warning_time + 10 * HZ)) {
4143 printk(KERN_EMERG "unregister_netdevice: "
4144 "waiting for %s to become free. Usage "
4145 "count = %d\n",
4146 dev->name, atomic_read(&dev->refcnt));
4147 warning_time = jiffies;
4148 }
4149 }
4150}
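/*
 * A sketch of the kind of notifier the comment above netdev_wait_allrefs()
 * expects from reference holders; cached_dev and the myproto_* names are
 * hypothetical.  In this kernel the notifier payload is the net_device itself.
 */
#include <linux/netdevice.h>
#include <linux/notifier.h>

static struct net_device *cached_dev;	/* holds a dev_hold() reference */

static int myproto_device_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_UNREGISTER && dev == cached_dev) {
		dev_put(cached_dev);	/* drop our reference so refcnt can reach zero */
		cached_dev = NULL;
	}
	return NOTIFY_DONE;
}

static struct notifier_block myproto_notifier = {
	.notifier_call = myproto_device_event,
};
/* registered once with register_netdevice_notifier(&myproto_notifier) */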
4151
4152/* The sequence is:
4153 *
4154 * rtnl_lock();
4155 * ...
4156 * register_netdevice(x1);
4157 * register_netdevice(x2);
4158 * ...
4159 * unregister_netdevice(y1);
4160 * unregister_netdevice(y2);
4161 * ...
4162 * rtnl_unlock();
4163 * free_netdev(y1);
4164 * free_netdev(y2);
4165 *
4166 * We are invoked by rtnl_unlock() after it drops the semaphore.
4167 * This allows us to deal with problems:
b17a7c17 4168 * 1) We can delete sysfs objects which invoke hotplug
1da177e4
LT
4169 * without deadlocking with linkwatch via keventd.
4170 * 2) Since we run with the RTNL semaphore not held, we can sleep
4171 * safely in order to wait for the netdev refcnt to drop to zero.
4172 */
4a3e2f71 4173static DEFINE_MUTEX(net_todo_run_mutex);
1da177e4
LT
4174void netdev_run_todo(void)
4175{
626ab0e6 4176 struct list_head list;
1da177e4
LT
4177
4178 /* Need to guard against multiple cpu's getting out of order. */
4a3e2f71 4179 mutex_lock(&net_todo_run_mutex);
1da177e4
LT
4180
4181 /* Not safe to do outside the semaphore. We must not return
4182 * until all unregister events invoked by the local processor
4183 * have been completed (either by this todo run, or one on
4184 * another cpu).
4185 */
4186 if (list_empty(&net_todo_list))
4187 goto out;
4188
4189 /* Snapshot list, allow later requests */
4190 spin_lock(&net_todo_list_lock);
626ab0e6 4191 list_replace_init(&net_todo_list, &list);
1da177e4 4192 spin_unlock(&net_todo_list_lock);
626ab0e6 4193
1da177e4
LT
4194 while (!list_empty(&list)) {
4195 struct net_device *dev
4196 = list_entry(list.next, struct net_device, todo_list);
4197 list_del(&dev->todo_list);
4198
b17a7c17
SH
4199 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
4200 printk(KERN_ERR "network todo '%s' but state %d\n",
4201 dev->name, dev->reg_state);
4202 dump_stack();
4203 continue;
4204 }
1da177e4 4205
b17a7c17 4206 dev->reg_state = NETREG_UNREGISTERED;
1da177e4 4207
6e583ce5
SH
4208 on_each_cpu(flush_backlog, dev, 1);
4209
b17a7c17 4210 netdev_wait_allrefs(dev);
1da177e4 4211
b17a7c17
SH
4212 /* paranoia */
4213 BUG_ON(atomic_read(&dev->refcnt));
547b792c
IJ
4214 WARN_ON(dev->ip_ptr);
4215 WARN_ON(dev->ip6_ptr);
4216 WARN_ON(dev->dn_ptr);
1da177e4 4217
b17a7c17
SH
4218 if (dev->destructor)
4219 dev->destructor(dev);
9093bbb2
SH
4220
4221 /* Free network device */
4222 kobject_put(&dev->dev.kobj);
1da177e4
LT
4223 }
4224
4225out:
4a3e2f71 4226 mutex_unlock(&net_todo_run_mutex);
1da177e4
LT
4227}
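/*
 * A sketch of the calling sequence documented above netdev_run_todo(), as a
 * hypothetical driver tearing down two devices in one rtnl section.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static void mydrv_remove_pair(struct net_device *a, struct net_device *b)
{
	rtnl_lock();
	unregister_netdevice(a);	/* both queued on net_todo_list */
	unregister_netdevice(b);
	rtnl_unlock();			/* runs netdev_run_todo(), waits for refs to drop */

	free_netdev(a);			/* only safe once rtnl_unlock() has returned */
	free_netdev(b);
}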
4228
5a1b5898 4229static struct net_device_stats *internal_stats(struct net_device *dev)
c45d286e 4230{
5a1b5898 4231 return &dev->stats;
c45d286e
RR
4232}
4233
dc2b4847 4234static void netdev_init_one_queue(struct net_device *dev,
e8a0464c
DM
4235 struct netdev_queue *queue,
4236 void *_unused)
dc2b4847 4237{
dc2b4847
DM
4238 queue->dev = dev;
4239}
4240
bb949fbd
DM
4241static void netdev_init_queues(struct net_device *dev)
4242{
e8a0464c
DM
4243 netdev_init_one_queue(dev, &dev->rx_queue, NULL);
4244 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
c3f26a26 4245 spin_lock_init(&dev->tx_global_lock);
bb949fbd
DM
4246}
4247
1da177e4 4248/**
f25f4e44 4249 * alloc_netdev_mq - allocate network device
1da177e4
LT
4250 * @sizeof_priv: size of private data to allocate space for
4251 * @name: device name format string
4252 * @setup: callback to initialize device
f25f4e44 4253 * @queue_count: the number of subqueues to allocate
1da177e4
LT
4254 *
4255 * Allocates a struct net_device with private data area for driver use
f25f4e44
PWJ
4256 * and performs basic initialization. Also allocates subqueue structs
4257 * for each queue on the device at the end of the netdevice.
1da177e4 4258 */
f25f4e44
PWJ
4259struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
4260 void (*setup)(struct net_device *), unsigned int queue_count)
1da177e4 4261{
e8a0464c 4262 struct netdev_queue *tx;
1da177e4 4263 struct net_device *dev;
7943986c 4264 size_t alloc_size;
e8a0464c 4265 void *p;
1da177e4 4266
b6fe17d6
SH
4267 BUG_ON(strlen(name) >= sizeof(dev->name));
4268
fd2ea0a7 4269 alloc_size = sizeof(struct net_device);
d1643d24
AD
4270 if (sizeof_priv) {
4271 /* ensure 32-byte alignment of private area */
4272 alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
4273 alloc_size += sizeof_priv;
4274 }
4275 /* ensure 32-byte alignment of whole construct */
4276 alloc_size += NETDEV_ALIGN_CONST;
1da177e4 4277
31380de9 4278 p = kzalloc(alloc_size, GFP_KERNEL);
1da177e4 4279 if (!p) {
b6fe17d6 4280 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
1da177e4
LT
4281 return NULL;
4282 }
1da177e4 4283
7943986c 4284 tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
e8a0464c
DM
4285 if (!tx) {
4286 printk(KERN_ERR "alloc_netdev: Unable to allocate "
4287 "tx qdiscs.\n");
4288 kfree(p);
4289 return NULL;
4290 }
4291
1da177e4
LT
4292 dev = (struct net_device *)
4293 (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
4294 dev->padded = (char *)dev - (char *)p;
c346dca1 4295 dev_net_set(dev, &init_net);
1da177e4 4296
e8a0464c
DM
4297 dev->_tx = tx;
4298 dev->num_tx_queues = queue_count;
fd2ea0a7 4299 dev->real_num_tx_queues = queue_count;
e8a0464c 4300
f25f4e44
PWJ
4301 if (sizeof_priv) {
4302 dev->priv = ((char *)dev +
fd2ea0a7 4303 ((sizeof(struct net_device) + NETDEV_ALIGN_CONST)
f25f4e44
PWJ
4304 & ~NETDEV_ALIGN_CONST));
4305 }
4306
82cc1a7a 4307 dev->gso_max_size = GSO_MAX_SIZE;
1da177e4 4308
bb949fbd
DM
4309 netdev_init_queues(dev);
4310
5a1b5898 4311 dev->get_stats = internal_stats;
bea3348e 4312 netpoll_netdev_init(dev);
1da177e4
LT
4313 setup(dev);
4314 strcpy(dev->name, name);
4315 return dev;
4316}
f25f4e44 4317EXPORT_SYMBOL(alloc_netdev_mq);
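/*
 * A sketch of allocating a four-queue Ethernet device and reaching the
 * private area that alloc_netdev_mq() appends to the same allocation; the
 * mydrv_priv structure and "mq%d" template are hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/spinlock.h>

struct mydrv_priv {
	spinlock_t lock;
};

static struct net_device *mydrv_alloc(void)
{
	struct net_device *dev;
	struct mydrv_priv *priv;

	dev = alloc_netdev_mq(sizeof(struct mydrv_priv), "mq%d", ether_setup, 4);
	if (!dev)
		return NULL;

	priv = netdev_priv(dev);	/* points at the 32-byte aligned tail of the allocation */
	spin_lock_init(&priv->lock);
	return dev;
}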
1da177e4
LT
4318
4319/**
4320 * free_netdev - free network device
4321 * @dev: device
4322 *
4ec93edb
YH
4323 * This function does the last stage of destroying an allocated device
4324 * interface. The reference to the device object is released.
1da177e4
LT
4325 * If this is the last reference then it will be freed.
4326 */
4327void free_netdev(struct net_device *dev)
4328{
f3005d7f
DL
4329 release_net(dev_net(dev));
4330
e8a0464c
DM
4331 kfree(dev->_tx);
4332
3041a069 4333 /* Compatibility with error handling in drivers */
1da177e4
LT
4334 if (dev->reg_state == NETREG_UNINITIALIZED) {
4335 kfree((char *)dev - dev->padded);
4336 return;
4337 }
4338
4339 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
4340 dev->reg_state = NETREG_RELEASED;
4341
43cb76d9
GKH
4342 /* will free via device release */
4343 put_device(&dev->dev);
1da177e4 4344}
4ec93edb 4345
1da177e4 4346/* Synchronize with packet receive processing. */
4ec93edb 4347void synchronize_net(void)
1da177e4
LT
4348{
4349 might_sleep();
fbd568a3 4350 synchronize_rcu();
1da177e4
LT
4351}
4352
4353/**
4354 * unregister_netdevice - remove device from the kernel
4355 * @dev: device
4356 *
4357 * This function shuts down a device interface and removes it
d59b54b1 4358 * from the kernel tables.
1da177e4
LT
4359 *
4360 * Callers must hold the rtnl semaphore. You may want
4361 * unregister_netdev() instead of this.
4362 */
4363
22f8cde5 4364void unregister_netdevice(struct net_device *dev)
1da177e4 4365{
a6620712
HX
4366 ASSERT_RTNL();
4367
93ee31f1 4368 rollback_registered(dev);
1da177e4
LT
4369 /* Finish processing unregister after unlock */
4370 net_set_todo(dev);
1da177e4
LT
4371}
4372
4373/**
4374 * unregister_netdev - remove device from the kernel
4375 * @dev: device
4376 *
4377 * This function shuts down a device interface and removes it
d59b54b1 4378 * from the kernel tables.
1da177e4
LT
4379 *
4380 * This is just a wrapper for unregister_netdevice that takes
4381 * the rtnl semaphore. In general you want to use this and not
4382 * unregister_netdevice.
4383 */
4384void unregister_netdev(struct net_device *dev)
4385{
4386 rtnl_lock();
4387 unregister_netdevice(dev);
4388 rtnl_unlock();
4389}
4390
4391EXPORT_SYMBOL(unregister_netdev);
4392
ce286d32
EB
4393/**
4394 * dev_change_net_namespace - move device to a different network namespace
4395 * @dev: device
4396 * @net: network namespace
4397 * @pat: If not NULL name pattern to try if the current device name
4398 * is already taken in the destination network namespace.
4399 *
4400 * This function shuts down a device interface and moves it
4401 * to a new network namespace. On success 0 is returned, on
4402 * a failure a negative errno code is returned.
4403 *
4404 * Callers must hold the rtnl semaphore.
4405 */
4406
4407int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
4408{
4409 char buf[IFNAMSIZ];
4410 const char *destname;
4411 int err;
4412
4413 ASSERT_RTNL();
4414
4415 /* Don't allow namespace local devices to be moved. */
4416 err = -EINVAL;
4417 if (dev->features & NETIF_F_NETNS_LOCAL)
4418 goto out;
4419
4420 /* Ensure the device has been registered */
4421 err = -EINVAL;
4422 if (dev->reg_state != NETREG_REGISTERED)
4423 goto out;
4424
4425 /* Get out if there is nothing to do */
4426 err = 0;
878628fb 4427 if (net_eq(dev_net(dev), net))
ce286d32
EB
4428 goto out;
4429
4430 /* Pick the destination device name, and ensure
4431 * we can use it in the destination network namespace.
4432 */
4433 err = -EEXIST;
4434 destname = dev->name;
4435 if (__dev_get_by_name(net, destname)) {
4436 /* We get here if we can't use the current device name */
4437 if (!pat)
4438 goto out;
4439 if (!dev_valid_name(pat))
4440 goto out;
4441 if (strchr(pat, '%')) {
4442 if (__dev_alloc_name(net, pat, buf) < 0)
4443 goto out;
4444 destname = buf;
4445 } else
4446 destname = pat;
4447 if (__dev_get_by_name(net, destname))
4448 goto out;
4449 }
4450
4451 /*
4452 * And now a mini version of register_netdevice unregister_netdevice.
4453 */
4454
4455 /* If device is running close it first. */
9b772652 4456 dev_close(dev);
ce286d32
EB
4457
4458 /* And unlink it from device chain */
4459 err = -ENODEV;
4460 unlist_netdevice(dev);
4461
4462 synchronize_net();
4463
4464 /* Shutdown queueing discipline. */
4465 dev_shutdown(dev);
4466
4467 /* Notify protocols that we are about to destroy
4468 this device. They should clean all the things.
4469 */
4470 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4471
4472 /*
4473 * Flush the unicast and multicast chains
4474 */
4475 dev_addr_discard(dev);
4476
4477 /* Actually switch the network namespace */
c346dca1 4478 dev_net_set(dev, net);
ce286d32
EB
4479
4480 /* Assign the new device name */
4481 if (destname != dev->name)
4482 strcpy(dev->name, destname);
4483
4484 /* If there is an ifindex conflict assign a new one */
4485 if (__dev_get_by_index(net, dev->ifindex)) {
4486 int iflink = (dev->iflink == dev->ifindex);
4487 dev->ifindex = dev_new_index(net);
4488 if (iflink)
4489 dev->iflink = dev->ifindex;
4490 }
4491
8b41d188 4492 /* Fixup kobjects */
aaf8cdc3
DL
4493 netdev_unregister_kobject(dev);
4494 err = netdev_register_kobject(dev);
8b41d188 4495 WARN_ON(err);
ce286d32
EB
4496
4497 /* Add the device back in the hashes */
4498 list_netdevice(dev);
4499
4500 /* Notify protocols, that a new device appeared. */
4501 call_netdevice_notifiers(NETDEV_REGISTER, dev);
4502
4503 synchronize_net();
4504 err = 0;
4505out:
4506 return err;
4507}
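/*
 * A sketch of an in-kernel caller of dev_change_net_namespace(), assuming a
 * hypothetical target_net held by the caller and a "moved%d" fallback name
 * for use if the current name is already taken in the destination namespace.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static int mydrv_move_dev(struct net_device *dev, struct net *target_net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, target_net, "moved%d");
	rtnl_unlock();
	return err;
}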
4508
1da177e4
LT
4509static int dev_cpu_callback(struct notifier_block *nfb,
4510 unsigned long action,
4511 void *ocpu)
4512{
4513 struct sk_buff **list_skb;
37437bb2 4514 struct Qdisc **list_net;
1da177e4
LT
4515 struct sk_buff *skb;
4516 unsigned int cpu, oldcpu = (unsigned long)ocpu;
4517 struct softnet_data *sd, *oldsd;
4518
8bb78442 4519 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1da177e4
LT
4520 return NOTIFY_OK;
4521
4522 local_irq_disable();
4523 cpu = smp_processor_id();
4524 sd = &per_cpu(softnet_data, cpu);
4525 oldsd = &per_cpu(softnet_data, oldcpu);
4526
4527 /* Find end of our completion_queue. */
4528 list_skb = &sd->completion_queue;
4529 while (*list_skb)
4530 list_skb = &(*list_skb)->next;
4531 /* Append completion queue from offline CPU. */
4532 *list_skb = oldsd->completion_queue;
4533 oldsd->completion_queue = NULL;
4534
4535 /* Find end of our output_queue. */
4536 list_net = &sd->output_queue;
4537 while (*list_net)
4538 list_net = &(*list_net)->next_sched;
4539 /* Append output queue from offline CPU. */
4540 *list_net = oldsd->output_queue;
4541 oldsd->output_queue = NULL;
4542
4543 raise_softirq_irqoff(NET_TX_SOFTIRQ);
4544 local_irq_enable();
4545
4546 /* Process offline CPU's input_pkt_queue */
4547 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
4548 netif_rx(skb);
4549
4550 return NOTIFY_OK;
4551}
1da177e4 4552
db217334
CL
4553#ifdef CONFIG_NET_DMA
4554/**
0ed72ec4
RD
4555 * net_dma_rebalance - try to maintain one DMA channel per CPU
4556 * @net_dma: DMA client and associated data (lock, channels, channel_mask)
4557 *
4558 * This is called when the number of channels allocated to the net_dma client
4559 * changes. The net_dma client tries to have one DMA channel per CPU.
db217334 4560 */
d379b01e
DW
4561
4562static void net_dma_rebalance(struct net_dma *net_dma)
db217334 4563{
d379b01e 4564 unsigned int cpu, i, n, chan_idx;
db217334
CL
4565 struct dma_chan *chan;
4566
d379b01e 4567 if (cpus_empty(net_dma->channel_mask)) {
db217334 4568 for_each_online_cpu(cpu)
29bbd72d 4569 rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
db217334
CL
4570 return;
4571 }
4572
4573 i = 0;
4574 cpu = first_cpu(cpu_online_map);
4575
0e12f848 4576 for_each_cpu_mask_nr(chan_idx, net_dma->channel_mask) {
d379b01e
DW
4577 chan = net_dma->channels[chan_idx];
4578
4579 n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask))
4580 + (i < (num_online_cpus() %
4581 cpus_weight(net_dma->channel_mask)) ? 1 : 0));
db217334
CL
4582
4583 while(n) {
29bbd72d 4584 per_cpu(softnet_data, cpu).net_dma = chan;
db217334
CL
4585 cpu = next_cpu(cpu, cpu_online_map);
4586 n--;
4587 }
4588 i++;
4589 }
db217334
CL
4590}
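/*
 * As a worked example of the distribution above: with 8 online CPUs and 3
 * channels, n = 8/3 = 2 plus one extra for the first 8%3 = 2 channels, so the
 * channels serve 3, 3 and 2 CPUs respectively.
 */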
4591
4592/**
4593 * netdev_dma_event - event callback for the net_dma_client
4594 * @client: should always be net_dma_client
f4b8ea78 4595 * @chan: DMA channel for the event
0ed72ec4 4596 * @state: DMA state to be handled
db217334 4597 */
d379b01e
DW
4598static enum dma_state_client
4599netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
4600 enum dma_state state)
4601{
4602 int i, found = 0, pos = -1;
4603 struct net_dma *net_dma =
4604 container_of(client, struct net_dma, client);
4605 enum dma_state_client ack = DMA_DUP; /* default: take no action */
4606
4607 spin_lock(&net_dma->lock);
4608 switch (state) {
4609 case DMA_RESOURCE_AVAILABLE:
0c0b0aca 4610 for (i = 0; i < nr_cpu_ids; i++)
d379b01e
DW
4611 if (net_dma->channels[i] == chan) {
4612 found = 1;
4613 break;
4614 } else if (net_dma->channels[i] == NULL && pos < 0)
4615 pos = i;
4616
4617 if (!found && pos >= 0) {
4618 ack = DMA_ACK;
4619 net_dma->channels[pos] = chan;
4620 cpu_set(pos, net_dma->channel_mask);
4621 net_dma_rebalance(net_dma);
4622 }
db217334
CL
4623 break;
4624 case DMA_RESOURCE_REMOVED:
0c0b0aca 4625 for (i = 0; i < nr_cpu_ids; i++)
d379b01e
DW
4626 if (net_dma->channels[i] == chan) {
4627 found = 1;
4628 pos = i;
4629 break;
4630 }
4631
4632 if (found) {
4633 ack = DMA_ACK;
4634 cpu_clear(pos, net_dma->channel_mask);
4635 net_dma->channels[i] = NULL;
4636 net_dma_rebalance(net_dma);
4637 }
db217334
CL
4638 break;
4639 default:
4640 break;
4641 }
d379b01e
DW
4642 spin_unlock(&net_dma->lock);
4643
4644 return ack;
db217334
CL
4645}
4646
4647/**
4648 * netdev_dma_register - register the networking subsystem as a DMA client
4649 */
4650static int __init netdev_dma_register(void)
4651{
0c0b0aca
MT
4652 net_dma.channels = kzalloc(nr_cpu_ids * sizeof(struct net_dma),
4653 GFP_KERNEL);
4654 if (unlikely(!net_dma.channels)) {
4655 printk(KERN_NOTICE
4656 "netdev_dma: no memory for net_dma.channels\n");
4657 return -ENOMEM;
4658 }
d379b01e
DW
4659 spin_lock_init(&net_dma.lock);
4660 dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask);
4661 dma_async_client_register(&net_dma.client);
4662 dma_async_client_chan_request(&net_dma.client);
db217334
CL
4663 return 0;
4664}
4665
4666#else
4667static int __init netdev_dma_register(void) { return -ENODEV; }
4668#endif /* CONFIG_NET_DMA */
1da177e4 4669
7f353bf2
HX
4670/**
4671 * netdev_compute_features - compute conjunction of two feature sets
4672 * @all: first feature set
4673 * @one: second feature set
4674 *
4675 * Computes a new feature set after adding a device with feature set
4676 * @one to the master device with current feature set @all. Returns
4677 * the new feature set.
4678 */
4679int netdev_compute_features(unsigned long all, unsigned long one)
4680{
4681 /* if device needs checksumming, downgrade to hw checksumming */
4682 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
4683 all ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
4684
4685 /* if device can't do all checksum, downgrade to ipv4/ipv6 */
4686 if (all & NETIF_F_HW_CSUM && !(one & NETIF_F_HW_CSUM))
4687 all ^= NETIF_F_HW_CSUM
4688 | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
4689
4690 if (one & NETIF_F_GSO)
4691 one |= NETIF_F_GSO_SOFTWARE;
4692 one |= NETIF_F_GSO;
4693
e2a6b852
HX
4694 /*
4695 * If even one device supports a GSO protocol with software fallback,
4696 * enable it for all.
4697 */
4698 all |= one & NETIF_F_GSO_SOFTWARE;
4699
7f353bf2
HX
4700 /* If even one device supports robust GSO, enable it for all. */
4701 if (one & NETIF_F_GSO_ROBUST)
4702 all |= NETIF_F_GSO_ROBUST;
4703
4704 all &= one | NETIF_F_LLTX;
4705
4706 if (!(all & NETIF_F_ALL_CSUM))
4707 all &= ~NETIF_F_SG;
4708 if (!(all & NETIF_F_SG))
4709 all &= ~NETIF_F_GSO_MASK;
4710
4711 return all;
4712}
4713EXPORT_SYMBOL(netdev_compute_features);
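/*
 * A worked example of the masking above: enslaving a device advertising only
 * NETIF_F_IP_CSUM | NETIF_F_SG under a master currently at
 * NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_TSO first downgrades HW_CSUM to
 * IP_CSUM/IPV6_CSUM, then the "all &= one | NETIF_F_LLTX" step strips what
 * the new slave cannot do, leaving the master with NETIF_F_IP_CSUM | NETIF_F_SG.
 */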
4714
30d97d35
PE
4715static struct hlist_head *netdev_create_hash(void)
4716{
4717 int i;
4718 struct hlist_head *hash;
4719
4720 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
4721 if (hash != NULL)
4722 for (i = 0; i < NETDEV_HASHENTRIES; i++)
4723 INIT_HLIST_HEAD(&hash[i]);
4724
4725 return hash;
4726}
4727
881d966b 4728/* Initialize per network namespace state */
4665079c 4729static int __net_init netdev_init(struct net *net)
881d966b 4730{
881d966b 4731 INIT_LIST_HEAD(&net->dev_base_head);
881d966b 4732
30d97d35
PE
4733 net->dev_name_head = netdev_create_hash();
4734 if (net->dev_name_head == NULL)
4735 goto err_name;
881d966b 4736
30d97d35
PE
4737 net->dev_index_head = netdev_create_hash();
4738 if (net->dev_index_head == NULL)
4739 goto err_idx;
881d966b
EB
4740
4741 return 0;
30d97d35
PE
4742
4743err_idx:
4744 kfree(net->dev_name_head);
4745err_name:
4746 return -ENOMEM;
881d966b
EB
4747}
4748
6579e57b
AV
4749char *netdev_drivername(struct net_device *dev, char *buffer, int len)
4750{
4751 struct device_driver *driver;
4752 struct device *parent;
4753
4754 if (len <= 0 || !buffer)
4755 return buffer;
4756 buffer[0] = 0;
4757
4758 parent = dev->dev.parent;
4759
4760 if (!parent)
4761 return buffer;
4762
4763 driver = parent->driver;
4764 if (driver && driver->name)
4765 strlcpy(buffer, driver->name, len);
4766 return buffer;
4767}
4768
4665079c 4769static void __net_exit netdev_exit(struct net *net)
881d966b
EB
4770{
4771 kfree(net->dev_name_head);
4772 kfree(net->dev_index_head);
4773}
4774
022cbae6 4775static struct pernet_operations __net_initdata netdev_net_ops = {
881d966b
EB
4776 .init = netdev_init,
4777 .exit = netdev_exit,
4778};
4779
4665079c 4780static void __net_exit default_device_exit(struct net *net)
ce286d32
EB
4781{
4782 struct net_device *dev, *next;
4783 /*
4784 * Push all migratable network devices back to the
4785 * initial network namespace
4786 */
4787 rtnl_lock();
4788 for_each_netdev_safe(net, dev, next) {
4789 int err;
aca51397 4790 char fb_name[IFNAMSIZ];
ce286d32
EB
4791
4792 /* Ignore unmovable devices (e.g. loopback) */
4793 if (dev->features & NETIF_F_NETNS_LOCAL)
4794 continue;
4795
4796 /* Push remaining network devices to init_net */
aca51397
PE
4797 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
4798 err = dev_change_net_namespace(dev, &init_net, fb_name);
ce286d32 4799 if (err) {
aca51397 4800 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
ce286d32 4801 __func__, dev->name, err);
aca51397 4802 BUG();
ce286d32
EB
4803 }
4804 }
4805 rtnl_unlock();
4806}
4807
022cbae6 4808static struct pernet_operations __net_initdata default_device_ops = {
ce286d32
EB
4809 .exit = default_device_exit,
4810};
4811
1da177e4
LT
4812/*
4813 * Initialize the DEV module. At boot time this walks the device list and
4814 * unhooks any devices that fail to initialise (normally hardware not
4815 * present) and leaves us with a valid list of present and active devices.
4816 *
4817 */
4818
4819/*
4820 * This is called single threaded during boot, so no need
4821 * to take the rtnl semaphore.
4822 */
4823static int __init net_dev_init(void)
4824{
4825 int i, rc = -ENOMEM;
4826
4827 BUG_ON(!dev_boot_phase);
4828
1da177e4
LT
4829 if (dev_proc_init())
4830 goto out;
4831
8b41d188 4832 if (netdev_kobject_init())
1da177e4
LT
4833 goto out;
4834
4835 INIT_LIST_HEAD(&ptype_all);
82d8a867 4836 for (i = 0; i < PTYPE_HASH_SIZE; i++)
1da177e4
LT
4837 INIT_LIST_HEAD(&ptype_base[i]);
4838
881d966b
EB
4839 if (register_pernet_subsys(&netdev_net_ops))
4840 goto out;
1da177e4 4841
ce286d32
EB
4842 if (register_pernet_device(&default_device_ops))
4843 goto out;
4844
1da177e4
LT
4845 /*
4846 * Initialise the packet receive queues.
4847 */
4848
6f912042 4849 for_each_possible_cpu(i) {
1da177e4
LT
4850 struct softnet_data *queue;
4851
4852 queue = &per_cpu(softnet_data, i);
4853 skb_queue_head_init(&queue->input_pkt_queue);
1da177e4
LT
4854 queue->completion_queue = NULL;
4855 INIT_LIST_HEAD(&queue->poll_list);
bea3348e
SH
4856
4857 queue->backlog.poll = process_backlog;
4858 queue->backlog.weight = weight_p;
1da177e4
LT
4859 }
4860
db217334
CL
4861 netdev_dma_register();
4862
1da177e4
LT
4863 dev_boot_phase = 0;
4864
962cf36c
CM
4865 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
4866 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
1da177e4
LT
4867
4868 hotcpu_notifier(dev_cpu_callback, 0);
4869 dst_init();
4870 dev_mcast_init();
4871 rc = 0;
4872out:
4873 return rc;
4874}
4875
4876subsys_initcall(net_dev_init);
4877
4878EXPORT_SYMBOL(__dev_get_by_index);
4879EXPORT_SYMBOL(__dev_get_by_name);
4880EXPORT_SYMBOL(__dev_remove_pack);
c2373ee9 4881EXPORT_SYMBOL(dev_valid_name);
1da177e4
LT
4882EXPORT_SYMBOL(dev_add_pack);
4883EXPORT_SYMBOL(dev_alloc_name);
4884EXPORT_SYMBOL(dev_close);
4885EXPORT_SYMBOL(dev_get_by_flags);
4886EXPORT_SYMBOL(dev_get_by_index);
4887EXPORT_SYMBOL(dev_get_by_name);
1da177e4
LT
4888EXPORT_SYMBOL(dev_open);
4889EXPORT_SYMBOL(dev_queue_xmit);
4890EXPORT_SYMBOL(dev_remove_pack);
4891EXPORT_SYMBOL(dev_set_allmulti);
4892EXPORT_SYMBOL(dev_set_promiscuity);
4893EXPORT_SYMBOL(dev_change_flags);
4894EXPORT_SYMBOL(dev_set_mtu);
4895EXPORT_SYMBOL(dev_set_mac_address);
4896EXPORT_SYMBOL(free_netdev);
4897EXPORT_SYMBOL(netdev_boot_setup_check);
4898EXPORT_SYMBOL(netdev_set_master);
4899EXPORT_SYMBOL(netdev_state_change);
4900EXPORT_SYMBOL(netif_receive_skb);
4901EXPORT_SYMBOL(netif_rx);
4902EXPORT_SYMBOL(register_gifconf);
4903EXPORT_SYMBOL(register_netdevice);
4904EXPORT_SYMBOL(register_netdevice_notifier);
4905EXPORT_SYMBOL(skb_checksum_help);
4906EXPORT_SYMBOL(synchronize_net);
4907EXPORT_SYMBOL(unregister_netdevice);
4908EXPORT_SYMBOL(unregister_netdevice_notifier);
4909EXPORT_SYMBOL(net_enable_timestamp);
4910EXPORT_SYMBOL(net_disable_timestamp);
4911EXPORT_SYMBOL(dev_get_flags);
4912
4913#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
4914EXPORT_SYMBOL(br_handle_frame_hook);
4915EXPORT_SYMBOL(br_fdb_get_hook);
4916EXPORT_SYMBOL(br_fdb_put_hook);
4917#endif
4918
4919#ifdef CONFIG_KMOD
4920EXPORT_SYMBOL(dev_load);
4921#endif
4922
4923EXPORT_PER_CPU_SYMBOL(softnet_data);