net: allocate tx queues in register_netdevice

[net-next-2.6.git] / net / core / dev.c
diff --git a/net/core/dev.c b/net/core/dev.c

index 2c7934f8cf3ec15ca57fbcb6758fc30ebc4b947f..4c3ac53e4b16bdf65377db987951d41ee20e1ef2 100644 (file)
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1483,8 +1483,9 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
         skb_orphan(skb);
         nf_reset(skb);
  
-       if (!(dev->flags & IFF_UP) ||
-           (skb->len > (dev->mtu + dev->hard_header_len))) {
+       if (unlikely(!(dev->flags & IFF_UP) ||
+                    (skb->len > (dev->mtu + dev->hard_header_len)))) {
+               atomic_long_inc(&dev->rx_dropped);
                 kfree_skb(skb);
                 return NET_RX_DROP;
         }
@@ -1552,21 +1553,56 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
   * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
   * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
   */
-void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
+int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
  {
-       unsigned int real_num = dev->real_num_tx_queues;
+       if (txq < 1 || txq > dev->num_tx_queues)
+               return -EINVAL;
+
+       if (dev->reg_state == NETREG_REGISTERED) {
+               ASSERT_RTNL();
  
-       if (unlikely(txq > dev->num_tx_queues))
-               ;
-       else if (txq > real_num)
-               dev->real_num_tx_queues = txq;
-       else if (txq < real_num) {
-               dev->real_num_tx_queues = txq;
-               qdisc_reset_all_tx_gt(dev, txq);
+               if (txq < dev->real_num_tx_queues)
+                       qdisc_reset_all_tx_gt(dev, txq);
         }
+
+       dev->real_num_tx_queues = txq;
+       return 0;
  }
  EXPORT_SYMBOL(netif_set_real_num_tx_queues);
  
+#ifdef CONFIG_RPS
+/**
+ *     netif_set_real_num_rx_queues - set actual number of RX queues used
+ *     @dev: Network device
+ *     @rxq: Actual number of RX queues
+ *
+ *     This must be called either with the rtnl_lock held or before
+ *     registration of the net device.  Returns 0 on success, or a
+ *     negative error code.  Before registration it can fail only with
+ *     -EINVAL, when @rxq is 0 or greater than num_rx_queues.
+ */
+int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
+{
+       int rc;
+
+       if (rxq < 1 || rxq > dev->num_rx_queues)
+               return -EINVAL;
+
+       if (dev->reg_state == NETREG_REGISTERED) {
+               ASSERT_RTNL();
+
+               rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
+                                                 rxq);
+               if (rc)
+                       return rc;
+       }
+
+       dev->real_num_rx_queues = rxq;
+       return 0;
+}
+EXPORT_SYMBOL(netif_set_real_num_rx_queues);
+#endif
+
  static inline void __netif_reschedule(struct Qdisc *q)
  {
         struct softnet_data *sd;
@@ -2142,6 +2178,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
         return rc;
  }
  
+static DEFINE_PER_CPU(int, xmit_recursion);
+#define RECURSION_LIMIT 3
+
  /**
   *     dev_queue_xmit - transmit a buffer
   *     @skb: buffer to transmit
@@ -2207,10 +2246,15 @@ int dev_queue_xmit(struct sk_buff *skb)
  
                 if (txq->xmit_lock_owner != cpu) {
  
+                       if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
+                               goto recursion_alert;
+
                         HARD_TX_LOCK(dev, txq, cpu);
  
                         if (!netif_tx_queue_stopped(txq)) {
+                               __this_cpu_inc(xmit_recursion);
                                 rc = dev_hard_start_xmit(skb, dev, txq);
+                               __this_cpu_dec(xmit_recursion);
                                 if (dev_xmit_complete(rc)) {
                                         HARD_TX_UNLOCK(dev, txq);
                                         goto out;
@@ -2222,7 +2266,9 @@ int dev_queue_xmit(struct sk_buff *skb)
                                        "queue packet!\n", dev->name);
                 } else {
                         /* Recursion is detected! It is possible,
-                        * unfortunately */
+                        * unfortunately
+                        */
+recursion_alert:
                         if (net_ratelimit())
                                 printk(KERN_CRIT "Dead loop on virtual device "
                                        "%s, fix it urgently!\n", dev->name);
@@ -2352,10 +2398,11 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
  
         if (skb_rx_queue_recorded(skb)) {
                 u16 index = skb_get_rx_queue(skb);
-               if (unlikely(index >= dev->num_rx_queues)) {
-                       WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
-                               "on queue %u, but number of RX queues is %u\n",
-                               dev->name, index, dev->num_rx_queues);
+               if (unlikely(index >= dev->real_num_rx_queues)) {
+                       WARN_ONCE(dev->real_num_rx_queues > 1,
+                                 "%s received packet on queue %u, but number "
+                                 "of RX queues is %u\n",
+                                 dev->name, index, dev->real_num_rx_queues);
                         goto done;
                 }
                 rxqueue = dev->_rx + index;
@@ -2502,6 +2549,7 @@ enqueue:
  
         local_irq_restore(flags);
  
+       atomic_long_inc(&skb->dev->rx_dropped);
         kfree_skb(skb);
         return NET_RX_DROP;
  }
@@ -2656,11 +2704,10 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
   * the ingress scheduler, you just cant add policies on ingress.
   *
   */
-static int ing_filter(struct sk_buff *skb)
+static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
  {
         struct net_device *dev = skb->dev;
         u32 ttl = G_TC_RTTL(skb->tc_verd);
-       struct netdev_queue *rxq;
         int result = TC_ACT_OK;
         struct Qdisc *q;
  
@@ -2674,8 +2721,6 @@ static int ing_filter(struct sk_buff *skb)
         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
  
-       rxq = &dev->rx_queue;
-
         q = rxq->qdisc;
         if (q != &noop_qdisc) {
                 spin_lock(qdisc_lock(q));
@@ -2691,7 +2736,9 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
                                          struct packet_type **pt_prev,
                                          int *ret, struct net_device *orig_dev)
  {
-       if (skb->dev->rx_queue.qdisc == &noop_qdisc)
+       struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
+
+       if (!rxq || rxq->qdisc == &noop_qdisc)
                 goto out;
  
         if (*pt_prev) {
@@ -2699,7 +2746,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
                 *pt_prev = NULL;
         }
  
-       switch (ing_filter(skb)) {
+       switch (ing_filter(skb, rxq)) {
         case TC_ACT_SHOT:
         case TC_ACT_STOLEN:
                 kfree_skb(skb);
@@ -2950,6 +2997,7 @@ ncls:
         if (pt_prev) {
                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
         } else {
+               atomic_long_inc(&skb->dev->rx_dropped);
                 kfree_skb(skb);
                 /* Jamal, now you will not able to escape explaining
                  * me how you were going to use this. :-)
@@ -4868,7 +4916,7 @@ static void rollback_registered_many(struct list_head *head)
         dev = list_first_entry(head, struct net_device, unreg_list);
         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
  
-       synchronize_net();
+       rcu_barrier();
  
         list_for_each_entry(dev, head, unreg_list)
                 dev_put(dev);
@@ -4882,21 +4930,6 @@ static void rollback_registered(struct net_device *dev)
         rollback_registered_many(&single);
  }
  
-static void __netdev_init_queue_locks_one(struct net_device *dev,
-                                         struct netdev_queue *dev_queue,
-                                         void *_unused)
-{
-       spin_lock_init(&dev_queue->_xmit_lock);
-       netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
-       dev_queue->xmit_lock_owner = -1;
-}
-
-static void netdev_init_queue_locks(struct net_device *dev)
-{
-       netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
-       __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
-}
-
  unsigned long netdev_fix_features(unsigned long features, const char *name)
  {
         /* Fix illegal SG+CSUM combinations. */
@@ -4964,6 +4997,66 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
  }
  EXPORT_SYMBOL(netif_stacked_transfer_operstate);
  
+static int netif_alloc_rx_queues(struct net_device *dev)
+{
+#ifdef CONFIG_RPS
+       unsigned int i, count = dev->num_rx_queues;
+       struct netdev_rx_queue *rx;
+
+       BUG_ON(count < 1);
+
+       rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
+       if (!rx) {
+               pr_err("netdev: Unable to allocate %u rx queues.\n", count);
+               return -ENOMEM;
+       }
+       dev->_rx = rx;
+
+       /*
+        * Set a pointer to first element in the array which holds the
+        * reference count.
+        */
+       for (i = 0; i < count; i++)
+               rx[i].first = rx;
+#endif
+       return 0;
+}
+
+static int netif_alloc_netdev_queues(struct net_device *dev)
+{
+       unsigned int count = dev->num_tx_queues;
+       struct netdev_queue *tx;
+
+       BUG_ON(count < 1);
+
+       tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
+       if (!tx) {
+               pr_err("netdev: Unable to allocate %u tx queues.\n",
+                      count);
+               return -ENOMEM;
+       }
+       dev->_tx = tx;
+       return 0;
+}
+
+static void netdev_init_one_queue(struct net_device *dev,
+                                 struct netdev_queue *queue,
+                                 void *_unused)
+{
+       queue->dev = dev;
+
+       /* Initialize queue lock */
+       spin_lock_init(&queue->_xmit_lock);
+       netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
+       queue->xmit_lock_owner = -1;
+}
+
+static void netdev_init_queues(struct net_device *dev)
+{
+       netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
+       spin_lock_init(&dev->tx_global_lock);
+}
+
  /**
   *     register_netdevice      - register a network device
   *     @dev: device to register
@@ -4997,28 +5090,19 @@ int register_netdevice(struct net_device *dev)
  
         spin_lock_init(&dev->addr_list_lock);
         netdev_set_addr_lockdep_class(dev);
-       netdev_init_queue_locks(dev);
  
         dev->iflink = -1;
  
-#ifdef CONFIG_RPS
-       if (!dev->num_rx_queues) {
-               /*
-                * Allocate a single RX queue if driver never called
-                * alloc_netdev_mq
-                */
+       ret = netif_alloc_rx_queues(dev);
+       if (ret)
+               goto out;
  
-               dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
-               if (!dev->_rx) {
-                       ret = -ENOMEM;
-                       goto out;
-               }
+       ret = netif_alloc_netdev_queues(dev);
+       if (ret)
+               goto out;
+
+       netdev_init_queues(dev);
  
-               dev->_rx->first = dev->_rx;
-               atomic_set(&dev->_rx->count, 1);
-               dev->num_rx_queues = 1;
-       }
-#endif
         /* Init, if this function is available */
         if (dev->netdev_ops->ndo_init) {
                 ret = dev->netdev_ops->ndo_init(dev);
@@ -5058,10 +5142,11 @@ int register_netdevice(struct net_device *dev)
         if (dev->features & NETIF_F_SG)
                 dev->features |= NETIF_F_GSO;
  
-       /* Enable GRO for vlans by default if dev->features has GRO also.
-        * vlan_dev_init() will do the dev->features check.
+       /* Enable GRO and NETIF_F_HIGHDMA for vlans by default.
+        * vlan_dev_init() will do the dev->features check, so these features
+        * are enabled only if supported by the underlying device.
          */
-       dev->vlan_features |= NETIF_F_GRO;
+       dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
  
         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
         ret = notifier_to_errno(ret);
@@ -5133,9 +5218,6 @@ int init_dummy_netdev(struct net_device *dev)
          */
         dev->reg_state = NETREG_DUMMY;
  
-       /* initialize the ref count */
-       atomic_set(&dev->refcnt, 1);
-
         /* NAPI wants this */
         INIT_LIST_HEAD(&dev->napi_list);
  
@@ -5143,6 +5225,11 @@ int init_dummy_netdev(struct net_device *dev)
         set_bit(__LINK_STATE_PRESENT, &dev->state);
         set_bit(__LINK_STATE_START, &dev->state);
  
+       /* Note: we don't allocate pcpu_refcnt for dummy devices,
+        * because users of this 'device' don't need to change
+        * its refcount.
+        */
+
         return 0;
  }
  EXPORT_SYMBOL_GPL(init_dummy_netdev);
@@ -5184,6 +5271,16 @@ out:
  }
  EXPORT_SYMBOL(register_netdev);
  
+int netdev_refcnt_read(const struct net_device *dev)
+{
+       int i, refcnt = 0;
+
+       for_each_possible_cpu(i)
+               refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
+       return refcnt;
+}
+EXPORT_SYMBOL(netdev_refcnt_read);
+
  /*
   * netdev_wait_allrefs - wait until all references are gone.
   *
@@ -5198,11 +5295,14 @@ EXPORT_SYMBOL(register_netdev);
  static void netdev_wait_allrefs(struct net_device *dev)
  {
         unsigned long rebroadcast_time, warning_time;
+       int refcnt;
  
         linkwatch_forget_dev(dev);
  
         rebroadcast_time = warning_time = jiffies;
-       while (atomic_read(&dev->refcnt) != 0) {
+       refcnt = netdev_refcnt_read(dev);
+
+       while (refcnt != 0) {
                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
                         rtnl_lock();
  
@@ -5229,11 +5329,13 @@ static void netdev_wait_allrefs(struct net_device *dev)
  
                 msleep(250);
  
+               refcnt = netdev_refcnt_read(dev);
+
                 if (time_after(jiffies, warning_time + 10 * HZ)) {
                         printk(KERN_EMERG "unregister_netdevice: "
                                "waiting for %s to become free. Usage "
                                "count = %d\n",
-                              dev->name, atomic_read(&dev->refcnt));
+                              dev->name, refcnt);
                         warning_time = jiffies;
                 }
         }
@@ -5291,7 +5393,7 @@ void netdev_run_todo(void)
                 netdev_wait_allrefs(dev);
  
                 /* paranoia */
-               BUG_ON(atomic_read(&dev->refcnt));
+               BUG_ON(netdev_refcnt_read(dev));
                 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
                 WARN_ON(dev->ip6_ptr);
                 WARN_ON(dev->dn_ptr);
@@ -5370,30 +5472,34 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
  
         if (ops->ndo_get_stats64) {
                 memset(storage, 0, sizeof(*storage));
-               return ops->ndo_get_stats64(dev, storage);
-       }
-       if (ops->ndo_get_stats) {
+               ops->ndo_get_stats64(dev, storage);
+       } else if (ops->ndo_get_stats) {
                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
-               return storage;
+       } else {
+               netdev_stats_to_stats64(storage, &dev->stats);
+               dev_txq_stats_fold(dev, storage);
         }
-       netdev_stats_to_stats64(storage, &dev->stats);
-       dev_txq_stats_fold(dev, storage);
+       storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
         return storage;
  }
  EXPORT_SYMBOL(dev_get_stats);
  
-static void netdev_init_one_queue(struct net_device *dev,
-                                 struct netdev_queue *queue,
-                                 void *_unused)
+struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
  {
-       queue->dev = dev;
-}
+       struct netdev_queue *queue = dev_ingress_queue(dev);
  
-static void netdev_init_queues(struct net_device *dev)
-{
-       netdev_init_one_queue(dev, &dev->rx_queue, NULL);
-       netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
-       spin_lock_init(&dev->tx_global_lock);
+#ifdef CONFIG_NET_CLS_ACT
+       if (queue)
+               return queue;
+       queue = kzalloc(sizeof(*queue), GFP_KERNEL);
+       if (!queue)
+               return NULL;
+       netdev_init_one_queue(dev, queue, NULL);
+       queue->qdisc = &noop_qdisc;
+       queue->qdisc_sleeping = &noop_qdisc;
+       rcu_assign_pointer(dev->ingress_queue, queue);
+#endif
+       return queue;
  }
  
  /**
@@ -5410,17 +5516,18 @@ static void netdev_init_queues(struct net_device *dev)
  struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
                 void (*setup)(struct net_device *), unsigned int queue_count)
  {
-       struct netdev_queue *tx;
         struct net_device *dev;
         size_t alloc_size;
         struct net_device *p;
-#ifdef CONFIG_RPS
-       struct netdev_rx_queue *rx;
-       int i;
-#endif
  
         BUG_ON(strlen(name) >= sizeof(dev->name));
  
+       if (queue_count < 1) {
+               pr_err("alloc_netdev: Unable to allocate device "
+                      "with zero queues.\n");
+               return NULL;
+       }
+
         alloc_size = sizeof(struct net_device);
         if (sizeof_priv) {
                 /* ensure 32-byte alignment of private area */
@@ -5436,55 +5543,31 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
                 return NULL;
         }
  
-       tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
-       if (!tx) {
-               printk(KERN_ERR "alloc_netdev: Unable to allocate "
-                      "tx qdiscs.\n");
-               goto free_p;
-       }
-
-#ifdef CONFIG_RPS
-       rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
-       if (!rx) {
-               printk(KERN_ERR "alloc_netdev: Unable to allocate "
-                      "rx queues.\n");
-               goto free_tx;
-       }
-
-       atomic_set(&rx->count, queue_count);
-
-       /*
-        * Set a pointer to first element in the array which holds the
-        * reference count.
-        */
-       for (i = 0; i < queue_count; i++)
-               rx[i].first = rx;
-#endif
-
         dev = PTR_ALIGN(p, NETDEV_ALIGN);
         dev->padded = (char *)dev - (char *)p;
  
+       dev->pcpu_refcnt = alloc_percpu(int);
+       if (!dev->pcpu_refcnt)
+               goto free_p;
+
         if (dev_addr_init(dev))
-               goto free_rx;
+               goto free_pcpu;
  
         dev_mc_init(dev);
         dev_uc_init(dev);
  
         dev_net_set(dev, &init_net);
  
-       dev->_tx = tx;
         dev->num_tx_queues = queue_count;
         dev->real_num_tx_queues = queue_count;
  
  #ifdef CONFIG_RPS
-       dev->_rx = rx;
         dev->num_rx_queues = queue_count;
+       dev->real_num_rx_queues = queue_count;
  #endif
  
         dev->gso_max_size = GSO_MAX_SIZE;
  
-       netdev_init_queues(dev);
-
         INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
         dev->ethtool_ntuple_list.count = 0;
         INIT_LIST_HEAD(&dev->napi_list);
@@ -5495,12 +5578,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
         strcpy(dev->name, name);
         return dev;
  
-free_rx:
-#ifdef CONFIG_RPS
-       kfree(rx);
-free_tx:
-#endif
-       kfree(tx);
+free_pcpu:
+       free_percpu(dev->pcpu_refcnt);
  free_p:
         kfree(p);
         return NULL;
@@ -5523,6 +5602,8 @@ void free_netdev(struct net_device *dev)
  
         kfree(dev->_tx);
  
+       kfree(rcu_dereference_raw(dev->ingress_queue));
+
         /* Flush device addresses */
         dev_addr_flush(dev);
  
@@ -5532,6 +5613,9 @@ void free_netdev(struct net_device *dev)
         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
                 netif_napi_del(p);
  
+       free_percpu(dev->pcpu_refcnt);
+       dev->pcpu_refcnt = NULL;
+
         /*  Compatibility with error handling in drivers */
         if (dev->reg_state == NETREG_UNINITIALIZED) {
                 kfree((char *)dev - dev->padded);