rfs: Receive Flow Steering
[net-next-2.6.git] / net/core/dev.c
index 949c62dba719c602b5e1203276e3ca2732f10d37..d7107ac835fad207da79a52e00227e6067d9e335 100644
@@ -80,6 +80,7 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/hash.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/mutex.h>
 #include <linux/string.h>
@@ -1986,9 +1987,9 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
 {
        if (unlikely(queue_index >= dev->real_num_tx_queues)) {
                if (net_ratelimit()) {
-                       netdev_warn(dev, "selects TX queue %d, but "
-                            "real number of TX queues is %d\n",
-                            queue_index, dev->real_num_tx_queues);
+                       pr_warning("%s selects TX queue %d, but "
+                               "real number of TX queues is %d\n",
+                               dev->name, queue_index, dev->real_num_tx_queues);
                }
                return 0;
        }
@@ -2014,7 +2015,7 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
                        if (dev->real_num_tx_queues > 1)
                                queue_index = skb_tx_hash(dev, skb);
 
-                       if (sk && sk->sk_dst_cache)
+                       if (sk && rcu_dereference_check(sk->sk_dst_cache, 1))
                                sk_tx_queue_set(sk, queue_index);
                }
        }
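The sk_dst_cache test above only compares the pointer against NULL and never dereferences it, so the RCU-lockdep annotation is satisfied with a constant-true condition. Where the value is actually used, the condition spells out the locking rule instead; a sketch of the usual pattern (roughly what __sk_dst_get() does in kernels of this era):

    struct dst_entry *dst;

    /* Under CONFIG_PROVE_RCU this access is legal if we are inside
     * rcu_read_lock() OR the socket lock is owned by the user;
     * otherwise lockdep complains. */
    dst = rcu_dereference_check(sk->sk_dst_cache,
                                rcu_read_lock_held() ||
                                sock_owned_by_user(sk));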
@@ -2202,29 +2203,37 @@ int weight_p __read_mostly = 64;            /* old backlog weight */
 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
 
 #ifdef CONFIG_RPS
+
+/* One global table that all flow-based protocols share. */
+struct rps_sock_flow_table *rps_sock_flow_table;
+EXPORT_SYMBOL(rps_sock_flow_table);
+
 /*
  * get_rps_cpu is called from netif_receive_skb and returns the target
  * CPU from the RPS map of the receiving queue for a given skb.
+ * rcu_read_lock must be held on entry.
  */
-static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+                      struct rps_dev_flow **rflowp)
 {
        struct ipv6hdr *ip6;
        struct iphdr *ip;
        struct netdev_rx_queue *rxqueue;
        struct rps_map *map;
+       struct rps_dev_flow_table *flow_table;
+       struct rps_sock_flow_table *sock_flow_table;
        int cpu = -1;
        u8 ip_proto;
+       u16 tcpu;
        u32 addr1, addr2, ports, ihl;
 
-       rcu_read_lock();
-
        if (skb_rx_queue_recorded(skb)) {
                u16 index = skb_get_rx_queue(skb);
                if (unlikely(index >= dev->num_rx_queues)) {
                        if (net_ratelimit()) {
-                               netdev_warn(dev, "received packet on queue "
-                                   "%u, but number of RX queues is %u\n",
-                                    index, dev->num_rx_queues);
+                               pr_warning("%s received packet on queue "
+                                       "%u, but number of RX queues is %u\n",
+                                       dev->name, index, dev->num_rx_queues);
                        }
                        goto done;
                }
@@ -2232,7 +2241,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
        } else
                rxqueue = dev->_rx;
 
-       if (!rxqueue->rps_map)
+       if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
                goto done;
 
        if (skb->rxhash)
@@ -2284,9 +2293,48 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
                skb->rxhash = 1;
 
 got_hash:
+       flow_table = rcu_dereference(rxqueue->rps_flow_table);
+       sock_flow_table = rcu_dereference(rps_sock_flow_table);
+       if (flow_table && sock_flow_table) {
+               u16 next_cpu;
+               struct rps_dev_flow *rflow;
+
+               rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
+               tcpu = rflow->cpu;
+
+               next_cpu = sock_flow_table->ents[skb->rxhash &
+                   sock_flow_table->mask];
+
+                /*
+                * If the desired CPU (where the last recvmsg was done) is
+                * different from the current CPU (the one in the rx-queue
+                * flow table entry), switch if one of the following holds:
+                *   - The current CPU is unset (equal to RPS_NO_CPU).
+                *   - The current CPU is offline.
+                *   - The current CPU's queue tail has advanced beyond the
+                *     last packet that was enqueued using this table entry.
+                *     This guarantees that all previous packets for the flow
+                *     have been dequeued, thus preserving in-order delivery.
+                */
+               if (unlikely(tcpu != next_cpu) &&
+                   (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
+                    ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
+                     rflow->last_qtail)) >= 0)) {
+                       tcpu = rflow->cpu = next_cpu;
+                       if (tcpu != RPS_NO_CPU)
+                               rflow->last_qtail = per_cpu(softnet_data,
+                                   tcpu).input_queue_head;
+               }
+               if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
+                       *rflowp = rflow;
+                       cpu = tcpu;
+                       goto done;
+               }
+       }
+
        map = rcu_dereference(rxqueue->rps_map);
        if (map) {
-               u16 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
+               tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
 
                if (cpu_online(tcpu)) {
                        cpu = tcpu;
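The drain test in the comparison above relies on free-running unsigned counters compared through a signed cast, which stays correct across counter wraparound. A self-contained illustration of the same test:

    /* Illustration only: 'has the queue head reached the recorded tail?' */
    unsigned int head  = 5u;           /* head has wrapped past UINT_MAX  */
    unsigned int qtail = 4294967290u;  /* tail recorded just before wrap  */

    /* head - qtail == 11 (mod 2^32); the signed cast keeps the small
     * positive difference, so the flow is correctly seen as drained. */
    int drained = (int)(head - qtail) >= 0;    /* true */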
@@ -2295,7 +2343,6 @@ got_hash:
        }
 
 done:
-       rcu_read_unlock();
        return cpu;
 }
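get_rps_cpu() now consults two tables that this patch introduces in include/linux/netdevice.h (outside this blobdiff). A sketch consistent with the fields used above: the per-rxqueue entry pairs a CPU with the backlog position of the flow's newest packet, while the global table holds only the CPU last seen by recvmsg.

    struct rps_dev_flow {
            u16 cpu;                    /* CPU currently handling the flow */
            u16 fill;
            unsigned int last_qtail;    /* backlog tail at last enqueue */
    };

    struct rps_dev_flow_table {
            unsigned int mask;          /* table size - 1 (power of two) */
            struct rcu_head rcu;
            struct rps_dev_flow flows[0];
    };

    /* One global table shared by all flow-based protocols; ents[] holds
     * the CPU on which the owning socket last ran recvmsg. */
    struct rps_sock_flow_table {
            unsigned int mask;
            u16 ents[0];
    };

    #define RPS_NO_CPU 0xffff           /* fits the u16 cpu fields */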
 
@@ -2321,13 +2368,14 @@ static void trigger_softirq(void *data)
        __napi_schedule(&queue->backlog);
        __get_cpu_var(netdev_rx_stat).received_rps++;
 }
-#endif /* CONFIG_SMP */
+#endif /* CONFIG_RPS */
 
 /*
  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
  * queue (may be a remote CPU queue).
  */
-static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
+static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
+                             unsigned int *qtail)
 {
        struct softnet_data *queue;
        unsigned long flags;
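trigger_softirq() above runs on the remote CPU through the smp_call_function machinery; the call_single_data embedded in each softnet_data is presumably wired up once in net_dev_init() (not part of this excerpt), along these lines:

    /* In net_dev_init(), for each possible CPU's softnet_data *queue: */
    queue->csd.func  = trigger_softirq;
    queue->csd.info  = queue;
    queue->csd.flags = 0;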
@@ -2342,6 +2390,10 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
                if (queue->input_pkt_queue.qlen) {
 enqueue:
                        __skb_queue_tail(&queue->input_pkt_queue, skb);
+#ifdef CONFIG_RPS
+                       *qtail = queue->input_queue_head +
+                           queue->input_pkt_queue.qlen;
+#endif
                        rps_unlock(queue);
                        local_irq_restore(flags);
                        return NET_RX_SUCCESS;
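enqueue_to_backlog() now reports, via *qtail, the running count of packets ever placed on this backlog (packets already dequeued plus the current queue length); get_rps_cpu() compares the saved value against input_queue_head to tell when everything a flow had queued has drained. The dequeue-side counterpart, added elsewhere in the RPS/RFS series, is simply:

    /* Assumed helper: advance the counter that *qtail snapshots above,
     * once per dequeued packet. */
    static inline void incr_input_queue_head(struct softnet_data *queue)
    {
    #ifdef CONFIG_RPS
            queue->input_queue_head++;
    #endif
    }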
@@ -2356,11 +2408,10 @@ enqueue:
 
                                cpu_set(cpu, rcpus->mask[rcpus->select]);
                                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
-                       } else
-                               __napi_schedule(&queue->backlog);
-#else
-                       __napi_schedule(&queue->backlog);
+                               goto enqueue;
+                       }
 #endif
+                       __napi_schedule(&queue->backlog);
                }
                goto enqueue;
        }
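Queueing for a remote CPU no longer schedules that CPU's backlog NAPI directly; instead a bit is set in a per-CPU rps_remote_softirq_cpus mask and NET_RX_SOFTIRQ is raised locally, so the softirq can kick the remote CPUs by IPI in one batch. The flush side is outside this excerpt, but presumably looks like:

    /* Send pending IPIs to kick RPS processing on remote CPUs. */
    static void net_rps_action(cpumask_t *mask)
    {
            int cpu;

            for_each_cpu_mask_nr(cpu, *mask) {
                    struct softnet_data *queue = &per_cpu(softnet_data, cpu);

                    if (cpu_online(cpu))
                            __smp_call_function_single(cpu, &queue->csd, 0);
            }
            cpus_clear(*mask);
    }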
@@ -2391,7 +2442,7 @@ enqueue:
 
 int netif_rx(struct sk_buff *skb)
 {
-       int cpu;
+       int ret;
 
        /* if netpoll wants it, pretend we never saw it */
        if (netpoll_rx(skb))
@@ -2401,14 +2452,28 @@ int netif_rx(struct sk_buff *skb)
                net_timestamp(skb);
 
 #ifdef CONFIG_RPS
-       cpu = get_rps_cpu(skb->dev, skb);
-       if (cpu < 0)
-               cpu = smp_processor_id();
+       {
+               struct rps_dev_flow voidflow, *rflow = &voidflow;
+               int cpu;
+
+               rcu_read_lock();
+
+               cpu = get_rps_cpu(skb->dev, skb, &rflow);
+               if (cpu < 0)
+                       cpu = smp_processor_id();
+
+               ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+
+               rcu_read_unlock();
+       }
 #else
-       cpu = smp_processor_id();
+       {
+               unsigned int qtail;
+               ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
+               put_cpu();
+       }
 #endif
-
-       return enqueue_to_backlog(skb, cpu);
+       return ret;
 }
 EXPORT_SYMBOL(netif_rx);
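netif_rx() is the consumer half of RFS. The producer half runs in the socket layer, where recvmsg (and sendmsg) record the current CPU in the global sock flow table under the flow's hash. The recording helper this patch adds to netdevice.h reads approximately:

    static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
                                            u32 hash)
    {
            if (table && hash) {
                    unsigned int index = hash & table->mask;

                    /* Only a hint; preemption may migrate us mid-update. */
                    unsigned int cpu = raw_smp_processor_id();

                    if (table->ents[index] != cpu)
                            table->ents[index] = cpu;
            }
    }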
 
@@ -2645,6 +2710,55 @@ void netif_nit_deliver(struct sk_buff *skb)
        rcu_read_unlock();
 }
 
+static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
+                                             struct net_device *master)
+{
+       if (skb->pkt_type == PACKET_HOST) {
+               u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
+
+               memcpy(dest, master->dev_addr, ETH_ALEN);
+       }
+}
+
+/* On bonding slaves other than the currently active slave, suppress
+ * duplicates except for ETH_P_SLOW frames in 802.3ad mode, alb-mode
+ * non-mcast/bcast, and ARP on active-backup slaves with arp_validate.
+ */
+int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
+{
+       struct net_device *dev = skb->dev;
+
+       if (master->priv_flags & IFF_MASTER_ARPMON)
+               dev->last_rx = jiffies;
+
+       if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
+               /* Unmangle the destination address: the local destination
+                * address will always be the one the master has.  This
+                * gives correct behaviour when the slave is in a bridge.
+                */
+               skb_bond_set_mac_by_master(skb, master);
+       }
+
+       if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
+               if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
+                   skb->protocol == __cpu_to_be16(ETH_P_ARP))
+                       return 0;
+
+               if (master->priv_flags & IFF_MASTER_ALB) {
+                       if (skb->pkt_type != PACKET_BROADCAST &&
+                           skb->pkt_type != PACKET_MULTICAST)
+                               return 0;
+               }
+               if (master->priv_flags & IFF_MASTER_8023AD &&
+                   skb->protocol == __cpu_to_be16(ETH_P_SLOW))
+                       return 0;
+
+               return 1;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(__skb_bond_should_drop);
+
 static int __netif_receive_skb(struct sk_buff *skb)
 {
        struct packet_type *ptype, *pt_prev;
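__skb_bond_should_drop() is exported so the hot-path test can stay inline in netdevice.h; the wrapper that callers actually use is presumably just the NULL-master short-circuit:

    static inline int skb_bond_should_drop(struct sk_buff *skb,
                                           struct net_device *master)
    {
            if (master)
                    return __skb_bond_should_drop(skb, master);
            return 0;
    }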
@@ -2775,14 +2889,22 @@ out:
 int netif_receive_skb(struct sk_buff *skb)
 {
 #ifdef CONFIG_RPS
-       int cpu;
+       struct rps_dev_flow voidflow, *rflow = &voidflow;
+       int cpu, ret;
 
-       cpu = get_rps_cpu(skb->dev, skb);
+       rcu_read_lock();
 
-       if (cpu < 0)
-               return __netif_receive_skb(skb);
-       else
-               return enqueue_to_backlog(skb, cpu);
+       cpu = get_rps_cpu(skb->dev, skb, &rflow);
+
+       if (cpu >= 0) {
+               ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+               rcu_read_unlock();
+       } else {
+               rcu_read_unlock();
+               ret = __netif_receive_skb(skb);
+       }
+
+       return ret;
 #else
        return __netif_receive_skb(skb);
 #endif
@@ -2801,6 +2923,7 @@ static void flush_backlog(void *arg)
                if (skb->dev == dev) {
                        __skb_unlink(skb, &queue->input_pkt_queue);
                        kfree_skb(skb);
+                       incr_input_queue_head(queue);
                }
        rps_unlock(queue);
 }
@@ -3120,9 +3243,11 @@ static int process_backlog(struct napi_struct *napi, int quota)
                skb = __skb_dequeue(&queue->input_pkt_queue);
                if (!skb) {
                        __napi_complete(napi);
-                       spin_unlock_irq(&queue->input_pkt_queue.lock);
+                       rps_unlock(queue);
+                       local_irq_enable();
                        break;
                }
+               incr_input_queue_head(queue);
                rps_unlock(queue);
                local_irq_enable();
 
@@ -3968,140 +4093,6 @@ void dev_set_rx_mode(struct net_device *dev)
        netif_addr_unlock_bh(dev);
 }
 
-/* multicast addresses handling functions */
-
-int __dev_addr_delete(struct dev_addr_list **list, int *count,
-                     void *addr, int alen, int glbl)
-{
-       struct dev_addr_list *da;
-
-       for (; (da = *list) != NULL; list = &da->next) {
-               if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
-                   alen == da->da_addrlen) {
-                       if (glbl) {
-                               int old_glbl = da->da_gusers;
-                               da->da_gusers = 0;
-                               if (old_glbl == 0)
-                                       break;
-                       }
-                       if (--da->da_users)
-                               return 0;
-
-                       *list = da->next;
-                       kfree(da);
-                       (*count)--;
-                       return 0;
-               }
-       }
-       return -ENOENT;
-}
-
-int __dev_addr_add(struct dev_addr_list **list, int *count,
-                  void *addr, int alen, int glbl)
-{
-       struct dev_addr_list *da;
-
-       for (da = *list; da != NULL; da = da->next) {
-               if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
-                   da->da_addrlen == alen) {
-                       if (glbl) {
-                               int old_glbl = da->da_gusers;
-                               da->da_gusers = 1;
-                               if (old_glbl)
-                                       return 0;
-                       }
-                       da->da_users++;
-                       return 0;
-               }
-       }
-
-       da = kzalloc(sizeof(*da), GFP_ATOMIC);
-       if (da == NULL)
-               return -ENOMEM;
-       memcpy(da->da_addr, addr, alen);
-       da->da_addrlen = alen;
-       da->da_users = 1;
-       da->da_gusers = glbl ? 1 : 0;
-       da->next = *list;
-       *list = da;
-       (*count)++;
-       return 0;
-}
-
-
-int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
-                   struct dev_addr_list **from, int *from_count)
-{
-       struct dev_addr_list *da, *next;
-       int err = 0;
-
-       da = *from;
-       while (da != NULL) {
-               next = da->next;
-               if (!da->da_synced) {
-                       err = __dev_addr_add(to, to_count,
-                                            da->da_addr, da->da_addrlen, 0);
-                       if (err < 0)
-                               break;
-                       da->da_synced = 1;
-                       da->da_users++;
-               } else if (da->da_users == 1) {
-                       __dev_addr_delete(to, to_count,
-                                         da->da_addr, da->da_addrlen, 0);
-                       __dev_addr_delete(from, from_count,
-                                         da->da_addr, da->da_addrlen, 0);
-               }
-               da = next;
-       }
-       return err;
-}
-EXPORT_SYMBOL_GPL(__dev_addr_sync);
-
-void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
-                      struct dev_addr_list **from, int *from_count)
-{
-       struct dev_addr_list *da, *next;
-
-       da = *from;
-       while (da != NULL) {
-               next = da->next;
-               if (da->da_synced) {
-                       __dev_addr_delete(to, to_count,
-                                         da->da_addr, da->da_addrlen, 0);
-                       da->da_synced = 0;
-                       __dev_addr_delete(from, from_count,
-                                         da->da_addr, da->da_addrlen, 0);
-               }
-               da = next;
-       }
-}
-EXPORT_SYMBOL_GPL(__dev_addr_unsync);
-
-static void __dev_addr_discard(struct dev_addr_list **list)
-{
-       struct dev_addr_list *tmp;
-
-       while (*list != NULL) {
-               tmp = *list;
-               *list = tmp->next;
-               if (tmp->da_users > tmp->da_gusers)
-                       printk("__dev_addr_discard: address leakage! "
-                              "da_users=%d\n", tmp->da_users);
-               kfree(tmp);
-       }
-}
-
-void dev_addr_discard(struct net_device *dev)
-{
-       netif_addr_lock_bh(dev);
-
-       __dev_addr_discard(&dev->mc_list);
-       netdev_mc_count(dev) = 0;
-
-       netif_addr_unlock_bh(dev);
-}
-EXPORT_SYMBOL(dev_addr_discard);
-
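The removed __dev_addr_* helpers managed hand-rolled linked lists of multicast addresses; their callers move to the generic netdev_hw_addr_list API in net/core/dev_addr_lists.c, whose dev_mc_add_global()/dev_mc_del_global() variants in the ioctl hunk below preserve the old glbl refcounting semantics. A hypothetical driver-side use of the plain replacements:

    /* Hypothetical example: join/leave one multicast group via the new API. */
    static unsigned char example_mc_addr[ETH_ALEN] = {
            0x01, 0x00, 0x5e, 0x00, 0x00, 0x01
    };

    static int example_join(struct net_device *dev)
    {
            return dev_mc_add(dev, example_mc_addr); /* refcounted per caller */
    }

    static void example_leave(struct net_device *dev)
    {
            dev_mc_del(dev, example_mc_addr);
    }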
 /**
  *     dev_get_flags - get flags reported to userspace
  *     @dev: device
@@ -4412,8 +4403,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
                        return -EINVAL;
                if (!netif_device_present(dev))
                        return -ENODEV;
-               return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
-                                 dev->addr_len, 1);
+               return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
 
        case SIOCDELMULTI:
                if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
@@ -4421,8 +4411,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
                        return -EINVAL;
                if (!netif_device_present(dev))
                        return -ENODEV;
-               return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
-                                    dev->addr_len, 1);
+               return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
 
        case SIOCSIFTXQLEN:
                if (ifr->ifr_qlen < 0)
@@ -4730,7 +4719,7 @@ static void rollback_registered_many(struct list_head *head)
                 *      Flush the unicast and multicast chains
                 */
                dev_uc_flush(dev);
-               dev_addr_discard(dev);
+               dev_mc_flush(dev);
 
                if (dev->netdev_ops->ndo_uninit)
                        dev->netdev_ops->ndo_uninit(dev);
@@ -5310,6 +5299,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
        if (dev_addr_init(dev))
                goto free_rx;
 
+       dev_mc_init(dev);
        dev_uc_init(dev);
 
        dev_net_set(dev, &init_net);
@@ -5545,7 +5535,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
         *      Flush the unicast and multicast chains
         */
        dev_uc_flush(dev);
-       dev_addr_discard(dev);
+       dev_mc_flush(dev);
 
        netdev_unregister_kobject(dev);
 
@@ -5621,8 +5611,10 @@ static int dev_cpu_callback(struct notifier_block *nfb,
        local_irq_enable();
 
        /* Process offline CPU's input_pkt_queue */
-       while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
+       while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
                netif_rx(skb);
+               incr_input_queue_head(oldsd);
+       }
 
        return NOTIFY_OK;
 }