rfs: Receive Flow Steering
diff --git a/net/core/dev.c b/net/core/dev.c
index 74f77ca033497604bce3985ad44bef2ec8f8ed29..d7107ac835fad207da79a52e00227e6067d9e335 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -80,6 +80,7 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/hash.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/mutex.h>
 #include <linux/string.h>
@@ -1986,9 +1987,9 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
 {
        if (unlikely(queue_index >= dev->real_num_tx_queues)) {
                if (net_ratelimit()) {
-                       netdev_warn(dev, "selects TX queue %d, but "
-                            "real number of TX queues is %d\n",
-                            queue_index, dev->real_num_tx_queues);
+                       pr_warning("%s selects TX queue %d, but "
+                               "real number of TX queues is %d\n",
+                               dev->name, queue_index, dev->real_num_tx_queues);
                }
                return 0;
        }
@@ -2014,7 +2015,7 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
                        if (dev->real_num_tx_queues > 1)
                                queue_index = skb_tx_hash(dev, skb);
 
-                       if (sk && sk->sk_dst_cache)
+                       if (sk && rcu_dereference_check(sk->sk_dst_cache, 1))
                                sk_tx_queue_set(sk, queue_index);
                }
        }
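
The rcu_dereference_check(sk->sk_dst_cache, 1) above is an RCU annotation
fix rather than a behavioral change: the pointer is only tested against
NULL, never dereferenced, so the constant-true condition documents that
any calling context is acceptable and silences the lockdep-RCU warning.
A minimal sketch of the pattern, with hypothetical foo/bar types:

    #include <linux/types.h>
    #include <linux/rcupdate.h>

    struct bar;
    struct foo {
            struct bar *cache;      /* published with rcu_assign_pointer() */
    };

    static bool foo_has_cache(struct foo *f)
    {
            /* The value is only compared against NULL, so no RCU
             * read-side critical section is actually required here.
             */
            return rcu_dereference_check(f->cache, 1) != NULL;
    }
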
@@ -2202,29 +2203,37 @@ int weight_p __read_mostly = 64;            /* old backlog weight */
 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
 
 #ifdef CONFIG_RPS
+
+/* One global table that all flow-based protocols share. */
+struct rps_sock_flow_table *rps_sock_flow_table;
+EXPORT_SYMBOL(rps_sock_flow_table);
+
 /*
  * get_rps_cpu is called from netif_receive_skb and returns the target
  * CPU from the RPS map of the receiving queue for a given skb.
+ * rcu_read_lock must be held on entry.
  */
-static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+                      struct rps_dev_flow **rflowp)
 {
        struct ipv6hdr *ip6;
        struct iphdr *ip;
        struct netdev_rx_queue *rxqueue;
        struct rps_map *map;
+       struct rps_dev_flow_table *flow_table;
+       struct rps_sock_flow_table *sock_flow_table;
        int cpu = -1;
        u8 ip_proto;
+       u16 tcpu;
        u32 addr1, addr2, ports, ihl;
 
-       rcu_read_lock();
-
        if (skb_rx_queue_recorded(skb)) {
                u16 index = skb_get_rx_queue(skb);
                if (unlikely(index >= dev->num_rx_queues)) {
                        if (net_ratelimit()) {
-                               netdev_warn(dev, "received packet on queue "
-                                   "%u, but number of RX queues is %u\n",
-                                    index, dev->num_rx_queues);
+                               pr_warning("%s received packet on queue "
+                                       "%u, but number of RX queues is %u\n",
+                                       dev->name, index, dev->num_rx_queues);
                        }
                        goto done;
                }
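
For context: the new global above is the socket half of the RFS steering
state, a single power-of-two array of CPU hints indexed by flow hash and
filled in from the socket layer at recvmsg time. A sketch of what the
matching header change presumably looks like (layout inferred from the
mask/ents/RPS_NO_CPU uses elsewhere in this diff):

    struct rps_sock_flow_table {
            unsigned int mask;      /* table size minus one, power of two */
            u16 ents[0];            /* desired CPU per flow, or RPS_NO_CPU */
    };
    #define RPS_NO_CPU 0xffff

    /* Called when an application reads from a socket, recording the
     * consuming CPU as the steering hint for that flow's hash.
     */
    static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
                                            u32 hash)
    {
            if (table && hash) {
                    unsigned int cpu = raw_smp_processor_id();
                    unsigned int index = hash & table->mask;

                    /* Avoid dirtying the cache line if unchanged. */
                    if (table->ents[index] != cpu)
                            table->ents[index] = cpu;
            }
    }
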
@@ -2232,7 +2241,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
        } else
                rxqueue = dev->_rx;
 
-       if (!rxqueue->rps_map)
+       if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
                goto done;
 
        if (skb->rxhash)
@@ -2284,9 +2293,48 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
                skb->rxhash = 1;
 
 got_hash:
+       flow_table = rcu_dereference(rxqueue->rps_flow_table);
+       sock_flow_table = rcu_dereference(rps_sock_flow_table);
+       if (flow_table && sock_flow_table) {
+               u16 next_cpu;
+               struct rps_dev_flow *rflow;
+
+               rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
+               tcpu = rflow->cpu;
+
+               next_cpu = sock_flow_table->ents[skb->rxhash &
+                   sock_flow_table->mask];
+
+               /*
+                * If the desired CPU (where last recvmsg was done) is
+                * different from current CPU (one in the rx-queue flow
+                * table entry), switch if one of the following holds:
+                *   - Current CPU is unset (equal to RPS_NO_CPU).
+                *   - Current CPU is offline.
+                *   - The current CPU's queue tail has advanced beyond the
+                *     last packet that was enqueued using this table entry.
+                *     This guarantees that all previous packets for the flow
+                *     have been dequeued, thus preserving in-order delivery.
+                */
+               if (unlikely(tcpu != next_cpu) &&
+                   (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
+                    ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
+                     rflow->last_qtail)) >= 0)) {
+                       tcpu = rflow->cpu = next_cpu;
+                       if (tcpu != RPS_NO_CPU)
+                               rflow->last_qtail = per_cpu(softnet_data,
+                                   tcpu).input_queue_head;
+               }
+               if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
+                       *rflowp = rflow;
+                       cpu = tcpu;
+                       goto done;
+               }
+       }
+
        map = rcu_dereference(rxqueue->rps_map);
        if (map) {
-               u16 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
+               tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
 
                if (cpu_online(tcpu)) {
                        cpu = tcpu;
@@ -2295,7 +2343,6 @@ got_hash:
        }
 
 done:
-       rcu_read_unlock();
        return cpu;
 }
 
@@ -2321,13 +2368,14 @@ static void trigger_softirq(void *data)
        __napi_schedule(&queue->backlog);
        __get_cpu_var(netdev_rx_stat).received_rps++;
 }
-#endif /* CONFIG_SMP */
+#endif /* CONFIG_RPS */
 
 /*
  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
  * queue (may be a remote CPU queue).
  */
-static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
+static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
+                             unsigned int *qtail)
 {
        struct softnet_data *queue;
        unsigned long flags;
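
trigger_softirq() above runs on the remote CPU and merely schedules that
CPU's backlog NAPI. The sending side is not visible in this diff, but it
presumably kicks the remote CPU through the per-CPU call_single_data,
along these lines (a hedged sketch with a hypothetical function name,
assuming queue->csd was initialized with func = trigger_softirq and
info = queue):

    static void kick_remote_backlog(int cpu)
    {
            struct softnet_data *queue = &per_cpu(softnet_data, cpu);

            /* Raises an IPI; the remote CPU runs trigger_softirq() and
             * then drains its input_pkt_queue in NET_RX softirq context.
             */
            if (cpu_online(cpu))
                    __smp_call_function_single(cpu, &queue->csd, 0);
    }
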
@@ -2342,6 +2390,10 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
                if (queue->input_pkt_queue.qlen) {
 enqueue:
                        __skb_queue_tail(&queue->input_pkt_queue, skb);
+#ifdef CONFIG_RPS
+                       *qtail = queue->input_queue_head +
+                           queue->input_pkt_queue.qlen;
+#endif
                        rps_unlock(queue);
                        local_irq_restore(flags);
                        return NET_RX_SUCCESS;
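
The qtail bookkeeping added in this hunk maintains a simple invariant
between the two free-running counters, which is what makes the drain
test in get_rps_cpu() work:

    /*
     * With the backlog lock held:
     *
     *      total_enqueued == input_queue_head + input_pkt_queue.qlen
     *
     * so *qtail is the sequence number just past the packet being queued.
     * The flow may move CPUs once
     *      (int)(input_queue_head - last_qtail) >= 0,
     * i.e. once every packet queued up to and including that one has
     * been dequeued.
     */
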
@@ -2356,11 +2408,10 @@ enqueue:
 
                                cpu_set(cpu, rcpus->mask[rcpus->select]);
                                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
-                       } else
-                               __napi_schedule(&queue->backlog);
-#else
-                       __napi_schedule(&queue->backlog);
+                               goto enqueue;
+                       }
 #endif
+                       __napi_schedule(&queue->backlog);
                }
                goto enqueue;
        }
@@ -2391,7 +2442,7 @@ enqueue:
 
 int netif_rx(struct sk_buff *skb)
 {
-       int cpu;
+       int ret;
 
        /* if netpoll wants it, pretend we never saw it */
        if (netpoll_rx(skb))
@@ -2401,14 +2452,28 @@ int netif_rx(struct sk_buff *skb)
                net_timestamp(skb);
 
 #ifdef CONFIG_RPS
-       cpu = get_rps_cpu(skb->dev, skb);
-       if (cpu < 0)
-               cpu = smp_processor_id();
+       {
+               struct rps_dev_flow voidflow, *rflow = &voidflow;
+               int cpu;
+
+               rcu_read_lock();
+
+               cpu = get_rps_cpu(skb->dev, skb, &rflow);
+               if (cpu < 0)
+                       cpu = smp_processor_id();
+
+               ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+
+               rcu_read_unlock();
+       }
 #else
-       cpu = smp_processor_id();
+       {
+               unsigned int qtail;
+               ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
+               put_cpu();
+       }
 #endif
-
-       return enqueue_to_backlog(skb, cpu);
+       return ret;
 }
 EXPORT_SYMBOL(netif_rx);
 
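
The on-stack voidflow above exists so that rflow is never NULL: when
get_rps_cpu() matches a flow-table entry it repoints rflow at it, and
otherwise enqueue_to_backlog() harmlessly stores last_qtail into the
throwaway stack object. The general shape of the idiom, with
hypothetical names:

    static void record_position(unsigned int *out)
    {
            *out = 42;      /* callee may always store, no NULL check */
    }

    static void caller(bool steered, unsigned int *real_slot)
    {
            unsigned int scratch;

            record_position(steered ? real_slot : &scratch);
    }
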
@@ -2645,6 +2710,55 @@ void netif_nit_deliver(struct sk_buff *skb)
        rcu_read_unlock();
 }
 
+static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
+                                             struct net_device *master)
+{
+       if (skb->pkt_type == PACKET_HOST) {
+               u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
+
+               memcpy(dest, master->dev_addr, ETH_ALEN);
+       }
+}
+
+/* On bonding slaves other than the currently active slave, suppress
+ * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
+ * ARP on active-backup slaves with arp_validate enabled.
+ */
+int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
+{
+       struct net_device *dev = skb->dev;
+
+       if (master->priv_flags & IFF_MASTER_ARPMON)
+               dev->last_rx = jiffies;
+
+       if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
+               /* Undo the address mangling: the local destination
+                * address will always be the one the master has, which
+                * provides the right functionality in a bridge.
+                */
+               skb_bond_set_mac_by_master(skb, master);
+       }
+
+       if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
+               if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
+                   skb->protocol == __cpu_to_be16(ETH_P_ARP))
+                       return 0;
+
+               if (master->priv_flags & IFF_MASTER_ALB) {
+                       if (skb->pkt_type != PACKET_BROADCAST &&
+                           skb->pkt_type != PACKET_MULTICAST)
+                               return 0;
+               }
+               if (master->priv_flags & IFF_MASTER_8023AD &&
+                   skb->protocol == __cpu_to_be16(ETH_P_SLOW))
+                       return 0;
+
+               return 1;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(__skb_bond_should_drop);
+
 static int __netif_receive_skb(struct sk_buff *skb)
 {
        struct packet_type *ptype, *pt_prev;
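
__skb_bond_should_drop() is exported so the fast path in the header can
keep the common case cheap; the inline wrapper presumably looks roughly
like the following, filtering out the no-master case before paying for
the function call:

    static inline int skb_bond_should_drop(struct sk_buff *skb,
                                           struct net_device *master)
    {
            if (master)
                    return __skb_bond_should_drop(skb, master);
            return 0;
    }
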
@@ -2775,14 +2889,22 @@ out:
 int netif_receive_skb(struct sk_buff *skb)
 {
 #ifdef CONFIG_RPS
-       int cpu;
+       struct rps_dev_flow voidflow, *rflow = &voidflow;
+       int cpu, ret;
 
-       cpu = get_rps_cpu(skb->dev, skb);
+       rcu_read_lock();
 
-       if (cpu < 0)
-               return __netif_receive_skb(skb);
-       else
-               return enqueue_to_backlog(skb, cpu);
+       cpu = get_rps_cpu(skb->dev, skb, &rflow);
+
+       if (cpu >= 0) {
+               ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+               rcu_read_unlock();
+       } else {
+               rcu_read_unlock();
+               ret = __netif_receive_skb(skb);
+       }
+
+       return ret;
 #else
        return __netif_receive_skb(skb);
 #endif
@@ -2801,6 +2923,7 @@ static void flush_backlog(void *arg)
                if (skb->dev == dev) {
                        __skb_unlink(skb, &queue->input_pkt_queue);
                        kfree_skb(skb);
+                       incr_input_queue_head(queue);
                }
        rps_unlock(queue);
 }
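
incr_input_queue_head() is assumed to be a small helper added elsewhere
by this patch: every path that unlinks an skb from input_pkt_queue must
advance the dequeue counter, otherwise the drain test in get_rps_cpu()
would never let a flow migrate. Roughly:

    static inline void incr_input_queue_head(struct softnet_data *queue)
    {
    #ifdef CONFIG_RPS
            queue->input_queue_head++;
    #endif
    }
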
@@ -3121,8 +3244,10 @@ static int process_backlog(struct napi_struct *napi, int quota)
                if (!skb) {
                        __napi_complete(napi);
                        rps_unlock(queue);
+                       local_irq_enable();
                        break;
                }
+               incr_input_queue_head(queue);
                rps_unlock(queue);
                local_irq_enable();
 
@@ -5486,8 +5611,10 @@ static int dev_cpu_callback(struct notifier_block *nfb,
        local_irq_enable();
 
        /* Process offline CPU's input_pkt_queue */
-       while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
+       while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
                netif_rx(skb);
+               incr_input_queue_head(oldsd);
+       }
 
        return NOTIFY_OK;
 }
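
If the sysctl/sysfs pieces of this series follow the usual naming, the
feature stays inert until configured: the global socket-flow table is
sized via /proc/sys/net/core/rps_sock_flow_entries and each receive
queue's flow table via /sys/class/net/<dev>/queues/rx-<n>/rps_flow_cnt,
both of which default to zero (plain RPS behavior).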