Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
net/core/dev.c
index 1f466e82ac339359ffa0587f0e1e1f324c0f5dd7..42b200fdf12e42042fa30a45eec6ab33bc2bef8e 100644
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/stat.h>
-#include <linux/if_bridge.h>
-#include <linux/if_macvlan.h>
 #include <net/dst.h>
 #include <net/pkt_sched.h>
 #include <net/checksum.h>
 #include <linux/random.h>
 #include <trace/events/napi.h>
 #include <linux/pci.h>
+#include <linux/inetdevice.h>
 
 #include "net-sysfs.h"
 
@@ -373,6 +372,14 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  *                                                     --ANK (980803)
  */
 
+static inline struct list_head *ptype_head(const struct packet_type *pt)
+{
+       if (pt->type == htons(ETH_P_ALL))
+               return &ptype_all;
+       else
+               return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+}
+
 /**
  *     dev_add_pack - add packet handler
  *     @pt: packet type declaration
@@ -388,16 +395,11 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 
 void dev_add_pack(struct packet_type *pt)
 {
-       int hash;
+       struct list_head *head = ptype_head(pt);
 
-       spin_lock_bh(&ptype_lock);
-       if (pt->type == htons(ETH_P_ALL))
-               list_add_rcu(&pt->list, &ptype_all);
-       else {
-               hash = ntohs(pt->type) & PTYPE_HASH_MASK;
-               list_add_rcu(&pt->list, &ptype_base[hash]);
-       }
-       spin_unlock_bh(&ptype_lock);
+       spin_lock(&ptype_lock);
+       list_add_rcu(&pt->list, head);
+       spin_unlock(&ptype_lock);
 }
 EXPORT_SYMBOL(dev_add_pack);
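
A minimal sketch of how a protocol would register itself through this
path (my_rcv and my_packet_type are illustrative names, not part of
this patch); the .type field decides, via ptype_head(), whether the
handler lands on ptype_all or in the hash table:

    static int my_rcv(struct sk_buff *skb, struct net_device *dev,
                      struct packet_type *pt, struct net_device *orig_dev)
    {
            /* a real handler would parse the frame here */
            kfree_skb(skb);
            return NET_RX_SUCCESS;
    }

    static struct packet_type my_packet_type __read_mostly = {
            .type = cpu_to_be16(ETH_P_IP),  /* hashed by ptype_head() */
            .func = my_rcv,
    };

    /* e.g. from module init: */
    dev_add_pack(&my_packet_type);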
 
@@ -416,15 +418,10 @@ EXPORT_SYMBOL(dev_add_pack);
  */
 void __dev_remove_pack(struct packet_type *pt)
 {
-       struct list_head *head;
+       struct list_head *head = ptype_head(pt);
        struct packet_type *pt1;
 
-       spin_lock_bh(&ptype_lock);
-
-       if (pt->type == htons(ETH_P_ALL))
-               head = &ptype_all;
-       else
-               head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+       spin_lock(&ptype_lock);
 
        list_for_each_entry(pt1, head, list) {
                if (pt == pt1) {
@@ -435,7 +432,7 @@ void __dev_remove_pack(struct packet_type *pt)
 
        printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 out:
-       spin_unlock_bh(&ptype_lock);
+       spin_unlock(&ptype_lock);
 }
 EXPORT_SYMBOL(__dev_remove_pack);
 
@@ -803,35 +800,31 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 
 /**
- *     dev_get_by_flags - find any device with given flags
+ *     dev_get_by_flags_rcu - find any device with given flags
  *     @net: the applicable net namespace
  *     @if_flags: IFF_* values
  *     @mask: bitmask of bits in if_flags to check
  *
  *     Search for any interface with the given flags. Returns NULL if a device
- *     is not found or a pointer to the device. The device returned has
- *     had a reference added and the pointer is safe until the user calls
- *     dev_put to indicate they have finished with it.
+ *     is not found or a pointer to the device. The caller must hold
+ *     rcu_read_lock(); the returned device's refcount is not incremented.
  */
 
-struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
+struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
                                    unsigned short mask)
 {
        struct net_device *dev, *ret;
 
        ret = NULL;
-       rcu_read_lock();
        for_each_netdev_rcu(net, dev) {
                if (((dev->flags ^ if_flags) & mask) == 0) {
-                       dev_hold(dev);
                        ret = dev;
                        break;
                }
        }
-       rcu_read_unlock();
        return ret;
 }
-EXPORT_SYMBOL(dev_get_by_flags);
+EXPORT_SYMBOL(dev_get_by_flags_rcu);
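
A hedged usage sketch of the new variant: the caller now provides the
RCU read-side critical section and must not touch the pointer after
unlocking, since no reference is taken (any_iface_up is illustrative):

    static bool any_iface_up(struct net *net)
    {
            struct net_device *dev;
            bool up;

            rcu_read_lock();
            dev = dev_get_by_flags_rcu(net, IFF_UP, IFF_UP);
            up = (dev != NULL);
            rcu_read_unlock();      /* dev is not valid past this point */
            return up;
    }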
 
 /**
  *     dev_valid_name - check if name is okay for network device
@@ -1542,7 +1535,8 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
                                if (net_ratelimit())
                                        printk(KERN_CRIT "protocol %04x is "
                                               "buggy, dev %s\n",
-                                              skb2->protocol, dev->name);
+                                              ntohs(skb2->protocol),
+                                              dev->name);
                                skb_reset_network_header(skb2);
                        }
 
@@ -1907,14 +1901,14 @@ static int dev_gso_segment(struct sk_buff *skb)
 
 /*
  * Try to orphan skb early, right before transmission by the device.
- * We cannot orphan skb if tx timestamp is requested, since
- * drivers need to call skb_tstamp_tx() to send the timestamp.
+ * We cannot orphan the skb if a tx timestamp is requested or the sk
+ * reference is needed at the driver level for other reasons, e.g. see
+ * net/can/raw.c
  */
 static inline void skb_orphan_try(struct sk_buff *skb)
 {
        struct sock *sk = skb->sk;
 
-       if (sk && !skb_tx(skb)->flags) {
+       if (sk && !skb_shinfo(skb)->tx_flags) {
                /* skb_tx_hash() won't be able to get sk.
                 * We copy sk_hash into skb->rxhash
                 */
@@ -1924,6 +1918,22 @@ static inline void skb_orphan_try(struct sk_buff *skb)
        }
 }
 
+/*
+ * Returns true if either:
+ *     1. skb has frag_list and the device doesn't support FRAGLIST, or
+ *     2. skb is fragmented and the device does not support SG, or if
+ *        at least one of fragments is in highmem and device does not
+ *        support DMA from it.
+ */
+static inline int skb_needs_linearize(struct sk_buff *skb,
+                                     struct net_device *dev)
+{
+       return skb_is_nonlinear(skb) &&
+              ((skb_has_frag_list(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
+               (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
+                                             illegal_highdma(dev, skb))));
+}
+
 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
                        struct netdev_queue *txq)
 {
@@ -1948,6 +1958,22 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
                                goto out_kfree_skb;
                        if (skb->next)
                                goto gso;
+               } else {
+                       if (skb_needs_linearize(skb, dev) &&
+                           __skb_linearize(skb))
+                               goto out_kfree_skb;
+
+                       /* If packet is not checksummed and device does not
+                        * support checksumming for this protocol, complete
+                        * checksumming here.
+                        */
+                       if (skb->ip_summed == CHECKSUM_PARTIAL) {
+                               skb_set_transport_header(skb, skb->csum_start -
+                                             skb_headroom(skb));
+                               if (!dev_can_checksum(dev, skb) &&
+                                    skb_checksum_help(skb))
+                                       goto out_kfree_skb;
+                       }
                }
 
                rc = ops->ndo_start_xmit(skb, dev);
@@ -2031,16 +2057,16 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
                                        struct sk_buff *skb)
 {
        int queue_index;
-       struct sock *sk = skb->sk;
+       const struct net_device_ops *ops = dev->netdev_ops;
 
-       queue_index = sk_tx_queue_get(sk);
-       if (queue_index < 0) {
-               const struct net_device_ops *ops = dev->netdev_ops;
+       if (ops->ndo_select_queue) {
+               queue_index = ops->ndo_select_queue(dev, skb);
+               queue_index = dev_cap_txqueue(dev, queue_index);
+       } else {
+               struct sock *sk = skb->sk;
+               queue_index = sk_tx_queue_get(sk);
+               if (queue_index < 0) {
 
-               if (ops->ndo_select_queue) {
-                       queue_index = ops->ndo_select_queue(dev, skb);
-                       queue_index = dev_cap_txqueue(dev, queue_index);
-               } else {
                        queue_index = 0;
                        if (dev->real_num_tx_queues > 1)
                                queue_index = skb_tx_hash(dev, skb);
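
With this reordering, a driver's ndo_select_queue now takes precedence
over the queue index cached on the socket. A hedged sketch of such a
callback, against the u16 (*)(struct net_device *, struct sk_buff *)
signature used here (mydrv_select_queue is illustrative):

    static u16 mydrv_select_queue(struct net_device *dev, struct sk_buff *skb)
    {
            /* steer control traffic to the last queue, hash the rest */
            if (skb->priority == TC_PRIO_CONTROL)
                    return dev->real_num_tx_queues - 1;
            return skb_tx_hash(dev, skb);
    }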
@@ -2063,14 +2089,24 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                                 struct netdev_queue *txq)
 {
        spinlock_t *root_lock = qdisc_lock(q);
+       bool contended = qdisc_is_running(q);
        int rc;
 
+       /*
+        * Heuristic to force contended enqueues to serialize on a
+        * separate lock before trying to get the qdisc main lock.
+        * This permits the __QDISC_STATE_RUNNING owner to get the lock more
+        * often and dequeue packets faster.
+        */
+       if (unlikely(contended))
+               spin_lock(&q->busylock);
+
        spin_lock(root_lock);
        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
                kfree_skb(skb);
                rc = NET_XMIT_DROP;
        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
-                  !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
+                  qdisc_run_begin(q)) {
                /*
                 * This is a work-conserving queue; there are no old skbs
                 * waiting to be sent out; and the qdisc is not running -
@@ -2079,37 +2115,33 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
                        skb_dst_force(skb);
                __qdisc_update_bstats(q, skb->len);
-               if (sch_direct_xmit(skb, q, dev, txq, root_lock))
+               if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
+                       if (unlikely(contended)) {
+                               spin_unlock(&q->busylock);
+                               contended = false;
+                       }
                        __qdisc_run(q);
-               else
-                       clear_bit(__QDISC_STATE_RUNNING, &q->state);
+               } else
+                       qdisc_run_end(q);
 
                rc = NET_XMIT_SUCCESS;
        } else {
                skb_dst_force(skb);
                rc = qdisc_enqueue_root(skb, q);
-               qdisc_run(q);
+               if (qdisc_run_begin(q)) {
+                       if (unlikely(contended)) {
+                               spin_unlock(&q->busylock);
+                               contended = false;
+                       }
+                       __qdisc_run(q);
+               }
        }
        spin_unlock(root_lock);
-
+       if (unlikely(contended))
+               spin_unlock(&q->busylock);
        return rc;
 }
 
-/*
- * Returns true if either:
- *     1. skb has frag_list and the device doesn't support FRAGLIST, or
- *     2. skb is fragmented and the device does not support SG, or if
- *        at least one of fragments is in highmem and device does not
- *        support DMA from it.
- */
-static inline int skb_needs_linearize(struct sk_buff *skb,
-                                     struct net_device *dev)
-{
-       return (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
-              (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
-                                             illegal_highdma(dev, skb)));
-}
-
 /**
  *     dev_queue_xmit - transmit a buffer
  *     @skb: buffer to transmit
@@ -2142,25 +2174,6 @@ int dev_queue_xmit(struct sk_buff *skb)
        struct Qdisc *q;
        int rc = -ENOMEM;
 
-       /* GSO will handle the following emulations directly. */
-       if (netif_needs_gso(dev, skb))
-               goto gso;
-
-       /* Convert a paged skb to linear, if required */
-       if (skb_needs_linearize(skb, dev) && __skb_linearize(skb))
-               goto out_kfree_skb;
-
-       /* If packet is not checksummed and device does not support
-        * checksumming for this protocol, complete checksumming here.
-        */
-       if (skb->ip_summed == CHECKSUM_PARTIAL) {
-               skb_set_transport_header(skb, skb->csum_start -
-                                             skb_headroom(skb));
-               if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
-                       goto out_kfree_skb;
-       }
-
-gso:
        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
         */
@@ -2219,7 +2232,6 @@ gso:
        rc = -ENETDOWN;
        rcu_read_unlock_bh();
 
-out_kfree_skb:
        kfree_skb(skb);
        return rc;
 out:
@@ -2246,69 +2258,44 @@ static inline void ____napi_schedule(struct softnet_data *sd,
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
 }
 
-#ifdef CONFIG_RPS
-
-/* One global table that all flow-based protocols share. */
-struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
-EXPORT_SYMBOL(rps_sock_flow_table);
-
 /*
- * get_rps_cpu is called from netif_receive_skb and returns the target
- * CPU from the RPS map of the receiving queue for a given skb.
- * rcu_read_lock must be held on entry.
+ * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
+ * and src/dst port numbers. Returns a non-zero hash number on success
+ * and 0 on failure.
  */
-static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
-                      struct rps_dev_flow **rflowp)
+__u32 __skb_get_rxhash(struct sk_buff *skb)
 {
+       int nhoff, hash = 0, poff;
        struct ipv6hdr *ip6;
        struct iphdr *ip;
-       struct netdev_rx_queue *rxqueue;
-       struct rps_map *map;
-       struct rps_dev_flow_table *flow_table;
-       struct rps_sock_flow_table *sock_flow_table;
-       int cpu = -1;
        u8 ip_proto;
-       u16 tcpu;
        u32 addr1, addr2, ihl;
        union {
                u32 v32;
                u16 v16[2];
        } ports;
 
-       if (skb_rx_queue_recorded(skb)) {
-               u16 index = skb_get_rx_queue(skb);
-               if (unlikely(index >= dev->num_rx_queues)) {
-                       WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
-                               "on queue %u, but number of RX queues is %u\n",
-                               dev->name, index, dev->num_rx_queues);
-                       goto done;
-               }
-               rxqueue = dev->_rx + index;
-       } else
-               rxqueue = dev->_rx;
-
-       if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
-               goto done;
-
-       if (skb->rxhash)
-               goto got_hash; /* Skip hash computation on packet header */
+       nhoff = skb_network_offset(skb);
 
        switch (skb->protocol) {
        case __constant_htons(ETH_P_IP):
-               if (!pskb_may_pull(skb, sizeof(*ip)))
+               if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
                        goto done;
 
-               ip = (struct iphdr *) skb->data;
-               ip_proto = ip->protocol;
+               ip = (struct iphdr *) (skb->data + nhoff);
+               if (ip->frag_off & htons(IP_MF | IP_OFFSET))
+                       ip_proto = 0;
+               else
+                       ip_proto = ip->protocol;
                addr1 = (__force u32) ip->saddr;
                addr2 = (__force u32) ip->daddr;
                ihl = ip->ihl;
                break;
        case __constant_htons(ETH_P_IPV6):
-               if (!pskb_may_pull(skb, sizeof(*ip6)))
+               if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
                        goto done;
 
-               ip6 = (struct ipv6hdr *) skb->data;
+               ip6 = (struct ipv6hdr *) (skb->data + nhoff);
                ip_proto = ip6->nexthdr;
                addr1 = (__force u32) ip6->saddr.s6_addr32[3];
                addr2 = (__force u32) ip6->daddr.s6_addr32[3];
@@ -2317,33 +2304,80 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
        default:
                goto done;
        }
-       switch (ip_proto) {
-       case IPPROTO_TCP:
-       case IPPROTO_UDP:
-       case IPPROTO_DCCP:
-       case IPPROTO_ESP:
-       case IPPROTO_AH:
-       case IPPROTO_SCTP:
-       case IPPROTO_UDPLITE:
-               if (pskb_may_pull(skb, (ihl * 4) + 4)) {
-                       ports.v32 = * (__force u32 *) (skb->data + (ihl * 4));
+
+       ports.v32 = 0;
+       poff = proto_ports_offset(ip_proto);
+       if (poff >= 0) {
+               nhoff += ihl * 4 + poff;
+               if (pskb_may_pull(skb, nhoff + 4)) {
+                       ports.v32 = * (__force u32 *) (skb->data + nhoff);
                        if (ports.v16[1] < ports.v16[0])
                                swap(ports.v16[0], ports.v16[1]);
-                       break;
                }
-       default:
-               ports.v32 = 0;
-               break;
        }
 
        /* get a consistent hash (same value on both flow directions) */
        if (addr2 < addr1)
                swap(addr1, addr2);
-       skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
-       if (!skb->rxhash)
-               skb->rxhash = 1;
 
-got_hash:
+       hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
+       if (!hash)
+               hash = 1;
+
+done:
+       return hash;
+}
+EXPORT_SYMBOL(__skb_get_rxhash);
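
Callers are expected to go through the caching wrapper in skbuff.h
(used below in get_rps_cpu), which roughly computes the hash once and
stores it in skb->rxhash:

    static inline __u32 skb_get_rxhash(struct sk_buff *skb)
    {
            if (!skb->rxhash)
                    skb->rxhash = __skb_get_rxhash(skb);

            return skb->rxhash;
    }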
+
+#ifdef CONFIG_RPS
+
+/* One global table that all flow-based protocols share. */
+struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
+EXPORT_SYMBOL(rps_sock_flow_table);
+
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving queue for a given skb.
+ * rcu_read_lock must be held on entry.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+                      struct rps_dev_flow **rflowp)
+{
+       struct netdev_rx_queue *rxqueue;
+       struct rps_map *map = NULL;
+       struct rps_dev_flow_table *flow_table;
+       struct rps_sock_flow_table *sock_flow_table;
+       int cpu = -1;
+       u16 tcpu;
+
+       if (skb_rx_queue_recorded(skb)) {
+               u16 index = skb_get_rx_queue(skb);
+               if (unlikely(index >= dev->num_rx_queues)) {
+                       WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
+                               "on queue %u, but number of RX queues is %u\n",
+                               dev->name, index, dev->num_rx_queues);
+                       goto done;
+               }
+               rxqueue = dev->_rx + index;
+       } else
+               rxqueue = dev->_rx;
+
+       if (rxqueue->rps_map) {
+               map = rcu_dereference(rxqueue->rps_map);
+               if (map && map->len == 1) {
+                       tcpu = map->cpus[0];
+                       if (cpu_online(tcpu))
+                               cpu = tcpu;
+                       goto done;
+               }
+       } else if (!rxqueue->rps_flow_table) {
+               goto done;
+       }
+
+       skb_reset_network_header(skb);
+       if (!skb_get_rxhash(skb))
+               goto done;
+
        flow_table = rcu_dereference(rxqueue->rps_flow_table);
        sock_flow_table = rcu_dereference(rps_sock_flow_table);
        if (flow_table && sock_flow_table) {
@@ -2383,7 +2417,6 @@ got_hash:
                }
        }
 
-       map = rcu_dereference(rxqueue->rps_map);
        if (map) {
                tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
 
@@ -2504,6 +2537,7 @@ int netif_rx(struct sk_buff *skb)
                struct rps_dev_flow voidflow, *rflow = &voidflow;
                int cpu;
 
+               preempt_disable();
                rcu_read_lock();
 
                cpu = get_rps_cpu(skb->dev, skb, &rflow);
@@ -2513,6 +2547,7 @@ int netif_rx(struct sk_buff *skb)
                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 
                rcu_read_unlock();
+               preempt_enable();
        }
 #else
        {
@@ -2604,70 +2639,14 @@ static inline int deliver_skb(struct sk_buff *skb,
        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 }
 
-#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
-
-#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
+#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
+    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
 /* This hook is defined here for ATM LANE */
 int (*br_fdb_test_addr_hook)(struct net_device *dev,
                             unsigned char *addr) __read_mostly;
 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
 #endif
 
-/*
- * If bridge module is loaded call bridging hook.
- *  returns NULL if packet was consumed.
- */
-struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
-                                       struct sk_buff *skb) __read_mostly;
-EXPORT_SYMBOL_GPL(br_handle_frame_hook);
-
-static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
-                                           struct packet_type **pt_prev, int *ret,
-                                           struct net_device *orig_dev)
-{
-       struct net_bridge_port *port;
-
-       if (skb->pkt_type == PACKET_LOOPBACK ||
-           (port = rcu_dereference(skb->dev->br_port)) == NULL)
-               return skb;
-
-       if (*pt_prev) {
-               *ret = deliver_skb(skb, *pt_prev, orig_dev);
-               *pt_prev = NULL;
-       }
-
-       return br_handle_frame_hook(port, skb);
-}
-#else
-#define handle_bridge(skb, pt_prev, ret, orig_dev)     (skb)
-#endif
-
-#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
-struct sk_buff *(*macvlan_handle_frame_hook)(struct macvlan_port *p,
-                                            struct sk_buff *skb) __read_mostly;
-EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
-
-static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
-                                            struct packet_type **pt_prev,
-                                            int *ret,
-                                            struct net_device *orig_dev)
-{
-       struct macvlan_port *port;
-
-       port = rcu_dereference(skb->dev->macvlan_port);
-       if (!port)
-               return skb;
-
-       if (*pt_prev) {
-               *ret = deliver_skb(skb, *pt_prev, orig_dev);
-               *pt_prev = NULL;
-       }
-       return macvlan_handle_frame_hook(port, skb);
-}
-#else
-#define handle_macvlan(skb, pt_prev, ret, orig_dev)    (skb)
-#endif
-
 #ifdef CONFIG_NET_CLS_ACT
 /* TODO: Maybe we should just force sch_ingress to be compiled in
  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
@@ -2685,10 +2664,10 @@ static int ing_filter(struct sk_buff *skb)
        int result = TC_ACT_OK;
        struct Qdisc *q;
 
-       if (MAX_RED_LOOP < ttl++) {
-               printk(KERN_WARNING
-                      "Redir loop detected Dropping packet (%d->%d)\n",
-                      skb->skb_iif, dev->ifindex);
+       if (unlikely(MAX_RED_LOOP < ttl++)) {
+               if (net_ratelimit())
+                       pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
+                                  skb->skb_iif, dev->ifindex);
                return TC_ACT_SHOT;
        }
 
@@ -2718,9 +2697,6 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
-       } else {
-               /* Huh? Why does turning on AF_PACKET affect this? */
-               skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
        }
 
        switch (ing_filter(skb)) {
@@ -2763,6 +2739,51 @@ void netif_nit_deliver(struct sk_buff *skb)
        rcu_read_unlock();
 }
 
+/**
+ *     netdev_rx_handler_register - register receive handler
+ *     @dev: device to register a handler for
+ *     @rx_handler: receive handler to register
+ *     @rx_handler_data: data pointer that is used by rx handler
+ *
+ *     Register a receive handler for a device. This handler will then be
+ *     called from __netif_receive_skb. A negative errno code is returned
+ *     on a failure.
+ *
+ *     The caller must hold the rtnl_mutex.
+ */
+int netdev_rx_handler_register(struct net_device *dev,
+                              rx_handler_func_t *rx_handler,
+                              void *rx_handler_data)
+{
+       ASSERT_RTNL();
+
+       if (dev->rx_handler)
+               return -EBUSY;
+
+       rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
+       rcu_assign_pointer(dev->rx_handler, rx_handler);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
+
+/**
+ *     netdev_rx_handler_unregister - unregister receive handler
+ *     @dev: device to unregister a handler from
+ *
+ *     Unregister a receive handler from a device.
+ *
+ *     The caller must hold the rtnl_mutex.
+ */
+void netdev_rx_handler_unregister(struct net_device *dev)
+{
+       ASSERT_RTNL();
+       rcu_assign_pointer(dev->rx_handler, NULL);
+       rcu_assign_pointer(dev->rx_handler_data, NULL);
+}
+EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
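
A hedged sketch of how a stacking driver (bridge, macvlan) would use
these hooks; my_port, my_deliver, my_handle_frame and my_attach are
illustrative. Returning NULL from the handler tells __netif_receive_skb
that the packet was consumed:

    static struct sk_buff *my_handle_frame(struct sk_buff *skb)
    {
            struct my_port *port = rcu_dereference(skb->dev->rx_handler_data);

            my_deliver(port, skb);  /* hypothetical: consume the packet */
            return NULL;
    }

    static int my_attach(struct net_device *port_dev, struct my_port *port)
    {
            int err;

            rtnl_lock();
            err = netdev_rx_handler_register(port_dev, my_handle_frame, port);
            rtnl_unlock();
            return err;
    }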
+
 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
                                              struct net_device *master)
 {
@@ -2784,7 +2805,8 @@ int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
        if (master->priv_flags & IFF_MASTER_ARPMON)
                dev->last_rx = jiffies;
 
-       if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
+       if ((master->priv_flags & IFF_MASTER_ALB) &&
+           (master->priv_flags & IFF_BRIDGE_PORT)) {
                /* Do address unmangle. The local destination address
                 * will be always the one master has. Provides the right
                 * functionality in a bridge.
@@ -2815,6 +2837,7 @@ EXPORT_SYMBOL(__skb_bond_should_drop);
 static int __netif_receive_skb(struct sk_buff *skb)
 {
        struct packet_type *ptype, *pt_prev;
+       rx_handler_func_t *rx_handler;
        struct net_device *orig_dev;
        struct net_device *master;
        struct net_device *null_or_orig;
@@ -2825,8 +2848,8 @@ static int __netif_receive_skb(struct sk_buff *skb)
        if (!netdev_tstamp_prequeue)
                net_timestamp_check(skb);
 
-       if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
-               return NET_RX_SUCCESS;
+       if (vlan_tx_tag_present(skb))
+               vlan_hwaccel_do_receive(skb);
 
        /* if we've gotten here through NAPI, check netpoll */
        if (netpoll_receive_skb(skb))
@@ -2856,8 +2879,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
                        skb->dev = master;
        }
 
-       __get_cpu_var(softnet_data).processed++;
-
+       __this_cpu_inc(softnet_data.processed);
        skb_reset_network_header(skb);
        skb_reset_transport_header(skb);
        skb->mac_len = skb->network_header - skb->mac_header;
@@ -2889,12 +2911,17 @@ static int __netif_receive_skb(struct sk_buff *skb)
 ncls:
 #endif
 
-       skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
-       if (!skb)
-               goto out;
-       skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
-       if (!skb)
-               goto out;
+       /* Handle special case of bridge or macvlan */
+       rx_handler = rcu_dereference(skb->dev->rx_handler);
+       if (rx_handler) {
+               if (pt_prev) {
+                       ret = deliver_skb(skb, pt_prev, orig_dev);
+                       pt_prev = NULL;
+               }
+               skb = rx_handler(skb);
+               if (!skb)
+                       goto out;
+       }
 
        /*
         * Make sure frames received on VLAN interfaces stacked on
@@ -2955,6 +2982,9 @@ int netif_receive_skb(struct sk_buff *skb)
        if (netdev_tstamp_prequeue)
                net_timestamp_check(skb);
 
+       if (skb_defer_rx_timestamp(skb))
+               return NET_RX_SUCCESS;
+
 #ifdef CONFIG_RPS
        {
                struct rps_dev_flow voidflow, *rflow = &voidflow;
@@ -3040,7 +3070,7 @@ out:
        return netif_receive_skb(skb);
 }
 
-static void napi_gro_flush(struct napi_struct *napi)
+inline void napi_gro_flush(struct napi_struct *napi)
 {
        struct sk_buff *skb, *next;
 
@@ -3053,6 +3083,7 @@ static void napi_gro_flush(struct napi_struct *napi)
        napi->gro_count = 0;
        napi->gro_list = NULL;
 }
+EXPORT_SYMBOL(napi_gro_flush);
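
Exporting napi_gro_flush lets drivers that complete NAPI by hand push
any held GRO packets up the stack first. A hedged sketch (mydrv_poll
and mydrv_clean_rx are illustrative):

    static int mydrv_poll(struct napi_struct *napi, int budget)
    {
            int work = mydrv_clean_rx(napi, budget);        /* hypothetical */

            if (work < budget) {
                    unsigned long flags;

                    napi_gro_flush(napi);
                    local_irq_save(flags);
                    __napi_complete(napi);
                    local_irq_restore(flags);
            }
            return work;
    }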
 
 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
@@ -3064,10 +3095,10 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
        int mac_len;
        enum gro_result ret;
 
-       if (!(skb->dev->features & NETIF_F_GRO))
+       if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
                goto normal;
 
-       if (skb_is_gso(skb) || skb_has_frags(skb))
+       if (skb_is_gso(skb) || skb_has_frag_list(skb))
                goto normal;
 
        rcu_read_lock();
@@ -3133,7 +3164,7 @@ pull:
                        put_page(skb_shinfo(skb)->frags[0].page);
                        memmove(skb_shinfo(skb)->frags,
                                skb_shinfo(skb)->frags + 1,
-                               --skb_shinfo(skb)->nr_frags);
+                               --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
                }
        }
 
@@ -3146,19 +3177,18 @@ normal:
 }
 EXPORT_SYMBOL(dev_gro_receive);
 
-static gro_result_t
+static inline gro_result_t
 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
        struct sk_buff *p;
 
-       if (netpoll_rx_on(skb))
-               return GRO_NORMAL;
-
        for (p = napi->gro_list; p; p = p->next) {
-               NAPI_GRO_CB(p)->same_flow =
-                       (p->dev == skb->dev) &&
-                       !compare_ether_header(skb_mac_header(p),
+               unsigned long diffs;
+
+               diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
+               diffs |= compare_ether_header(skb_mac_header(p),
                                              skb_gro_mac_header(skb));
+               NAPI_GRO_CB(p)->same_flow = !diffs;
                NAPI_GRO_CB(p)->flush = 0;
        }
 
@@ -3719,10 +3749,11 @@ void dev_seq_stop(struct seq_file *seq, void *v)
 
 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
 {
-       const struct net_device_stats *stats = dev_get_stats(dev);
+       struct rtnl_link_stats64 temp;
+       const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
 
-       seq_printf(seq, "%6s: %7lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
-                  "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
+       seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
+                  "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
                   dev->name, stats->rx_bytes, stats->rx_packets,
                   stats->rx_errors,
                   stats->rx_dropped + stats->rx_missed_errors,
@@ -4837,7 +4868,7 @@ static void rollback_registered_many(struct list_head *head)
        dev = list_first_entry(head, struct net_device, unreg_list);
        call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
 
-       synchronize_net();
+       rcu_barrier();
 
        list_for_each_entry(dev, head, unreg_list)
                dev_put(dev);
@@ -4933,6 +4964,34 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 }
 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 
+static int netif_alloc_rx_queues(struct net_device *dev)
+{
+#ifdef CONFIG_RPS
+       unsigned int i, count = dev->num_rx_queues;
+
+       if (count) {
+               struct netdev_rx_queue *rx;
+
+               rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
+               if (!rx) {
+                       pr_err("netdev: Unable to allocate %u rx queues.\n",
+                              count);
+                       return -ENOMEM;
+               }
+               dev->_rx = rx;
+               atomic_set(&rx->count, count);
+
+               /*
+                * Set a pointer to first element in the array which holds the
+                * reference count.
+                */
+               for (i = 0; i < count; i++)
+                       rx[i].first = rx;
+       }
+#endif
+       return 0;
+}
+
 /**
  *     register_netdevice      - register a network device
  *     @dev: device to register
@@ -4970,24 +5029,10 @@ int register_netdevice(struct net_device *dev)
 
        dev->iflink = -1;
 
-#ifdef CONFIG_RPS
-       if (!dev->num_rx_queues) {
-               /*
-                * Allocate a single RX queue if driver never called
-                * alloc_netdev_mq
-                */
-
-               dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
-               if (!dev->_rx) {
-                       ret = -ENOMEM;
-                       goto out;
-               }
+       ret = netif_alloc_rx_queues(dev);
+       if (ret)
+               goto out;
 
-               dev->_rx->first = dev->_rx;
-               atomic_set(&dev->_rx->count, 1);
-               dev->num_rx_queues = 1;
-       }
-#endif
        /* Init, if this function is available */
        if (dev->netdev_ops->ndo_init) {
                ret = dev->netdev_ops->ndo_init(dev);
@@ -5027,6 +5072,12 @@ int register_netdevice(struct net_device *dev)
        if (dev->features & NETIF_F_SG)
                dev->features |= NETIF_F_GSO;
 
+       /* Enable GRO and NETIF_F_HIGHDMA for vlans by default;
+        * vlan_dev_init() checks dev->features, so these features are
+        * enabled only if supported by the underlying device.
+        */
+       dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
+
        ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
        ret = notifier_to_errno(ret);
        if (ret)
@@ -5256,7 +5307,7 @@ void netdev_run_todo(void)
 
                /* paranoia */
                BUG_ON(atomic_read(&dev->refcnt));
-               WARN_ON(dev->ip_ptr);
+               WARN_ON(rcu_dereference_raw(dev->ip_ptr));
                WARN_ON(dev->ip6_ptr);
                WARN_ON(dev->dn_ptr);
 
@@ -5271,20 +5322,22 @@ void netdev_run_todo(void)
 /**
  *     dev_txq_stats_fold - fold tx_queues stats
  *     @dev: device to get statistics from
- *     @stats: struct net_device_stats to hold results
+ *     @stats: struct rtnl_link_stats64 to hold results
  */
 void dev_txq_stats_fold(const struct net_device *dev,
-                       struct net_device_stats *stats)
+                       struct rtnl_link_stats64 *stats)
 {
-       unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
+       u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
        unsigned int i;
        struct netdev_queue *txq;
 
        for (i = 0; i < dev->num_tx_queues; i++) {
                txq = netdev_get_tx_queue(dev, i);
+               spin_lock_bh(&txq->_xmit_lock);
                tx_bytes   += txq->tx_bytes;
                tx_packets += txq->tx_packets;
                tx_dropped += txq->tx_dropped;
+               spin_unlock_bh(&txq->_xmit_lock);
        }
        if (tx_bytes || tx_packets || tx_dropped) {
                stats->tx_bytes   = tx_bytes;
@@ -5294,23 +5347,53 @@ void dev_txq_stats_fold(const struct net_device *dev,
 }
 EXPORT_SYMBOL(dev_txq_stats_fold);
 
+/* Convert net_device_stats to rtnl_link_stats64.  They have the same
+ * fields in the same order, with only the type differing.
+ */
+static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
+                                   const struct net_device_stats *netdev_stats)
+{
+#if BITS_PER_LONG == 64
+       BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
+       memcpy(stats64, netdev_stats, sizeof(*stats64));
+#else
+       size_t i, n = sizeof(*stats64) / sizeof(u64);
+       const unsigned long *src = (const unsigned long *)netdev_stats;
+       u64 *dst = (u64 *)stats64;
+
+       BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
+                    sizeof(*stats64) / sizeof(u64));
+       for (i = 0; i < n; i++)
+               dst[i] = src[i];
+#endif
+}
+
 /**
  *     dev_get_stats   - get network device statistics
  *     @dev: device to get statistics from
+ *     @storage: place to store stats
  *
- *     Get network statistics from device. The device driver may provide
- *     its own method by setting dev->netdev_ops->get_stats; otherwise
- *     the internal statistics structure is used.
+ *     Get network statistics from device. Return @storage.
+ *     The device driver may provide its own method by setting
+ *     dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
+ *     otherwise the internal statistics structure is used.
  */
-const struct net_device_stats *dev_get_stats(struct net_device *dev)
+struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
+                                       struct rtnl_link_stats64 *storage)
 {
        const struct net_device_ops *ops = dev->netdev_ops;
 
-       if (ops->ndo_get_stats)
-               return ops->ndo_get_stats(dev);
-
-       dev_txq_stats_fold(dev, &dev->stats);
-       return &dev->stats;
+       if (ops->ndo_get_stats64) {
+               memset(storage, 0, sizeof(*storage));
+               return ops->ndo_get_stats64(dev, storage);
+       }
+       if (ops->ndo_get_stats) {
+               netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
+               return storage;
+       }
+       netdev_stats_to_stats64(storage, &dev->stats);
+       dev_txq_stats_fold(dev, storage);
+       return storage;
 }
 EXPORT_SYMBOL(dev_get_stats);
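
With the new signature, a driver implementing ndo_get_stats64 fills the
caller-provided block and returns it, so concurrent readers each get a
private snapshot instead of sharing dev->stats. A hedged driver-side
sketch (mydrv_priv and its fields are illustrative):

    static struct rtnl_link_stats64 *
    mydrv_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *storage)
    {
            struct mydrv_priv *priv = netdev_priv(dev);

            storage->rx_packets = priv->rx_packets;
            storage->rx_bytes   = priv->rx_bytes;
            storage->tx_packets = priv->tx_packets;
            storage->tx_bytes   = priv->tx_bytes;
            return storage;
    }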
 
@@ -5346,10 +5429,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
        struct net_device *dev;
        size_t alloc_size;
        struct net_device *p;
-#ifdef CONFIG_RPS
-       struct netdev_rx_queue *rx;
-       int i;
-#endif
 
        BUG_ON(strlen(name) >= sizeof(dev->name));
 
@@ -5375,29 +5454,12 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
                goto free_p;
        }
 
-#ifdef CONFIG_RPS
-       rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
-       if (!rx) {
-               printk(KERN_ERR "alloc_netdev: Unable to allocate "
-                      "rx queues.\n");
-               goto free_tx;
-       }
-
-       atomic_set(&rx->count, queue_count);
-
-       /*
-        * Set a pointer to first element in the array which holds the
-        * reference count.
-        */
-       for (i = 0; i < queue_count; i++)
-               rx[i].first = rx;
-#endif
 
        dev = PTR_ALIGN(p, NETDEV_ALIGN);
        dev->padded = (char *)dev - (char *)p;
 
        if (dev_addr_init(dev))
-               goto free_rx;
+               goto free_tx;
 
        dev_mc_init(dev);
        dev_uc_init(dev);
@@ -5409,7 +5471,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
        dev->real_num_tx_queues = queue_count;
 
 #ifdef CONFIG_RPS
-       dev->_rx = rx;
        dev->num_rx_queues = queue_count;
 #endif
 
@@ -5427,11 +5488,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
        strcpy(dev->name, name);
        return dev;
 
-free_rx:
-#ifdef CONFIG_RPS
-       kfree(rx);
 free_tx:
-#endif
        kfree(tx);
 free_p:
        kfree(p);
@@ -5618,6 +5675,10 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 
        /* Notify protocols, that we are about to destroy
           this device. They should clean all the things.
+
+          Note that dev->reg_state stays at NETREG_REGISTERED.
+          This is deliberate: it lets 8021q and macvlan see that the
+          device is just moving and keep their slaves up.
        */
        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
        call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
@@ -5815,6 +5876,68 @@ char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
        return buffer;
 }
 
+static int __netdev_printk(const char *level, const struct net_device *dev,
+                          struct va_format *vaf)
+{
+       int r;
+
+       if (dev && dev->dev.parent)
+               r = dev_printk(level, dev->dev.parent, "%s: %pV",
+                              netdev_name(dev), vaf);
+       else if (dev)
+               r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
+       else
+               r = printk("%s(NULL net_device): %pV", level, vaf);
+
+       return r;
+}
+
+int netdev_printk(const char *level, const struct net_device *dev,
+                 const char *format, ...)
+{
+       struct va_format vaf;
+       va_list args;
+       int r;
+
+       va_start(args, format);
+
+       vaf.fmt = format;
+       vaf.va = &args;
+
+       r = __netdev_printk(level, dev, &vaf);
+       va_end(args);
+
+       return r;
+}
+EXPORT_SYMBOL(netdev_printk);
+
+#define define_netdev_printk_level(func, level)                        \
+int func(const struct net_device *dev, const char *fmt, ...)   \
+{                                                              \
+       int r;                                                  \
+       struct va_format vaf;                                   \
+       va_list args;                                           \
+                                                               \
+       va_start(args, fmt);                                    \
+                                                               \
+       vaf.fmt = fmt;                                          \
+       vaf.va = &args;                                         \
+                                                               \
+       r = __netdev_printk(level, dev, &vaf);                  \
+       va_end(args);                                           \
+                                                               \
+       return r;                                               \
+}                                                              \
+EXPORT_SYMBOL(func);
+
+define_netdev_printk_level(netdev_emerg, KERN_EMERG);
+define_netdev_printk_level(netdev_alert, KERN_ALERT);
+define_netdev_printk_level(netdev_crit, KERN_CRIT);
+define_netdev_printk_level(netdev_err, KERN_ERR);
+define_netdev_printk_level(netdev_warn, KERN_WARNING);
+define_netdev_printk_level(netdev_notice, KERN_NOTICE);
+define_netdev_printk_level(netdev_info, KERN_INFO);
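
Typical usage of the generated helpers; the prefix comes from
dev_printk() on the parent device when one is set (mydrv_tx_timeout is
illustrative):

    static void mydrv_tx_timeout(struct net_device *dev)
    {
            netdev_err(dev, "TX watchdog timeout\n");
            /* prints e.g. "mydrv 0000:01:00.0 eth0: TX watchdog timeout" */
    }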
+
 static void __net_exit netdev_exit(struct net *net)
 {
        kfree(net->dev_name_head);