]> bbs.cooldavid.org Git - net-next-2.6.git/blobdiff - net/ipv4/route.c
ipv4: add __rcu annotations to routes.c
[net-next-2.6.git] / net / ipv4 / route.c
index ac6559cb54f9f650986e4dd86996349e328cd92a..987bf9adb31833c19a0db04ce76060306d8e6994 100644 (file)
@@ -159,7 +159,6 @@ static struct dst_ops ipv4_dst_ops = {
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .local_out =            __ip_local_out,
-       .entries =              ATOMIC_INIT(0),
 };
 
 #define ECN_OR_COST(class)     TC_PRIO_##class
@@ -199,7 +198,7 @@ const __u8 ip_tos2prio[16] = {
  */
 
 struct rt_hash_bucket {
-       struct rtable   *chain;
+       struct rtable __rcu     *chain;
 };
 
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
@@ -281,7 +280,7 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
        struct rtable *r = NULL;
 
        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
-               if (!rt_hash_table[st->bucket].chain)
+               if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
                        continue;
                rcu_read_lock_bh();
                r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
@@ -301,17 +300,17 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 {
        struct rt_cache_iter_state *st = seq->private;
 
-       r = r->dst.rt_next;
+       r = rcu_dereference_bh(r->dst.rt_next);
        while (!r) {
                rcu_read_unlock_bh();
                do {
                        if (--st->bucket < 0)
                                return NULL;
-               } while (!rt_hash_table[st->bucket].chain);
+               } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
                rcu_read_lock_bh();
-               r = rt_hash_table[st->bucket].chain;
+               r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
        }
-       return rcu_dereference_bh(r);
+       return r;
 }
 
 static struct rtable *rt_cache_get_next(struct seq_file *seq,
@@ -466,7 +465,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 
        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
-                  atomic_read(&ipv4_dst_ops.entries),
+                  dst_entries_get_slow(&ipv4_dst_ops),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
@@ -722,19 +721,23 @@ static void rt_do_flush(int process_context)
        for (i = 0; i <= rt_hash_mask; i++) {
                if (process_context && need_resched())
                        cond_resched();
-               rth = rt_hash_table[i].chain;
+               rth = rcu_dereference_raw(rt_hash_table[i].chain);
                if (!rth)
                        continue;
 
                spin_lock_bh(rt_hash_lock_addr(i));
 #ifdef CONFIG_NET_NS
                {
-               struct rtable ** prev, * p;
+               struct rtable __rcu **prev;
+               struct rtable *p;
 
-               rth = rt_hash_table[i].chain;
+               rth = rcu_dereference_protected(rt_hash_table[i].chain,
+                       lockdep_is_held(rt_hash_lock_addr(i)));
 
                /* defer releasing the head of the list after spin_unlock */
-               for (tail = rth; tail; tail = tail->dst.rt_next)
+               for (tail = rth; tail;
+                    tail = rcu_dereference_protected(tail->dst.rt_next,
+                               lockdep_is_held(rt_hash_lock_addr(i))))
                        if (!rt_is_expired(tail))
                                break;
                if (rth != tail)
@@ -742,8 +745,12 @@ static void rt_do_flush(int process_context)
 
                /* call rt_free on entries after the tail requiring flush */
                prev = &rt_hash_table[i].chain;
-               for (p = *prev; p; p = next) {
-                       next = p->dst.rt_next;
+               for (p = rcu_dereference_protected(*prev,
+                               lockdep_is_held(rt_hash_lock_addr(i)));
+                    p != NULL;
+                    p = next) {
+                       next = rcu_dereference_protected(p->dst.rt_next,
+                               lockdep_is_held(rt_hash_lock_addr(i)));
                        if (!rt_is_expired(p)) {
                                prev = &p->dst.rt_next;
                        } else {
@@ -753,14 +760,15 @@ static void rt_do_flush(int process_context)
                }
                }
 #else
-               rth = rt_hash_table[i].chain;
-               rt_hash_table[i].chain = NULL;
+               rth = rcu_dereference_protected(rt_hash_table[i].chain,
+                       lockdep_is_held(rt_hash_lock_addr(i)));
+               rcu_assign_pointer(rt_hash_table[i].chain, NULL);
                tail = NULL;
 #endif
                spin_unlock_bh(rt_hash_lock_addr(i));
 
                for (; rth != tail; rth = next) {
-                       next = rth->dst.rt_next;
+                       next = rcu_dereference_protected(rth->dst.rt_next, 1);
                        rt_free(rth);
                }
        }
@@ -791,7 +799,7 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
        while (aux != rth) {
                if (compare_hash_inputs(&aux->fl, &rth->fl))
                        return 0;
-               aux = aux->dst.rt_next;
+               aux = rcu_dereference_protected(aux->dst.rt_next, 1);
        }
        return ONE;
 }
@@ -800,7 +808,8 @@ static void rt_check_expire(void)
 {
        static unsigned int rover;
        unsigned int i = rover, goal;
-       struct rtable *rth, **rthp;
+       struct rtable *rth;
+       struct rtable __rcu **rthp;
        unsigned long samples = 0;
        unsigned long sum = 0, sum2 = 0;
        unsigned long delta;
@@ -826,11 +835,12 @@ static void rt_check_expire(void)
 
                samples++;
 
-               if (*rthp == NULL)
+               if (rcu_dereference_raw(*rthp) == NULL)
                        continue;
                length = 0;
                spin_lock_bh(rt_hash_lock_addr(i));
-               while ((rth = *rthp) != NULL) {
+               while ((rth = rcu_dereference_protected(*rthp,
+                                       lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
                        prefetch(rth->dst.rt_next);
                        if (rt_is_expired(rth)) {
                                *rthp = rth->dst.rt_next;
@@ -942,9 +952,11 @@ static int rt_garbage_collect(struct dst_ops *ops)
        static unsigned long last_gc;
        static int rover;
        static int equilibrium;
-       struct rtable *rth, **rthp;
+       struct rtable *rth;
+       struct rtable __rcu **rthp;
        unsigned long now = jiffies;
        int goal;
+       int entries = dst_entries_get_fast(&ipv4_dst_ops);
 
        /*
         * Garbage collection is pretty expensive,
@@ -954,28 +966,28 @@ static int rt_garbage_collect(struct dst_ops *ops)
        RT_CACHE_STAT_INC(gc_total);
 
        if (now - last_gc < ip_rt_gc_min_interval &&
-           atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
+           entries < ip_rt_max_size) {
                RT_CACHE_STAT_INC(gc_ignored);
                goto out;
        }
 
+       entries = dst_entries_get_slow(&ipv4_dst_ops);
        /* Calculate number of entries, which we want to expire now. */
-       goal = atomic_read(&ipv4_dst_ops.entries) -
-               (ip_rt_gc_elasticity << rt_hash_log);
+       goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
        if (goal <= 0) {
                if (equilibrium < ipv4_dst_ops.gc_thresh)
                        equilibrium = ipv4_dst_ops.gc_thresh;
-               goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+               goal = entries - equilibrium;
                if (goal > 0) {
                        equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-                       goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+                       goal = entries - equilibrium;
                }
        } else {
                /* We are in dangerous area. Try to reduce cache really
                 * aggressively.
                 */
                goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-               equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
+               equilibrium = entries - goal;
        }
 
        if (now - last_gc >= ip_rt_gc_min_interval)
@@ -995,7 +1007,8 @@ static int rt_garbage_collect(struct dst_ops *ops)
                        k = (k + 1) & rt_hash_mask;
                        rthp = &rt_hash_table[k].chain;
                        spin_lock_bh(rt_hash_lock_addr(k));
-                       while ((rth = *rthp) != NULL) {
+                       while ((rth = rcu_dereference_protected(*rthp,
+                                       lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
                                if (!rt_is_expired(rth) &&
                                        !rt_may_expire(rth, tmo, expire)) {
                                        tmo >>= 1;
@@ -1032,14 +1045,16 @@ static int rt_garbage_collect(struct dst_ops *ops)
                expire >>= 1;
 #if RT_CACHE_DEBUG >= 2
                printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
-                               atomic_read(&ipv4_dst_ops.entries), goal, i);
+                               dst_entries_get_fast(&ipv4_dst_ops), goal, i);
 #endif
 
-               if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+               if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
                        goto out;
        } while (!in_softirq() && time_before_eq(jiffies, now));
 
-       if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+       if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
+               goto out;
+       if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
                goto out;
        if (net_ratelimit())
                printk(KERN_WARNING "dst cache overflow\n");
@@ -1049,11 +1064,12 @@ static int rt_garbage_collect(struct dst_ops *ops)
 work_done:
        expire += ip_rt_gc_min_interval;
        if (expire > ip_rt_gc_timeout ||
-           atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
+           dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
+           dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
                expire = ip_rt_gc_timeout;
 #if RT_CACHE_DEBUG >= 2
        printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
-                       atomic_read(&ipv4_dst_ops.entries), goal, rover);
+                       dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
 #endif
 out:   return 0;
 }
@@ -1068,7 +1084,7 @@ static int slow_chain_length(const struct rtable *head)
 
        while (rth) {
                length += has_noalias(head, rth);
-               rth = rth->dst.rt_next;
+               rth = rcu_dereference_protected(rth->dst.rt_next, 1);
        }
        return length >> FRACT_BITS;
 }
@@ -1076,9 +1092,9 @@ static int slow_chain_length(const struct rtable *head)
 static int rt_intern_hash(unsigned hash, struct rtable *rt,
                          struct rtable **rp, struct sk_buff *skb, int ifindex)
 {
-       struct rtable   *rth, **rthp;
+       struct rtable   *rth, *cand;
+       struct rtable __rcu **rthp, **candp;
        unsigned long   now;
-       struct rtable *cand, **candp;
        u32             min_score;
        int             chain_length;
        int attempts = !in_softirq();
@@ -1102,30 +1118,31 @@ restart:
                 * Note that we do rt_free on this new route entry, so that
                 * once its refcount hits zero, we are still able to reap it
                 * (Thanks Alexey)
-                * Note also the rt_free uses call_rcu.  We don't actually
-                * need rcu protection here, this is just our path to get
-                * on the route gc list.
+                * Note: To avoid expensive rcu stuff for this uncached dst,
+                * we set DST_NOCACHE so that dst_release() can free dst without
+                * waiting a grace period.
                 */
 
+               rt->dst.flags |= DST_NOCACHE;
                if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
                        int err = arp_bind_neighbour(&rt->dst);
                        if (err) {
                                if (net_ratelimit())
                                        printk(KERN_WARNING
                                            "Neighbour table failure & not caching routes.\n");
-                               rt_drop(rt);
+                               ip_rt_put(rt);
                                return err;
                        }
                }
 
-               rt_free(rt);
                goto skip_hashing;
        }
 
        rthp = &rt_hash_table[hash].chain;
 
        spin_lock_bh(rt_hash_lock_addr(hash));
-       while ((rth = *rthp) != NULL) {
+       while ((rth = rcu_dereference_protected(*rthp,
+                       lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
                if (rt_is_expired(rth)) {
                        *rthp = rth->dst.rt_next;
                        rt_free(rth);
@@ -1268,18 +1285,11 @@ skip_hashing:
 
 void rt_bind_peer(struct rtable *rt, int create)
 {
-       static DEFINE_SPINLOCK(rt_peer_lock);
        struct inet_peer *peer;
 
        peer = inet_getpeer(rt->rt_dst, create);
 
-       spin_lock_bh(&rt_peer_lock);
-       if (rt->peer == NULL) {
-               rt->peer = peer;
-               peer = NULL;
-       }
-       spin_unlock_bh(&rt_peer_lock);
-       if (peer)
+       if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
                inet_putpeer(peer);
 }
 
@@ -1328,12 +1338,14 @@ EXPORT_SYMBOL(__ip_select_ident);
 
 static void rt_del(unsigned hash, struct rtable *rt)
 {
-       struct rtable **rthp, *aux;
+       struct rtable __rcu **rthp;
+       struct rtable *aux;
 
        rthp = &rt_hash_table[hash].chain;
        spin_lock_bh(rt_hash_lock_addr(hash));
        ip_rt_put(rt);
-       while ((aux = *rthp) != NULL) {
+       while ((aux = rcu_dereference_protected(*rthp,
+                       lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
                if (aux == rt || rt_is_expired(aux)) {
                        *rthp = aux->dst.rt_next;
                        rt_free(aux);
@@ -1350,7 +1362,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 {
        int i, k;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
-       struct rtable *rth, **rthp;
+       struct rtable *rth;
+       struct rtable __rcu **rthp;
        __be32  skeys[2] = { saddr, 0 };
        int  ikeys[2] = { dev->ifindex, 0 };
        struct netevent_redirect netevent;
@@ -1383,7 +1396,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
                        unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
                                                rt_genid(net));
 
-                       rthp=&rt_hash_table[hash].chain;
+                       rthp = &rt_hash_table[hash].chain;
 
                        while ((rth = rcu_dereference(*rthp)) != NULL) {
                                struct rtable *rt;
@@ -1779,12 +1792,15 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
 
        if (rt->fl.iif == 0)
                src = rt->rt_src;
-       else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) {
-               src = FIB_RES_PREFSRC(res);
-               fib_res_put(&res);
-       } else
-               src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
+       else {
+               rcu_read_lock();
+               if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
+                       src = FIB_RES_PREFSRC(res);
+               else
+                       src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
                                        RT_SCOPE_UNIVERSE);
+               rcu_read_unlock();
+       }
        memcpy(addr, &src, 4);
 }
 
@@ -2087,6 +2103,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
  *     Such approach solves two big problems:
  *     1. Not simplex devices are handled properly.
  *     2. IP spoofing attempts are filtered with 100% of guarantee.
+ *     called with rcu_read_lock()
  */
 
 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2108,7 +2125,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
        unsigned        hash;
        __be32          spec_dst;
        int             err = -EINVAL;
-       int             free_res = 0;
        struct net    * net = dev_net(dev);
 
        /* IP on this device is disabled. */
@@ -2124,7 +2140,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
            ipv4_is_loopback(saddr))
                goto martian_source;
 
-       if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
+       if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
                goto brd_input;
 
        /* Accept zero addresses only to limited broadcast;
@@ -2133,19 +2149,18 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
        if (ipv4_is_zeronet(saddr))
                goto martian_source;
 
-       if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
-           ipv4_is_loopback(daddr))
+       if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
                goto martian_destination;
 
        /*
         *      Now we are ready to route packet.
         */
-       if ((err = fib_lookup(net, &fl, &res)) != 0) {
+       err = fib_lookup(net, &fl, &res);
+       if (err != 0) {
                if (!IN_DEV_FORWARD(in_dev))
                        goto e_hostunreach;
                goto no_route;
        }
-       free_res = 1;
 
        RT_CACHE_STAT_INC(in_slow_tot);
 
@@ -2154,8 +2169,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 
        if (res.type == RTN_LOCAL) {
                err = fib_validate_source(saddr, daddr, tos,
-                                            net->loopback_dev->ifindex,
-                                            dev, &spec_dst, &itag, skb->mark);
+                                         net->loopback_dev->ifindex,
+                                         dev, &spec_dst, &itag, skb->mark);
                if (err < 0)
                        goto martian_source_keep_err;
                if (err)
@@ -2170,9 +2185,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                goto martian_destination;
 
        err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
-done:
-       if (free_res)
-               fib_res_put(&res);
 out:   return err;
 
 brd_input:
@@ -2232,7 +2244,7 @@ local_input:
        rth->rt_type    = res.type;
        hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
        err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
-       goto done;
+       goto out;
 
 no_route:
        RT_CACHE_STAT_INC(in_no_route);
@@ -2255,21 +2267,21 @@ martian_destination:
 
 e_hostunreach:
        err = -EHOSTUNREACH;
-       goto done;
+       goto out;
 
 e_inval:
        err = -EINVAL;
-       goto done;
+       goto out;
 
 e_nobufs:
        err = -ENOBUFS;
-       goto done;
+       goto out;
 
 martian_source:
        err = -EINVAL;
 martian_source_keep_err:
        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
-       goto done;
+       goto out;
 }
 
 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2355,6 +2367,7 @@ skip_cache:
 }
 EXPORT_SYMBOL(ip_route_input_common);
 
+/* called with rcu_read_lock() */
 static int __mkroute_output(struct rtable **result,
                            struct fib_result *res,
                            const struct flowi *fl,
@@ -2365,53 +2378,47 @@ static int __mkroute_output(struct rtable **result,
        struct rtable *rth;
        struct in_device *in_dev;
        u32 tos = RT_FL_TOS(oldflp);
-       int err = 0;
 
-       if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
+       if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
                return -EINVAL;
 
-       if (fl->fl4_dst == htonl(0xFFFFFFFF))
+       if (ipv4_is_lbcast(fl->fl4_dst))
                res->type = RTN_BROADCAST;
        else if (ipv4_is_multicast(fl->fl4_dst))
                res->type = RTN_MULTICAST;
-       else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
+       else if (ipv4_is_zeronet(fl->fl4_dst))
                return -EINVAL;
 
        if (dev_out->flags & IFF_LOOPBACK)
                flags |= RTCF_LOCAL;
 
-       /* get work reference to inet device */
-       in_dev = in_dev_get(dev_out);
+       in_dev = __in_dev_get_rcu(dev_out);
        if (!in_dev)
                return -EINVAL;
 
        if (res->type == RTN_BROADCAST) {
                flags |= RTCF_BROADCAST | RTCF_LOCAL;
-               if (res->fi) {
-                       fib_info_put(res->fi);
-                       res->fi = NULL;
-               }
+               res->fi = NULL;
        } else if (res->type == RTN_MULTICAST) {
-               flags |= RTCF_MULTICAST|RTCF_LOCAL;
+               flags |= RTCF_MULTICAST | RTCF_LOCAL;
                if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
                                 oldflp->proto))
                        flags &= ~RTCF_LOCAL;
                /* If multicast route do not exist use
-                  default one, but do not gateway in this case.
-                  Yes, it is hack.
+                * default one, but do not gateway in this case.
+                * Yes, it is hack.
                 */
-               if (res->fi && res->prefixlen < 4) {
-                       fib_info_put(res->fi);
+               if (res->fi && res->prefixlen < 4)
                        res->fi = NULL;
-               }
        }
 
 
        rth = dst_alloc(&ipv4_dst_ops);
-       if (!rth) {
-               err = -ENOBUFS;
-               goto cleanup;
-       }
+       if (!rth)
+               return -ENOBUFS;
+
+       in_dev_hold(in_dev);
+       rth->idev = in_dev;
 
        atomic_set(&rth->dst.__refcnt, 1);
        rth->dst.flags= DST_HOST;
@@ -2432,7 +2439,6 @@ static int __mkroute_output(struct rtable **result,
           cache entry */
        rth->dst.dev    = dev_out;
        dev_hold(dev_out);
-       rth->idev       = in_dev_get(dev_out);
        rth->rt_gateway = fl->fl4_dst;
        rth->rt_spec_dst= fl->fl4_src;
 
@@ -2467,15 +2473,11 @@ static int __mkroute_output(struct rtable **result,
        rt_set_nexthop(rth, res, 0);
 
        rth->rt_flags = flags;
-
        *result = rth;
- cleanup:
-       /* release work reference to inet device */
-       in_dev_put(in_dev);
-
-       return err;
+       return 0;
 }
 
+/* called with rcu_read_lock() */
 static int ip_mkroute_output(struct rtable **rp,
                             struct fib_result *res,
                             const struct flowi *fl,
@@ -2497,6 +2499,7 @@ static int ip_mkroute_output(struct rtable **rp,
 
 /*
  * Major route resolver routine.
+ * called with rcu_read_lock();
  */
 
 static int ip_route_output_slow(struct net *net, struct rtable **rp,
@@ -2515,9 +2518,8 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
                            .iif = net->loopback_dev->ifindex,
                            .oif = oldflp->oif };
        struct fib_result res;
-       unsigned flags = 0;
+       unsigned int flags = 0;
        struct net_device *dev_out = NULL;
-       int free_res = 0;
        int err;
 
 
@@ -2543,9 +2545,9 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 
                if (oldflp->oif == 0 &&
                    (ipv4_is_multicast(oldflp->fl4_dst) ||
-                    oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
+                    ipv4_is_lbcast(oldflp->fl4_dst))) {
                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
-                       dev_out = ip_dev_find(net, oldflp->fl4_src);
+                       dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
                        if (dev_out == NULL)
                                goto out;
 
@@ -2570,29 +2572,24 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 
                if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
-                       dev_out = ip_dev_find(net, oldflp->fl4_src);
-                       if (dev_out == NULL)
+                       if (!__ip_dev_find(net, oldflp->fl4_src, false))
                                goto out;
-                       dev_put(dev_out);
-                       dev_out = NULL;
                }
        }
 
 
        if (oldflp->oif) {
-               dev_out = dev_get_by_index(net, oldflp->oif);
+               dev_out = dev_get_by_index_rcu(net, oldflp->oif);
                err = -ENODEV;
                if (dev_out == NULL)
                        goto out;
 
                /* RACE: Check return value of inet_select_addr instead. */
-               if (__in_dev_get_rtnl(dev_out) == NULL) {
-                       dev_put(dev_out);
+               if (rcu_dereference(dev_out->ip_ptr) == NULL)
                        goto out;       /* Wrong error code */
-               }
 
                if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
-                   oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
+                   ipv4_is_lbcast(oldflp->fl4_dst)) {
                        if (!fl.fl4_src)
                                fl.fl4_src = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_LINK);
@@ -2612,10 +2609,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
                fl.fl4_dst = fl.fl4_src;
                if (!fl.fl4_dst)
                        fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
-               if (dev_out)
-                       dev_put(dev_out);
                dev_out = net->loopback_dev;
-               dev_hold(dev_out);
                fl.oif = net->loopback_dev->ifindex;
                res.type = RTN_LOCAL;
                flags |= RTCF_LOCAL;
@@ -2649,23 +2643,15 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
                        res.type = RTN_UNICAST;
                        goto make_route;
                }
-               if (dev_out)
-                       dev_put(dev_out);
                err = -ENETUNREACH;
                goto out;
        }
-       free_res = 1;
 
        if (res.type == RTN_LOCAL) {
                if (!fl.fl4_src)
                        fl.fl4_src = fl.fl4_dst;
-               if (dev_out)
-                       dev_put(dev_out);
                dev_out = net->loopback_dev;
-               dev_hold(dev_out);
                fl.oif = dev_out->ifindex;
-               if (res.fi)
-                       fib_info_put(res.fi);
                res.fi = NULL;
                flags |= RTCF_LOCAL;
                goto make_route;
@@ -2682,28 +2668,21 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
        if (!fl.fl4_src)
                fl.fl4_src = FIB_RES_PREFSRC(res);
 
-       if (dev_out)
-               dev_put(dev_out);
        dev_out = FIB_RES_DEV(res);
-       dev_hold(dev_out);
        fl.oif = dev_out->ifindex;
 
 
 make_route:
        err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
 
-
-       if (free_res)
-               fib_res_put(&res);
-       if (dev_out)
-               dev_put(dev_out);
 out:   return err;
 }
 
 int __ip_route_output_key(struct net *net, struct rtable **rp,
                          const struct flowi *flp)
 {
-       unsigned hash;
+       unsigned int hash;
+       int res;
        struct rtable *rth;
 
        if (!rt_caching(net))
@@ -2734,7 +2713,10 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
        rcu_read_unlock_bh();
 
 slow_output:
-       return ip_route_output_slow(net, rp, flp);
+       rcu_read_lock();
+       res = ip_route_output_slow(net, rp, flp);
+       rcu_read_unlock();
+       return res;
 }
 EXPORT_SYMBOL_GPL(__ip_route_output_key);
 
@@ -2753,7 +2735,6 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
        .destroy                =       ipv4_dst_destroy,
        .check                  =       ipv4_blackhole_dst_check,
        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
-       .entries                =       ATOMIC_INIT(0),
 };
 
 
@@ -2798,7 +2779,7 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
 
        dst_release(&(*rp)->dst);
        *rp = rt;
-       return (rt ? 0 : -ENOMEM);
+       return rt ? 0 : -ENOMEM;
 }
 
 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
@@ -3323,6 +3304,12 @@ int __init ip_rt_init(void)
 
        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
 
+       if (dst_entries_init(&ipv4_dst_ops) < 0)
+               panic("IP: failed to allocate ipv4_dst_ops counter\n");
+
+       if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
+               panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
+
        rt_hash_table = (struct rt_hash_bucket *)
                alloc_large_system_hash("IP route cache",
                                        sizeof(struct rt_hash_bucket),