memcg: fix thresholds with use_hierarchy == 1
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0576e9e64586df7dd0aa0d182966c32df92384c2..9be3cf8a5da462d4b1b4103eef61f8d5a9a6e06c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -47,6 +47,7 @@
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
 #include <linux/cpu.h>
+#include <linux/oom.h>
 #include "internal.h"
 
 #include <asm/uaccess.h>
@@ -268,6 +269,7 @@ enum move_type {
 
 /* "mc" and its members are protected by cgroup_mutex */
 static struct move_charge_struct {
+       spinlock_t        lock; /* for from, to, moving_task */
        struct mem_cgroup *from;
        struct mem_cgroup *to;
        unsigned long precharge;
@@ -276,6 +278,7 @@ static struct move_charge_struct {
        struct task_struct *moving_task;        /* a task moving charges */
        wait_queue_head_t waitq;                /* a waitq for other context */
 } mc = {
+       .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
        .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 };
 
@@ -836,12 +839,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 {
        int ret;
        struct mem_cgroup *curr = NULL;
+       struct task_struct *p;
 
-       task_lock(task);
-       rcu_read_lock();
-       curr = try_get_mem_cgroup_from_mm(task->mm);
-       rcu_read_unlock();
-       task_unlock(task);
+       p = find_lock_task_mm(task);
+       if (!p)
+               return 0;
+       curr = try_get_mem_cgroup_from_mm(p->mm);
+       task_unlock(p);
        if (!curr)
                return 0;
        /*
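
The rewritten task_in_mem_cgroup() above relies on find_lock_task_mm(), which is why <linux/oom.h> is now included at the top of the file. The helper returns a thread of the given process that still owns an mm, with task_lock() held on success. A minimal sketch of its behaviour, for illustration only (the authoritative version lives in mm/oom_kill.c, and the _sketch name is not in the kernel):

/* Illustrative sketch: return a locked thread with a valid ->mm, or NULL. */
static struct task_struct *find_lock_task_mm_sketch(struct task_struct *p)
{
        struct task_struct *t = p;

        do {
                task_lock(t);
                if (likely(t->mm))
                        return t;       /* caller must task_unlock(t) */
                task_unlock(t);
        } while_each_thread(p, t);

        return NULL;
}
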
@@ -915,7 +919,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
                                       struct zone *zone,
                                       enum lru_list lru)
 {
-       int nid = zone->zone_pgdat->node_id;
+       int nid = zone_to_nid(zone);
        int zid = zone_idx(zone);
        struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 
@@ -925,7 +929,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
                                                      struct zone *zone)
 {
-       int nid = zone->zone_pgdat->node_id;
+       int nid = zone_to_nid(zone);
        int zid = zone_idx(zone);
        struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 
@@ -970,7 +974,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
        LIST_HEAD(pc_list);
        struct list_head *src;
        struct page_cgroup *pc, *tmp;
-       int nid = z->zone_pgdat->node_id;
+       int nid = zone_to_nid(z);
        int zid = zone_idx(z);
        struct mem_cgroup_per_zone *mz;
        int lru = LRU_FILE * file + active;
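
The three hunks above replace the open-coded zone->zone_pgdat->node_id with zone_to_nid(). The result is the same node id; a minimal sketch of the equivalence (illustration only, with a hypothetical _sketch name; the real helper is defined in <linux/mm.h> and also covers the !CONFIG_NUMA case):

/* Illustration of what the converted call sites compute. */
static inline int zone_to_nid_sketch(struct zone *zone)
{
        return zone->zone_pgdat->node_id;       /* node owning this zone */
}
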
@@ -1047,6 +1051,47 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
        return swappiness;
 }
 
+/* A routine for checking whether "mem" is part of an ongoing move_account() */
+
+static bool mem_cgroup_under_move(struct mem_cgroup *mem)
+{
+       struct mem_cgroup *from;
+       struct mem_cgroup *to;
+       bool ret = false;
+       /*
+        * Unlike the task-move routines, we access mc.to and mc.from without
+        * the mutual exclusion of cgroup_mutex. Take the spinlock instead.
+        */
+       spin_lock(&mc.lock);
+       from = mc.from;
+       to = mc.to;
+       if (!from)
+               goto unlock;
+       if (from == mem || to == mem
+           || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
+           || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
+               ret = true;
+unlock:
+       spin_unlock(&mc.lock);
+       return ret;
+}
+
+static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
+{
+       if (mc.moving_task && current != mc.moving_task) {
+               if (mem_cgroup_under_move(mem)) {
+                       DEFINE_WAIT(wait);
+                       prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
+                       /* moving charge context might have finished. */
+                       if (mc.moving_task)
+                               schedule();
+                       finish_wait(&mc.waitq, &wait);
+                       return true;
+               }
+       }
+       return false;
+}
+
 static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
 {
        int *val = data;
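
mem_cgroup_wait_acct_move() above is the sleeping half of a handshake whose waking half is the wake_up_all(&mc.waitq) added to mem_cgroup_clear_mc() near the end of this patch. Reduced to a stand-alone sketch with hypothetical example_* names (not in the kernel), the pattern is:

static DECLARE_WAIT_QUEUE_HEAD(example_waitq);          /* stands in for mc.waitq */
static struct task_struct *example_moving_task;         /* stands in for mc.moving_task */

/* Charging side: sleep until the mover announces completion. */
static void example_wait_for_move(void)
{
        DEFINE_WAIT(wait);

        prepare_to_wait(&example_waitq, &wait, TASK_INTERRUPTIBLE);
        /* Re-check after queueing ourselves: the move may already be done. */
        if (example_moving_task)
                schedule();
        finish_wait(&example_waitq, &wait);
}

/* Moving side: clear the state, then wake every waiter. */
static void example_finish_move(void)
{
        example_moving_task = NULL;
        wake_up_all(&example_waitq);
}
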
@@ -1255,8 +1300,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
                /* we use swappiness of local cgroup */
                if (check_soft)
                        ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
-                               noswap, get_swappiness(victim), zone,
-                               zone->zone_pgdat->node_id);
+                               noswap, get_swappiness(victim), zone);
                else
                        ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
                                                noswap, get_swappiness(victim));
@@ -1363,7 +1407,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
 
 static void memcg_oom_recover(struct mem_cgroup *mem)
 {
-       if (atomic_read(&mem->oom_lock))
+       if (mem && atomic_read(&mem->oom_lock))
                memcg_wakeup_oom(mem);
 }
 
@@ -1575,16 +1619,83 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
        return NOTIFY_OK;
 }
 
+
+/* See __mem_cgroup_try_charge() for details */
+enum {
+       CHARGE_OK,              /* success */
+       CHARGE_RETRY,           /* need to retry but retry is not bad */
+       CHARGE_NOMEM,           /* we can't do more. return -ENOMEM */
+       CHARGE_WOULDBLOCK,      /* __GFP_WAIT wasn't set and not enough res. */
+       CHARGE_OOM_DIE,         /* the current is killed because of OOM */
+};
+
+static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
+                               int csize, bool oom_check)
+{
+       struct mem_cgroup *mem_over_limit;
+       struct res_counter *fail_res;
+       unsigned long flags = 0;
+       int ret;
+
+       ret = res_counter_charge(&mem->res, csize, &fail_res);
+
+       if (likely(!ret)) {
+               if (!do_swap_account)
+                       return CHARGE_OK;
+               ret = res_counter_charge(&mem->memsw, csize, &fail_res);
+               if (likely(!ret))
+                       return CHARGE_OK;
+
+               mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
+               flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+       } else
+               mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+
+       if (csize > PAGE_SIZE) /* change csize and retry */
+               return CHARGE_RETRY;
+
+       if (!(gfp_mask & __GFP_WAIT))
+               return CHARGE_WOULDBLOCK;
+
+       ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+                                       gfp_mask, flags);
+       /*
+        * try_to_free_mem_cgroup_pages() might not give us a full
+        * picture of reclaim. Some pages are reclaimed and might be
+        * moved to swap cache or just unmapped from the cgroup.
+        * Check the limit again to see if the reclaim reduced the
+        * current usage of the cgroup before giving up
+        */
+       if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+               return CHARGE_RETRY;
+
+       /*
+        * During task move, charges can be accounted twice. So it's better
+        * to wait until the end of task_move if one is in progress.
+        */
+       if (mem_cgroup_wait_acct_move(mem_over_limit))
+               return CHARGE_RETRY;
+
+       /* If we don't need to call the oom-killer at all, return immediately */
+       if (!oom_check)
+               return CHARGE_NOMEM;
+       /* check OOM */
+       if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
+               return CHARGE_OOM_DIE;
+
+       return CHARGE_RETRY;
+}
+
 /*
  * Unlike exported interface, "oom" parameter is added. if oom==true,
  * oom-killer can be invoked.
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
-                       gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
+               gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
 {
-       struct mem_cgroup *mem, *mem_over_limit;
-       int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-       struct res_counter *fail_res;
+       int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+       struct mem_cgroup *mem = NULL;
+       int ret;
        int csize = CHARGE_SIZE;
 
        /*
@@ -1602,126 +1713,108 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
         * thread group leader migrates. It's possible that mm is not
         * set, if so charge the init_mm (happens for pagecache usage).
         */
-       mem = *memcg;
-       if (likely(!mem)) {
-               mem = try_get_mem_cgroup_from_mm(mm);
-               *memcg = mem;
-       } else {
-               css_get(&mem->css);
-       }
-       if (unlikely(!mem))
-               return 0;
-
-       VM_BUG_ON(css_is_removed(&mem->css));
-       if (mem_cgroup_is_root(mem))
-               goto done;
-
-       while (1) {
-               int ret = 0;
-               unsigned long flags = 0;
-
+       if (!*memcg && !mm)
+               goto bypass;
+again:
+       if (*memcg) { /* css should be a valid one */
+               mem = *memcg;
+               VM_BUG_ON(css_is_removed(&mem->css));
+               if (mem_cgroup_is_root(mem))
+                       goto done;
                if (consume_stock(mem))
                        goto done;
+               css_get(&mem->css);
+       } else {
+               struct task_struct *p;
 
-               ret = res_counter_charge(&mem->res, csize, &fail_res);
-               if (likely(!ret)) {
-                       if (!do_swap_account)
-                               break;
-                       ret = res_counter_charge(&mem->memsw, csize, &fail_res);
-                       if (likely(!ret))
-                               break;
-                       /* mem+swap counter fails */
-                       res_counter_uncharge(&mem->res, csize);
-                       flags |= MEM_CGROUP_RECLAIM_NOSWAP;
-                       mem_over_limit = mem_cgroup_from_res_counter(fail_res,
-                                                                       memsw);
-               } else
-                       /* mem counter fails */
-                       mem_over_limit = mem_cgroup_from_res_counter(fail_res,
-                                                                       res);
-
-               /* reduce request size and retry */
-               if (csize > PAGE_SIZE) {
-                       csize = PAGE_SIZE;
-                       continue;
-               }
-               if (!(gfp_mask & __GFP_WAIT))
-                       goto nomem;
-
-               ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
-                                               gfp_mask, flags);
-               if (ret)
-                       continue;
-
+               rcu_read_lock();
+               p = rcu_dereference(mm->owner);
+               VM_BUG_ON(!p);
                /*
-                * try_to_free_mem_cgroup_pages() might not give us a full
-                * picture of reclaim. Some pages are reclaimed and might be
-                * moved to swap cache or just unmapped from the cgroup.
-                * Check the limit again to see if the reclaim reduced the
-                * current usage of the cgroup before giving up
-                *
+                * Because we don't hold task_lock(), "p" can exit while
+                * we're here. In that case, "mem" can point to the root
+                * cgroup but is never NULL (task_struct is freed by RCU
+                * and the cgroup itself is RCU safe), so there is a small
+                * risk of picking the wrong cgroup here. Such mis-accounting
+                * races can always happen without cgroup_mutex(); avoiding
+                * this one would be overkill, so we accept the small race.
                 */
-               if (mem_cgroup_check_under_limit(mem_over_limit))
-                       continue;
-
-               /* try to avoid oom while someone is moving charge */
-               if (mc.moving_task && current != mc.moving_task) {
-                       struct mem_cgroup *from, *to;
-                       bool do_continue = false;
+               mem = mem_cgroup_from_task(p);
+               VM_BUG_ON(!mem);
+               if (mem_cgroup_is_root(mem)) {
+                       rcu_read_unlock();
+                       goto done;
+               }
+               if (consume_stock(mem)) {
                        /*
-                        * There is a small race that "from" or "to" can be
-                        * freed by rmdir, so we use css_tryget().
+                        * It seems dangerous to access memcg without css_get().
+                        * But considering how consume_stock works, it's not
+                        * necessary. If consume_stock succeeds, some charges
+                        * from this memcg are cached on this cpu. So, we
+                        * don't need to call css_get()/css_tryget() before
+                        * calling consume_stock().
                         */
-                       from = mc.from;
-                       to = mc.to;
-                       if (from && css_tryget(&from->css)) {
-                               if (mem_over_limit->use_hierarchy)
-                                       do_continue = css_is_ancestor(
-                                                       &from->css,
-                                                       &mem_over_limit->css);
-                               else
-                                       do_continue = (from == mem_over_limit);
-                               css_put(&from->css);
-                       }
-                       if (!do_continue && to && css_tryget(&to->css)) {
-                               if (mem_over_limit->use_hierarchy)
-                                       do_continue = css_is_ancestor(
-                                                       &to->css,
-                                                       &mem_over_limit->css);
-                               else
-                                       do_continue = (to == mem_over_limit);
-                               css_put(&to->css);
-                       }
-                       if (do_continue) {
-                               DEFINE_WAIT(wait);
-                               prepare_to_wait(&mc.waitq, &wait,
-                                                       TASK_INTERRUPTIBLE);
-                               /* moving charge context might have finished. */
-                               if (mc.moving_task)
-                                       schedule();
-                               finish_wait(&mc.waitq, &wait);
-                               continue;
-                       }
+                       rcu_read_unlock();
+                       goto done;
+               }
+               /* after here, we may be blocked. we need to get refcnt */
+               if (!css_tryget(&mem->css)) {
+                       rcu_read_unlock();
+                       goto again;
                }
+               rcu_read_unlock();
+       }
 
-               if (!nr_retries--) {
-                       if (!oom)
+       do {
+               bool oom_check;
+
+               /* If killed, bypass charge */
+               if (fatal_signal_pending(current)) {
+                       css_put(&mem->css);
+                       goto bypass;
+               }
+
+               oom_check = false;
+               if (oom && !nr_oom_retries) {
+                       oom_check = true;
+                       nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+               }
+
+               ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
+
+               switch (ret) {
+               case CHARGE_OK:
+                       break;
+               case CHARGE_RETRY: /* not in OOM situation but retry */
+                       csize = PAGE_SIZE;
+                       css_put(&mem->css);
+                       mem = NULL;
+                       goto again;
+               case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
+                       css_put(&mem->css);
+                       goto nomem;
+               case CHARGE_NOMEM: /* OOM routine works */
+                       if (!oom) {
+                               css_put(&mem->css);
                                goto nomem;
-                       if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
-                               nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-                               continue;
                        }
-                       /* When we reach here, current task is dying .*/
+                       /* If oom, we never return -ENOMEM */
+                       nr_oom_retries--;
+                       break;
+               case CHARGE_OOM_DIE: /* Killed by OOM Killer */
                        css_put(&mem->css);
                        goto bypass;
                }
-       }
+       } while (ret != CHARGE_OK);
+
        if (csize > PAGE_SIZE)
                refill_stock(mem, csize - PAGE_SIZE);
+       css_put(&mem->css);
 done:
+       *memcg = mem;
        return 0;
 nomem:
-       css_put(&mem->css);
+       *memcg = NULL;
        return -ENOMEM;
 bypass:
        *memcg = NULL;
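
To follow the rewritten __mem_cgroup_try_charge(): it is now a retry loop around __mem_cgroup_do_charge(). The first attempt asks for a whole CHARGE_SIZE batch; a CHARGE_RETRY answer drops csize to PAGE_SIZE, and once the charge succeeds the unused part of the batch is parked in the per-cpu stock via refill_stock() so later charges can be served by consume_stock() without touching the res_counter. A condensed sketch of that loop (css refcounting, the mm->owner lookup and the bypass paths are deliberately left out; try_charge_sketch is a hypothetical name):

/* Condensed sketch of the retry loop; not the full function above. */
static int try_charge_sketch(struct mem_cgroup *mem, gfp_t gfp_mask, bool oom)
{
        int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
        int csize = CHARGE_SIZE;                /* try a whole batch first */
        int ret;

        do {
                bool oom_check = oom && !nr_oom_retries;

                if (oom_check)
                        nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;

                ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
                switch (ret) {
                case CHARGE_RETRY:              /* shrink the request and retry */
                        csize = PAGE_SIZE;
                        break;
                case CHARGE_WOULDBLOCK:         /* caller may not sleep */
                        return -ENOMEM;
                case CHARGE_NOMEM:              /* reclaim failed */
                        if (!oom)
                                return -ENOMEM;
                        nr_oom_retries--;       /* OOM-kill after enough failures */
                        break;
                case CHARGE_OOM_DIE:            /* current was killed; bypass */
                        return 0;
                }
        } while (ret != CHARGE_OK);

        if (csize > PAGE_SIZE)                  /* park the surplus per-cpu */
                refill_stock(mem, csize - PAGE_SIZE);
        return 0;
}
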
@@ -1740,11 +1833,7 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
                res_counter_uncharge(&mem->res, PAGE_SIZE * count);
                if (do_swap_account)
                        res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
-               VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
-               WARN_ON_ONCE(count > INT_MAX);
-               __css_put(&mem->css, (int)count);
        }
-       /* we don't need css_put for root */
 }
 
 static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
@@ -1972,10 +2061,9 @@ out:
  * < 0 if the cgroup is over its limit
  */
 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
-                               gfp_t gfp_mask, enum charge_type ctype,
-                               struct mem_cgroup *memcg)
+                               gfp_t gfp_mask, enum charge_type ctype)
 {
-       struct mem_cgroup *mem;
+       struct mem_cgroup *mem = NULL;
        struct page_cgroup *pc;
        int ret;
 
@@ -1985,7 +2073,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
                return 0;
        prefetchw(pc);
 
-       mem = memcg;
        ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
        if (ret || !mem)
                return ret;
@@ -2013,7 +2100,7 @@ int mem_cgroup_newpage_charge(struct page *page,
        if (unlikely(!mm))
                mm = &init_mm;
        return mem_cgroup_charge_common(page, mm, gfp_mask,
-                               MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
+                               MEM_CGROUP_CHARGE_TYPE_MAPPED);
 }
 
 static void
@@ -2023,7 +2110,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
                                gfp_t gfp_mask)
 {
-       struct mem_cgroup *mem = NULL;
        int ret;
 
        if (mem_cgroup_disabled())
@@ -2044,7 +2130,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
        if (!(gfp_mask & __GFP_WAIT)) {
                struct page_cgroup *pc;
 
-
                pc = lookup_page_cgroup(page);
                if (!pc)
                        return 0;
@@ -2056,22 +2141,24 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
                unlock_page_cgroup(pc);
        }
 
-       if (unlikely(!mm && !mem))
+       if (unlikely(!mm))
                mm = &init_mm;
 
        if (page_is_file_cache(page))
                return mem_cgroup_charge_common(page, mm, gfp_mask,
-                               MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
+                               MEM_CGROUP_CHARGE_TYPE_CACHE);
 
        /* shmem */
        if (PageSwapCache(page)) {
+               struct mem_cgroup *mem = NULL;
+
                ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
                if (!ret)
                        __mem_cgroup_commit_charge_swapin(page, mem,
                                        MEM_CGROUP_CHARGE_TYPE_SHMEM);
        } else
                ret = mem_cgroup_charge_common(page, mm, gfp_mask,
-                                       MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
+                                       MEM_CGROUP_CHARGE_TYPE_SHMEM);
 
        return ret;
 }
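
With the struct mem_cgroup * argument dropped from mem_cgroup_charge_common(), callers no longer pre-resolve a target cgroup; __mem_cgroup_try_charge() derives it from mm->owner itself. A hypothetical new charge site (example_* is not in the kernel) would therefore mirror the callers above:

/* Hypothetical caller, mirroring mem_cgroup_cache_charge() above. */
static int example_charge_file_page(struct page *page, struct mm_struct *mm,
                                    gfp_t gfp_mask)
{
        if (unlikely(!mm))
                mm = &init_mm;  /* kernel-internal page cache users */

        return mem_cgroup_charge_common(page, mm, gfp_mask,
                                        MEM_CGROUP_CHARGE_TYPE_CACHE);
}
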
@@ -2107,7 +2194,6 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
                goto charge_cur_mm;
        *ptr = mem;
        ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
-       /* drop extra refcnt from tryget */
        css_put(&mem->css);
        return ret;
 charge_cur_mm:
@@ -2238,7 +2324,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 {
        struct page_cgroup *pc;
        struct mem_cgroup *mem = NULL;
-       struct mem_cgroup_per_zone *mz;
 
        if (mem_cgroup_disabled())
                return NULL;
@@ -2278,10 +2363,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
                break;
        }
 
-       if (!mem_cgroup_is_root(mem))
-               __do_uncharge(mem, ctype);
-       if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-               mem_cgroup_swap_statistics(mem, true);
        mem_cgroup_charge_statistics(mem, pc, false);
 
        ClearPageCgroupUsed(pc);
@@ -2292,13 +2373,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
         * special functions.
         */
 
-       mz = page_cgroup_zoneinfo(pc);
        unlock_page_cgroup(pc);
-
+       /*
+        * Even after unlock, mem->res.usage still holds this charge, so
+        * this memcg cannot be freed under us.
+        */
        memcg_check_events(mem, page);
-       /* at swapout, this memcg will be accessed to record to swap */
-       if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-               css_put(&mem->css);
+       if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
+               mem_cgroup_swap_statistics(mem, true);
+               mem_cgroup_get(mem);
+       }
+       if (!mem_cgroup_is_root(mem))
+               __do_uncharge(mem, ctype);
 
        return mem;
 
@@ -2385,13 +2471,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 
        memcg = __mem_cgroup_uncharge_common(page, ctype);
 
-       /* record memcg information */
-       if (do_swap_account && swapout && memcg) {
+       /*
+        * Record memcg information. If swapout && memcg != NULL,
+        * mem_cgroup_get() was already called in uncharge().
+        */
+       if (do_swap_account && swapout && memcg)
                swap_cgroup_record(ent, css_id(&memcg->css));
-               mem_cgroup_get(memcg);
-       }
-       if (swapout && memcg)
-               css_put(&memcg->css);
 }
 #endif
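
The css_put()-based pinning at swapout is replaced by a mem_cgroup_get()/mem_cgroup_put() pair: uncharge now takes the reference and the css_id is recorded in the swap_cgroup map, while the reference is dropped only when the swap entry itself is released. That release side is not part of this patch; the following is a rough sketch of it, loosely modeled on mem_cgroup_uncharge_swap() elsewhere in this file (treat the details as an assumption, not a quotation):

/* Rough sketch of the swap-entry release that pairs with the
 * mem_cgroup_get() taken in __mem_cgroup_uncharge_common() above. */
static void example_uncharge_swap(swp_entry_t ent)
{
        struct mem_cgroup *memcg;
        unsigned short id;

        if (!do_swap_account)
                return;

        id = swap_cgroup_record(ent, 0);        /* clear record, fetch old css_id */
        rcu_read_lock();
        memcg = mem_cgroup_lookup(id);
        if (memcg) {
                if (!mem_cgroup_is_root(memcg))
                        res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
                mem_cgroup_swap_statistics(memcg, false);
                mem_cgroup_put(memcg);          /* drop the swapout reference */
        }
        rcu_read_unlock();
}
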
 
@@ -2469,7 +2554,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
                         */
                        if (!mem_cgroup_is_root(to))
                                res_counter_uncharge(&to->res, PAGE_SIZE);
-                       css_put(&to->css);
                }
                return 0;
        }
@@ -2604,11 +2688,8 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
        ClearPageCgroupMigration(pc);
        unlock_page_cgroup(pc);
 
-       if (unused != oldpage)
-               pc = lookup_page_cgroup(unused);
        __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
 
-       pc = lookup_page_cgroup(used);
        /*
         * If a page is a file cache, radix-tree replacement is very atomic
         * and we can skip this check. When it was an Anon page, its mapcount
@@ -2784,8 +2865,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 }
 
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
-                                               gfp_t gfp_mask, int nid,
-                                               int zid)
+                                           gfp_t gfp_mask)
 {
        unsigned long nr_reclaimed = 0;
        struct mem_cgroup_per_zone *mz, *next_mz = NULL;
@@ -2797,7 +2877,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
        if (order > 0)
                return 0;
 
-       mctz = soft_limit_tree_node_zone(nid, zid);
+       mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
        /*
         * This loop can run a while, specially if mem_cgroup's continuously
         * keep exceeding their soft limit and putting the system under
@@ -3507,9 +3587,13 @@ unlock:
 
 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
 {
-       __mem_cgroup_threshold(memcg, false);
-       if (do_swap_account)
-               __mem_cgroup_threshold(memcg, true);
+       while (memcg) {
+               __mem_cgroup_threshold(memcg, false);
+               if (do_swap_account)
+                       __mem_cgroup_threshold(memcg, true);
+
+               memcg = parent_mem_cgroup(memcg);
+       }
 }
 
 static int compare_thresholds(const void *a, const void *b)
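
This hunk is the fix named in the subject line: with use_hierarchy == 1 a charge in a child also raises the usage of every ancestor's res_counter, so threshold events must be evaluated for the whole ancestry, not only for the cgroup that was charged. The loop terminates because parent_mem_cgroup() returns NULL at the top of the hierarchy; roughly (a sketch of the helper defined elsewhere in this file, under a hypothetical _sketch name):

/* Sketch of the parent walk used by mem_cgroup_threshold() above. */
static struct mem_cgroup *parent_mem_cgroup_sketch(struct mem_cgroup *mem)
{
        if (!mem->res.parent)           /* top of the hierarchy */
                return NULL;
        return mem_cgroup_from_res_counter(mem->res.parent, res);
}
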
@@ -3752,8 +3836,6 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
        return 0;
 }
 
-/*
- */
 static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
        struct cftype *cft, u64 val)
 {
@@ -4173,9 +4255,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
                        goto one_by_one;
                }
                mc.precharge += count;
-               VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
-               WARN_ON_ONCE(count > INT_MAX);
-               __css_get(&mem->css, (int)count);
                return ret;
        }
 one_by_one:
@@ -4393,11 +4472,13 @@ static int mem_cgroup_precharge_mc(struct mm_struct *mm)
 
 static void mem_cgroup_clear_mc(void)
 {
+       struct mem_cgroup *from = mc.from;
+       struct mem_cgroup *to = mc.to;
+
        /* we must uncharge all the leftover precharges from mc.to */
        if (mc.precharge) {
                __mem_cgroup_cancel_charge(mc.to, mc.precharge);
                mc.precharge = 0;
-               memcg_oom_recover(mc.to);
        }
        /*
         * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
@@ -4406,11 +4487,9 @@ static void mem_cgroup_clear_mc(void)
        if (mc.moved_charge) {
                __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
                mc.moved_charge = 0;
-               memcg_oom_recover(mc.from);
        }
        /* we must fixup refcnts and charges */
        if (mc.moved_swap) {
-               WARN_ON_ONCE(mc.moved_swap > INT_MAX);
                /* uncharge swap account from the old cgroup */
                if (!mem_cgroup_is_root(mc.from))
                        res_counter_uncharge(&mc.from->memsw,
@@ -4424,16 +4503,18 @@ static void mem_cgroup_clear_mc(void)
                         */
                        res_counter_uncharge(&mc.to->res,
                                                PAGE_SIZE * mc.moved_swap);
-                       VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
-                       __css_put(&mc.to->css, mc.moved_swap);
                }
                /* we've already done mem_cgroup_get(mc.to) */
 
                mc.moved_swap = 0;
        }
+       spin_lock(&mc.lock);
        mc.from = NULL;
        mc.to = NULL;
        mc.moving_task = NULL;
+       spin_unlock(&mc.lock);
+       memcg_oom_recover(from);
+       memcg_oom_recover(to);
        wake_up_all(&mc.waitq);
 }
 
@@ -4462,12 +4543,14 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
                        VM_BUG_ON(mc.moved_charge);
                        VM_BUG_ON(mc.moved_swap);
                        VM_BUG_ON(mc.moving_task);
+                       spin_lock(&mc.lock);
                        mc.from = from;
                        mc.to = mem;
                        mc.precharge = 0;
                        mc.moved_charge = 0;
                        mc.moved_swap = 0;
                        mc.moving_task = current;
+                       spin_unlock(&mc.lock);
 
                        ret = mem_cgroup_precharge_mc(mm);
                        if (ret)