memcg: fix thresholds with use_hierarchy == 1

[net-next-2.6.git] / mm / memcontrol.c
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index ea5f5edf00b79a16f9977e5a5f4172a31be74bff..9be3cf8a5da462d4b1b4103eef61f8d5a9a6e06c 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -47,6 +47,7 @@
  #include <linux/mm_inline.h>
  #include <linux/page_cgroup.h>
  #include <linux/cpu.h>
+#include <linux/oom.h>
  #include "internal.h"
  
  #include <asm/uaccess.h>
@@ -838,10 +839,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
  {
         int ret;
         struct mem_cgroup *curr = NULL;
+       struct task_struct *p;
  
-       task_lock(task);
-       curr = try_get_mem_cgroup_from_mm(task->mm);
-       task_unlock(task);
+       p = find_lock_task_mm(task);
+       if (!p)
+               return 0;
+       curr = try_get_mem_cgroup_from_mm(p->mm);
+       task_unlock(p);
         if (!curr)
                 return 0;
         /*
@@ -915,7 +919,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
                                        struct zone *zone,
                                        enum lru_list lru)
  {
-       int nid = zone->zone_pgdat->node_id;
+       int nid = zone_to_nid(zone);
         int zid = zone_idx(zone);
         struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
  
@@ -925,7 +929,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
  struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
                                                       struct zone *zone)
  {
-       int nid = zone->zone_pgdat->node_id;
+       int nid = zone_to_nid(zone);
         int zid = zone_idx(zone);
         struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
  
@@ -970,7 +974,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
         LIST_HEAD(pc_list);
         struct list_head *src;
         struct page_cgroup *pc, *tmp;
-       int nid = z->zone_pgdat->node_id;
+       int nid = zone_to_nid(z);
         int zid = zone_idx(z);
         struct mem_cgroup_per_zone *mz;
         int lru = LRU_FILE * file + active;
@@ -1296,8 +1300,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
                 /* we use swappiness of local cgroup */
                 if (check_soft)
                         ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
-                               noswap, get_swappiness(victim), zone,
-                               zone->zone_pgdat->node_id);
+                               noswap, get_swappiness(victim), zone);
                 else
                         ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
                                                 noswap, get_swappiness(victim));
@@ -1710,28 +1713,66 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
          * thread group leader migrates. It's possible that mm is not
          * set, if so charge the init_mm (happens for pagecache usage).
          */
-       if (*memcg) {
+       if (!*memcg && !mm)
+               goto bypass;
+again:
+       if (*memcg) { /* css should be a valid one */
                 mem = *memcg;
+               VM_BUG_ON(css_is_removed(&mem->css));
+               if (mem_cgroup_is_root(mem))
+                       goto done;
+               if (consume_stock(mem))
+                       goto done;
                 css_get(&mem->css);
         } else {
-               mem = try_get_mem_cgroup_from_mm(mm);
-               if (unlikely(!mem))
-                       return 0;
-               *memcg = mem;
-       }
+               struct task_struct *p;
  
-       VM_BUG_ON(css_is_removed(&mem->css));
-       if (mem_cgroup_is_root(mem))
-               goto done;
+               rcu_read_lock();
+               p = rcu_dereference(mm->owner);
+               VM_BUG_ON(!p);
+               /*
+                * because we don't have task_lock(), "p" can exit while
+                * we're here. In that case, "mem" can point to root
+                * cgroup but never be NULL. (and task_struct itself is freed
+                * by RCU, cgroup itself is RCU safe.) Then, we have small
+                * risk here to get wrong cgroup. But such kind of mis-account
+                * by race always happens because we don't have cgroup_mutex().
+                * It's overkill and we allow that small race, here.
+                */
+               mem = mem_cgroup_from_task(p);
+               VM_BUG_ON(!mem);
+               if (mem_cgroup_is_root(mem)) {
+                       rcu_read_unlock();
+                       goto done;
+               }
+               if (consume_stock(mem)) {
+                       /*
+                        * Accessing memcg without css_get() looks dangerous,
+                        * but considering how consume_stock works, it's not
+                        * necessary. If consume_stock succeeds, some charges
+                        * from this memcg are cached on this cpu. So, we
+                        * don't need to call css_get()/css_tryget() before
+                        * calling consume_stock().
+                        */
+                       rcu_read_unlock();
+                       goto done;
+               }
+               /* after here, we may be blocked. we need to get refcnt */
+               if (!css_tryget(&mem->css)) {
+                       rcu_read_unlock();
+                       goto again;
+               }
+               rcu_read_unlock();
+       }
  
         do {
                 bool oom_check;
  
-               if (consume_stock(mem))
-                       goto done; /* don't need to fill stock */
                 /* If killed, bypass charge */
-               if (fatal_signal_pending(current))
+               if (fatal_signal_pending(current)) {
+                       css_put(&mem->css);
                         goto bypass;
+               }
  
                 oom_check = false;
                 if (oom && !nr_oom_retries) {
@@ -1746,30 +1787,36 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                         break;
                 case CHARGE_RETRY: /* not in OOM situation but retry */
                         csize = PAGE_SIZE;
-                       break;
+                       css_put(&mem->css);
+                       mem = NULL;
+                       goto again;
                 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
+                       css_put(&mem->css);
                         goto nomem;
                 case CHARGE_NOMEM: /* OOM routine works */
-                       if (!oom)
+                       if (!oom) {
+                               css_put(&mem->css);
                                 goto nomem;
+                       }
                         /* If oom, we never return -ENOMEM */
                         nr_oom_retries--;
                         break;
                 case CHARGE_OOM_DIE: /* Killed by OOM Killer */
+                       css_put(&mem->css);
                         goto bypass;
                 }
         } while (ret != CHARGE_OK);
  
         if (csize > PAGE_SIZE)
                 refill_stock(mem, csize - PAGE_SIZE);
+       css_put(&mem->css);
  done:
+       *memcg = mem;
         return 0;
  nomem:
-       css_put(&mem->css);
+       *memcg = NULL;
         return -ENOMEM;
  bypass:
-       if (mem)
-               css_put(&mem->css);
         *memcg = NULL;
         return 0;
  }
@@ -1786,11 +1833,7 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
                 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
                 if (do_swap_account)
                         res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
-               VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
-               WARN_ON_ONCE(count > INT_MAX);
-               __css_put(&mem->css, (int)count);
         }
-       /* we don't need css_put for root */
  }
  
  static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
@@ -2151,7 +2194,6 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
                 goto charge_cur_mm;
         *ptr = mem;
         ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
-       /* drop extra refcnt from tryget */
         css_put(&mem->css);
         return ret;
  charge_cur_mm:
@@ -2321,10 +2363,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
                 break;
         }
  
-       if (!mem_cgroup_is_root(mem))
-               __do_uncharge(mem, ctype);
-       if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-               mem_cgroup_swap_statistics(mem, true);
         mem_cgroup_charge_statistics(mem, pc, false);
  
         ClearPageCgroupUsed(pc);
@@ -2336,11 +2374,17 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
          */
  
         unlock_page_cgroup(pc);
-
+       /*
+        * even after unlock, we have mem->res.usage here and this memcg
+        * will never be freed.
+        */
         memcg_check_events(mem, page);
-       /* at swapout, this memcg will be accessed to record to swap */
-       if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-               css_put(&mem->css);
+       if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
+               mem_cgroup_swap_statistics(mem, true);
+               mem_cgroup_get(mem);
+       }
+       if (!mem_cgroup_is_root(mem))
+               __do_uncharge(mem, ctype);
  
         return mem;
  
@@ -2427,13 +2471,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
  
         memcg = __mem_cgroup_uncharge_common(page, ctype);
  
-       /* record memcg information */
-       if (do_swap_account && swapout && memcg) {
+       /*
+        * Record memcg information. If swapout && memcg != NULL,
+        * mem_cgroup_get() was called in __mem_cgroup_uncharge_common().
+        */
+       if (do_swap_account && swapout && memcg)
                 swap_cgroup_record(ent, css_id(&memcg->css));
-               mem_cgroup_get(memcg);
-       }
-       if (swapout && memcg)
-               css_put(&memcg->css);
  }
  #endif
  
@@ -2511,7 +2554,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
                          */
                         if (!mem_cgroup_is_root(to))
                                 res_counter_uncharge(&to->res, PAGE_SIZE);
-                       css_put(&to->css);
                 }
                 return 0;
         }
@@ -2823,8 +2865,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
  }
  
  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
-                                               gfp_t gfp_mask, int nid,
-                                               int zid)
+                                           gfp_t gfp_mask)
  {
         unsigned long nr_reclaimed = 0;
         struct mem_cgroup_per_zone *mz, *next_mz = NULL;
@@ -2836,7 +2877,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
         if (order > 0)
                 return 0;
  
-       mctz = soft_limit_tree_node_zone(nid, zid);
+       mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
         /*
          * This loop can run a while, specially if mem_cgroup's continuously
          * keep exceeding their soft limit and putting the system under
@@ -3546,9 +3587,13 @@ unlock:
  
  static void mem_cgroup_threshold(struct mem_cgroup *memcg)
  {
-       __mem_cgroup_threshold(memcg, false);
-       if (do_swap_account)
-               __mem_cgroup_threshold(memcg, true);
+       while (memcg) {
+               __mem_cgroup_threshold(memcg, false);
+               if (do_swap_account)
+                       __mem_cgroup_threshold(memcg, true);
+
+               memcg = parent_mem_cgroup(memcg);
+       }
  }
  
  static int compare_thresholds(const void *a, const void *b)
@@ -4210,9 +4255,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
                         goto one_by_one;
                 }
                 mc.precharge += count;
-               VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
-               WARN_ON_ONCE(count > INT_MAX);
-               __css_get(&mem->css, (int)count);
                 return ret;
         }
  one_by_one:
@@ -4448,7 +4490,6 @@ static void mem_cgroup_clear_mc(void)
         }
         /* we must fixup refcnts and charges */
         if (mc.moved_swap) {
-               WARN_ON_ONCE(mc.moved_swap > INT_MAX);
                 /* uncharge swap account from the old cgroup */
                 if (!mem_cgroup_is_root(mc.from))
                         res_counter_uncharge(&mc.from->memsw,
@@ -4462,8 +4503,6 @@ static void mem_cgroup_clear_mc(void)
                          */
                         res_counter_uncharge(&mc.to->res,
                                                 PAGE_SIZE * mc.moved_swap);
-                       VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
-                       __css_put(&mc.to->css, mc.moved_swap);
                 }
                 /* we've already done mem_cgroup_get(mc.to) */