[net-next-2.6.git] / mm / memcontrol.c  (blob at commit "memcg: oom notifier")
1 /* memcontrol.c - Memory Controller
2  *
3  * Copyright IBM Corporation, 2007
4  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5  *
6  * Copyright 2007 OpenVZ SWsoft Inc
7  * Author: Pavel Emelianov <xemul@openvz.org>
8  *
9  * Memory thresholds
10  * Copyright (C) 2009 Nokia Corporation
11  * Author: Kirill A. Shutemov
12  *
13  * This program is free software; you can redistribute it and/or modify
14  * it under the terms of the GNU General Public License as published by
15  * the Free Software Foundation; either version 2 of the License, or
16  * (at your option) any later version.
17  *
18  * This program is distributed in the hope that it will be useful,
19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21  * GNU General Public License for more details.
22  */
23
24 #include <linux/res_counter.h>
25 #include <linux/memcontrol.h>
26 #include <linux/cgroup.h>
27 #include <linux/mm.h>
28 #include <linux/hugetlb.h>
29 #include <linux/pagemap.h>
30 #include <linux/smp.h>
31 #include <linux/page-flags.h>
32 #include <linux/backing-dev.h>
33 #include <linux/bit_spinlock.h>
34 #include <linux/rcupdate.h>
35 #include <linux/limits.h>
36 #include <linux/mutex.h>
37 #include <linux/rbtree.h>
38 #include <linux/slab.h>
39 #include <linux/swap.h>
40 #include <linux/swapops.h>
41 #include <linux/spinlock.h>
42 #include <linux/eventfd.h>
43 #include <linux/sort.h>
44 #include <linux/fs.h>
45 #include <linux/seq_file.h>
46 #include <linux/vmalloc.h>
47 #include <linux/mm_inline.h>
48 #include <linux/page_cgroup.h>
49 #include <linux/cpu.h>
50 #include "internal.h"
51
52 #include <asm/uaccess.h>
53
54 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
55 #define MEM_CGROUP_RECLAIM_RETRIES      5
56 struct mem_cgroup *root_mem_cgroup __read_mostly;
57
58 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
59 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
60 int do_swap_account __read_mostly;
61 static int really_do_swap_account __initdata = 1; /* to remember the boot option */
62 #else
63 #define do_swap_account         (0)
64 #endif
65
66 /*
67  * The per-memcg event counter is incremented at every pagein/pageout. This
68  * counter is used to trigger periodic events, which is simpler and cheaper
69  * than using jiffies etc. to drive them.
70  *
71  * These values will be used as !((event) & ((1 <<(thresh)) - 1))
72  */
73 #define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
74 #define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
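/*
 * Example: with THRESHOLDS_EVENTS_THRESH == 7, the test noted above becomes
 * !(event & 127), i.e. true once every 128 events; with
 * SOFTLIMIT_EVENTS_THRESH == 10 it is true once every 1024 events.
 */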
75
76 /*
77  * Statistics for memory cgroup.
78  */
79 enum mem_cgroup_stat_index {
80         /*
81          * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
82          */
83         MEM_CGROUP_STAT_CACHE,     /* # of pages charged as cache */
84         MEM_CGROUP_STAT_RSS,       /* # of pages charged as anon rss */
85         MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
86         MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
87         MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
88         MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
89         MEM_CGROUP_EVENTS,      /* incremented at every  pagein/pageout */
90
91         MEM_CGROUP_STAT_NSTATS,
92 };
93
94 struct mem_cgroup_stat_cpu {
95         s64 count[MEM_CGROUP_STAT_NSTATS];
96 };
97
98 /*
99  * per-zone information in memory controller.
100  */
101 struct mem_cgroup_per_zone {
102         /*
103          * spin_lock to protect the per cgroup LRU
104          */
105         struct list_head        lists[NR_LRU_LISTS];
106         unsigned long           count[NR_LRU_LISTS];
107
108         struct zone_reclaim_stat reclaim_stat;
109         struct rb_node          tree_node;      /* RB tree node */
110         unsigned long long      usage_in_excess;/* Set to the value by which */
111                                                 /* the soft limit is exceeded*/
112         bool                    on_tree;
113         struct mem_cgroup       *mem;           /* Back pointer, we cannot */
114                                                 /* use container_of        */
115 };
116 /* Macro for accessing counter */
117 #define MEM_CGROUP_ZSTAT(mz, idx)       ((mz)->count[(idx)])
118
119 struct mem_cgroup_per_node {
120         struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
121 };
122
123 struct mem_cgroup_lru_info {
124         struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
125 };
126
127 /*
128  * Cgroups above their limits are maintained in an RB-tree, independent of
129  * their hierarchy representation
130  */
131
132 struct mem_cgroup_tree_per_zone {
133         struct rb_root rb_root;
134         spinlock_t lock;
135 };
136
137 struct mem_cgroup_tree_per_node {
138         struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
139 };
140
141 struct mem_cgroup_tree {
142         struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
143 };
144
145 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
146
147 struct mem_cgroup_threshold {
148         struct eventfd_ctx *eventfd;
149         u64 threshold;
150 };
151
152 /* For threshold */
153 struct mem_cgroup_threshold_ary {
154         /* Index of the threshold just below the current usage. */
155         atomic_t current_threshold;
156         /* Size of entries[] */
157         unsigned int size;
158         /* Array of thresholds */
159         struct mem_cgroup_threshold entries[0];
160 };
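/*
 * Note: entries[] is a zero-length (flexible) array, so one allocation holds
 * this header plus "size" thresholds; current_threshold is the index of the
 * threshold just below the current usage, as noted above.
 */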
161 /* for OOM */
162 struct mem_cgroup_eventfd_list {
163         struct list_head list;
164         struct eventfd_ctx *eventfd;
165 };
166
167 static void mem_cgroup_threshold(struct mem_cgroup *mem);
168 static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
169
170 /*
171  * The memory controller data structure. The memory controller controls both
172  * page cache and RSS per cgroup. We would eventually like to provide
173  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
174  * to help the administrator determine what knobs to tune.
175  *
176  * TODO: Add a water mark for the memory controller. Reclaim will begin when
177  * we hit the water mark. Maybe even add a low water mark, such that
178  * no reclaim occurs from a cgroup at its low water mark; this is
179  * a feature that will be implemented much later in the future.
180  */
181 struct mem_cgroup {
182         struct cgroup_subsys_state css;
183         /*
184          * the counter to account for memory usage
185          */
186         struct res_counter res;
187         /*
188          * the counter to account for mem+swap usage.
189          */
190         struct res_counter memsw;
191         /*
192          * Per cgroup active and inactive list, similar to the
193          * per zone LRU lists.
194          */
195         struct mem_cgroup_lru_info info;
196
197         /*
198          * protects reclaim-related members.
199         */
200         spinlock_t reclaim_param_lock;
201
202         int     prev_priority;  /* for recording reclaim priority */
203
204         /*
205          * While reclaiming in a hierarchy, we cache the last child we
206          * reclaimed from.
207          */
208         int last_scanned_child;
209         /*
210          * Should the accounting and control be hierarchical, per subtree?
211          */
212         bool use_hierarchy;
213         atomic_t        oom_lock;
214         atomic_t        refcnt;
215
216         unsigned int    swappiness;
217
218         /* set when res.limit == memsw.limit */
219         bool            memsw_is_minimum;
220
221         /* protect arrays of thresholds */
222         struct mutex thresholds_lock;
223
224         /* thresholds for memory usage. RCU-protected */
225         struct mem_cgroup_threshold_ary *thresholds;
226
227         /* thresholds for mem+swap usage. RCU-protected */
228         struct mem_cgroup_threshold_ary *memsw_thresholds;
229
230         /* For oom notifier event fd */
231         struct list_head oom_notify;
232
233         /*
234          * Should we move charges of a task when a task is moved into this
235          * mem_cgroup ? And what type of charges should we move ?
236          */
237         unsigned long   move_charge_at_immigrate;
238
239         /*
240          * percpu counter.
241          */
242         struct mem_cgroup_stat_cpu *stat;
243 };
244
245 /* Stuff for moving charges at task migration. */
246 /*
247  * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
248  * left-shifted bitmap of these types.
249  */
250 enum move_type {
251         MOVE_CHARGE_TYPE_ANON,  /* private anonymous page and swap of it */
252         NR_MOVE_TYPE,
253 };
254
255 /* "mc" and its members are protected by cgroup_mutex */
256 static struct move_charge_struct {
257         struct mem_cgroup *from;
258         struct mem_cgroup *to;
259         unsigned long precharge;
260         unsigned long moved_charge;
261         unsigned long moved_swap;
262         struct task_struct *moving_task;        /* a task moving charges */
263         wait_queue_head_t waitq;                /* a waitq for other context */
264 } mc = {
265         .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
266 };
267
268 /*
269  * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
270  * limit reclaim to prevent infinite loops, if they ever occur.
271  */
272 #define MEM_CGROUP_MAX_RECLAIM_LOOPS            (100)
273 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
274
275 enum charge_type {
276         MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
277         MEM_CGROUP_CHARGE_TYPE_MAPPED,
278         MEM_CGROUP_CHARGE_TYPE_SHMEM,   /* used by page migration of shmem */
279         MEM_CGROUP_CHARGE_TYPE_FORCE,   /* used by force_empty */
280         MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
281         MEM_CGROUP_CHARGE_TYPE_DROP,    /* a page was unused swap cache */
282         NR_CHARGE_TYPE,
283 };
284
285 /* shorthands used only in this file (for readability) */
286 #define PCGF_CACHE      (1UL << PCG_CACHE)
287 #define PCGF_USED       (1UL << PCG_USED)
288 #define PCGF_LOCK       (1UL << PCG_LOCK)
289 /* Not used, but added here for completeness */
290 #define PCGF_ACCT       (1UL << PCG_ACCT)
291
292 /* for encoding cft->private value on file */
293 #define _MEM                    (0)
294 #define _MEMSWAP                (1)
295 #define _OOM_TYPE               (2)
296 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
297 #define MEMFILE_TYPE(val)       (((val) >> 16) & 0xffff)
298 #define MEMFILE_ATTR(val)       ((val) & 0xffff)
299 /* Used for the OOM notifier */
300 #define OOM_CONTROL             (0)
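/*
 * Example: MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT) packs the counter type into
 * the upper 16 bits and the res_counter attribute into the lower 16 bits of
 * a cftype's ->private value; MEMFILE_TYPE() and MEMFILE_ATTR() recover
 * _MEMSWAP and RES_LIMIT again in the file read/write handlers.
 */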
301
302 /*
303  * Reclaim flags for mem_cgroup_hierarchical_reclaim
304  */
305 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT   0x0
306 #define MEM_CGROUP_RECLAIM_NOSWAP       (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
307 #define MEM_CGROUP_RECLAIM_SHRINK_BIT   0x1
308 #define MEM_CGROUP_RECLAIM_SHRINK       (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
309 #define MEM_CGROUP_RECLAIM_SOFT_BIT     0x2
310 #define MEM_CGROUP_RECLAIM_SOFT         (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
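/*
 * These bits are OR'ed into the reclaim_options argument of
 * mem_cgroup_hierarchical_reclaim() and tested there with a plain bitwise
 * AND, e.g. reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP.
 */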
311
312 static void mem_cgroup_get(struct mem_cgroup *mem);
313 static void mem_cgroup_put(struct mem_cgroup *mem);
314 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
315 static void drain_all_stock_async(void);
316
317 static struct mem_cgroup_per_zone *
318 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
319 {
320         return &mem->info.nodeinfo[nid]->zoneinfo[zid];
321 }
322
323 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
324 {
325         return &mem->css;
326 }
327
328 static struct mem_cgroup_per_zone *
329 page_cgroup_zoneinfo(struct page_cgroup *pc)
330 {
331         struct mem_cgroup *mem = pc->mem_cgroup;
332         int nid = page_cgroup_nid(pc);
333         int zid = page_cgroup_zid(pc);
334
335         if (!mem)
336                 return NULL;
337
338         return mem_cgroup_zoneinfo(mem, nid, zid);
339 }
340
341 static struct mem_cgroup_tree_per_zone *
342 soft_limit_tree_node_zone(int nid, int zid)
343 {
344         return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
345 }
346
347 static struct mem_cgroup_tree_per_zone *
348 soft_limit_tree_from_page(struct page *page)
349 {
350         int nid = page_to_nid(page);
351         int zid = page_zonenum(page);
352
353         return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
354 }
355
356 static void
357 __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
358                                 struct mem_cgroup_per_zone *mz,
359                                 struct mem_cgroup_tree_per_zone *mctz,
360                                 unsigned long long new_usage_in_excess)
361 {
362         struct rb_node **p = &mctz->rb_root.rb_node;
363         struct rb_node *parent = NULL;
364         struct mem_cgroup_per_zone *mz_node;
365
366         if (mz->on_tree)
367                 return;
368
369         mz->usage_in_excess = new_usage_in_excess;
370         if (!mz->usage_in_excess)
371                 return;
372         while (*p) {
373                 parent = *p;
374                 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
375                                         tree_node);
376                 if (mz->usage_in_excess < mz_node->usage_in_excess)
377                         p = &(*p)->rb_left;
378                 /*
379                  * We can't avoid mem cgroups that are over their soft
380                  * limit by the same amount
381                  */
382                 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
383                         p = &(*p)->rb_right;
384         }
385         rb_link_node(&mz->tree_node, parent, p);
386         rb_insert_color(&mz->tree_node, &mctz->rb_root);
387         mz->on_tree = true;
388 }
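/*
 * Because the tree is ordered by usage_in_excess, rb_last() (see
 * __mem_cgroup_largest_soft_limit_node() below) always returns the per-zone
 * node of the memcg that exceeds its soft limit by the largest amount.
 */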
389
390 static void
391 __mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
392                                 struct mem_cgroup_per_zone *mz,
393                                 struct mem_cgroup_tree_per_zone *mctz)
394 {
395         if (!mz->on_tree)
396                 return;
397         rb_erase(&mz->tree_node, &mctz->rb_root);
398         mz->on_tree = false;
399 }
400
401 static void
402 mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
403                                 struct mem_cgroup_per_zone *mz,
404                                 struct mem_cgroup_tree_per_zone *mctz)
405 {
406         spin_lock(&mctz->lock);
407         __mem_cgroup_remove_exceeded(mem, mz, mctz);
408         spin_unlock(&mctz->lock);
409 }
410
411
412 static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
413 {
414         unsigned long long excess;
415         struct mem_cgroup_per_zone *mz;
416         struct mem_cgroup_tree_per_zone *mctz;
417         int nid = page_to_nid(page);
418         int zid = page_zonenum(page);
419         mctz = soft_limit_tree_from_page(page);
420
421         /*
422          * Necessary to update all ancestors when hierarchy is used,
423          * because their event counters are not touched.
424          */
425         for (; mem; mem = parent_mem_cgroup(mem)) {
426                 mz = mem_cgroup_zoneinfo(mem, nid, zid);
427                 excess = res_counter_soft_limit_excess(&mem->res);
428                 /*
429          * We have to update the tree if mz is on the RB-tree or
430          * mem is over its soft limit.
431                  */
432                 if (excess || mz->on_tree) {
433                         spin_lock(&mctz->lock);
434                         /* if on-tree, remove it */
435                         if (mz->on_tree)
436                                 __mem_cgroup_remove_exceeded(mem, mz, mctz);
437                         /*
438                          * Insert again. mz->usage_in_excess will be updated.
439                          * If excess is 0, no tree ops.
440                          */
441                         __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
442                         spin_unlock(&mctz->lock);
443                 }
444         }
445 }
446
447 static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
448 {
449         int node, zone;
450         struct mem_cgroup_per_zone *mz;
451         struct mem_cgroup_tree_per_zone *mctz;
452
453         for_each_node_state(node, N_POSSIBLE) {
454                 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
455                         mz = mem_cgroup_zoneinfo(mem, node, zone);
456                         mctz = soft_limit_tree_node_zone(node, zone);
457                         mem_cgroup_remove_exceeded(mem, mz, mctz);
458                 }
459         }
460 }
461
462 static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
463 {
464         return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
465 }
466
467 static struct mem_cgroup_per_zone *
468 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
469 {
470         struct rb_node *rightmost = NULL;
471         struct mem_cgroup_per_zone *mz;
472
473 retry:
474         mz = NULL;
475         rightmost = rb_last(&mctz->rb_root);
476         if (!rightmost)
477                 goto done;              /* Nothing to reclaim from */
478
479         mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
480         /*
481          * Remove the node now, but someone else can add it back;
482          * we will add it back at the end of reclaim to its correct
483          * position in the tree.
484          */
485         __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
486         if (!res_counter_soft_limit_excess(&mz->mem->res) ||
487                 !css_tryget(&mz->mem->css))
488                 goto retry;
489 done:
490         return mz;
491 }
492
493 static struct mem_cgroup_per_zone *
494 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
495 {
496         struct mem_cgroup_per_zone *mz;
497
498         spin_lock(&mctz->lock);
499         mz = __mem_cgroup_largest_soft_limit_node(mctz);
500         spin_unlock(&mctz->lock);
501         return mz;
502 }
503
504 static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
505                 enum mem_cgroup_stat_index idx)
506 {
507         int cpu;
508         s64 val = 0;
509
510         for_each_possible_cpu(cpu)
511                 val += per_cpu(mem->stat->count[idx], cpu);
512         return val;
513 }
514
515 static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
516 {
517         s64 ret;
518
519         ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
520         ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
521         return ret;
522 }
523
524 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
525                                          bool charge)
526 {
527         int val = (charge) ? 1 : -1;
528         this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
529 }
530
531 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
532                                          struct page_cgroup *pc,
533                                          bool charge)
534 {
535         int val = (charge) ? 1 : -1;
536
537         preempt_disable();
538
539         if (PageCgroupCache(pc))
540                 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
541         else
542                 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
543
544         if (charge)
545                 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
546         else
547                 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
548         __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
549
550         preempt_enable();
551 }
552
553 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
554                                         enum lru_list idx)
555 {
556         int nid, zid;
557         struct mem_cgroup_per_zone *mz;
558         u64 total = 0;
559
560         for_each_online_node(nid)
561                 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
562                         mz = mem_cgroup_zoneinfo(mem, nid, zid);
563                         total += MEM_CGROUP_ZSTAT(mz, idx);
564                 }
565         return total;
566 }
567
568 static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
569 {
570         s64 val;
571
572         val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]);
573
574         return !(val & ((1 << event_mask_shift) - 1));
575 }
576
577 /*
578  * Check events in order: the (cheaper) threshold check runs once every
579  * 128 events, the soft limit tree update once every 1024.
580  */
581 static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
582 {
583         /* threshold event is triggered in finer grain than soft limit */
584         if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) {
585                 mem_cgroup_threshold(mem);
586                 if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH)))
587                         mem_cgroup_update_tree(mem, page);
588         }
589 }
590
591 static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
592 {
593         return container_of(cgroup_subsys_state(cont,
594                                 mem_cgroup_subsys_id), struct mem_cgroup,
595                                 css);
596 }
597
598 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
599 {
600         /*
601          * mm_update_next_owner() may clear mm->owner to NULL
602          * if it races with swapoff, page migration, etc.
603          * So this can be called with p == NULL.
604          */
605         if (unlikely(!p))
606                 return NULL;
607
608         return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
609                                 struct mem_cgroup, css);
610 }
611
612 static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
613 {
614         struct mem_cgroup *mem = NULL;
615
616         if (!mm)
617                 return NULL;
618         /*
619          * Because we have no locks, mm->owner may be being moved to another
620          * cgroup. We use css_tryget() here even if this looks
621          * pessimistic (rather than adding locks here).
622          */
623         rcu_read_lock();
624         do {
625                 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
626                 if (unlikely(!mem))
627                         break;
628         } while (!css_tryget(&mem->css));
629         rcu_read_unlock();
630         return mem;
631 }
632
633 /*
634  * Call the callback function against all cgroups under the hierarchy tree.
635  */
636 static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
637                           int (*func)(struct mem_cgroup *, void *))
638 {
639         int found, ret, nextid;
640         struct cgroup_subsys_state *css;
641         struct mem_cgroup *mem;
642
643         if (!root->use_hierarchy)
644                 return (*func)(root, data);
645
646         nextid = 1;
647         do {
648                 ret = 0;
649                 mem = NULL;
650
651                 rcu_read_lock();
652                 css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
653                                    &found);
654                 if (css && css_tryget(css))
655                         mem = container_of(css, struct mem_cgroup, css);
656                 rcu_read_unlock();
657
658                 if (mem) {
659                         ret = (*func)(mem, data);
660                         css_put(&mem->css);
661                 }
662                 nextid = found + 1;
663         } while (!ret && css);
664
665         return ret;
666 }
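/*
 * Usage sketch: mem_cgroup_count_children() below walks the tree with
 * mem_cgroup_count_children_cb() to count every memcg in the subtree, and
 * the OOM lock/unlock paths use the same walk to take or drop oom_lock
 * across the whole hierarchy.
 */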
667
668 static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
669 {
670         return (mem == root_mem_cgroup);
671 }
672
673 /*
674  * The following LRU functions may be used without PCG_LOCK.
675  * They are called by the global LRU code independently of memcg.
676  * What we have to take care of here is the validity of pc->mem_cgroup.
677  *
678  * pc->mem_cgroup changes on
679  * 1. charge
680  * 2. moving account
681  * In the typical case, "charge" is done before add-to-LRU. The exception is
682  * SwapCache, which is added to the LRU before being charged.
683  * If the PCG_USED bit is not set, the page_cgroup is not added to this private LRU.
684  * When moving account, the page is not on the LRU; it is isolated.
685  */
686
687 void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
688 {
689         struct page_cgroup *pc;
690         struct mem_cgroup_per_zone *mz;
691
692         if (mem_cgroup_disabled())
693                 return;
694         pc = lookup_page_cgroup(page);
695         /* can happen while we handle swapcache. */
696         if (!TestClearPageCgroupAcctLRU(pc))
697                 return;
698         VM_BUG_ON(!pc->mem_cgroup);
699         /*
700          * We don't check the PCG_USED bit; it's cleared when the page is finally
701          * removed from the global LRU.
702          */
703         mz = page_cgroup_zoneinfo(pc);
704         MEM_CGROUP_ZSTAT(mz, lru) -= 1;
705         if (mem_cgroup_is_root(pc->mem_cgroup))
706                 return;
707         VM_BUG_ON(list_empty(&pc->lru));
708         list_del_init(&pc->lru);
709         return;
710 }
711
712 void mem_cgroup_del_lru(struct page *page)
713 {
714         mem_cgroup_del_lru_list(page, page_lru(page));
715 }
716
717 void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
718 {
719         struct mem_cgroup_per_zone *mz;
720         struct page_cgroup *pc;
721
722         if (mem_cgroup_disabled())
723                 return;
724
725         pc = lookup_page_cgroup(page);
726         /*
727          * The Used bit is set without atomic ops, but after an smp_wmb().
728          * Insert an smp_rmb() here to make pc->mem_cgroup visible.
729          */
730         smp_rmb();
731         /* unused or root page is not rotated. */
732         if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
733                 return;
734         mz = page_cgroup_zoneinfo(pc);
735         list_move(&pc->lru, &mz->lists[lru]);
736 }
737
738 void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
739 {
740         struct page_cgroup *pc;
741         struct mem_cgroup_per_zone *mz;
742
743         if (mem_cgroup_disabled())
744                 return;
745         pc = lookup_page_cgroup(page);
746         VM_BUG_ON(PageCgroupAcctLRU(pc));
747         /*
748          * The Used bit is set without atomic ops, but after an smp_wmb().
749          * Insert an smp_rmb() here to make pc->mem_cgroup visible.
750          */
751         smp_rmb();
752         if (!PageCgroupUsed(pc))
753                 return;
754
755         mz = page_cgroup_zoneinfo(pc);
756         MEM_CGROUP_ZSTAT(mz, lru) += 1;
757         SetPageCgroupAcctLRU(pc);
758         if (mem_cgroup_is_root(pc->mem_cgroup))
759                 return;
760         list_add(&pc->lru, &mz->lists[lru]);
761 }
762
763 /*
764  * While handling SwapCache, pc->mem_cgroup may be changed while it's linked to
765  * the LRU, because the page may be reused after it's fully uncharged (due to
766  * SwapCache behavior). To handle that, unlink the page_cgroup from the LRU when
767  * charging it again. This function is only used to charge SwapCache. It's done
768  * under lock_page() and it's expected that zone->lru_lock is never held.
769  */
770 static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
771 {
772         unsigned long flags;
773         struct zone *zone = page_zone(page);
774         struct page_cgroup *pc = lookup_page_cgroup(page);
775
776         spin_lock_irqsave(&zone->lru_lock, flags);
777         /*
778          * Forget old LRU when this page_cgroup is *not* used. This Used bit
779          * is guarded by lock_page() because the page is SwapCache.
780          */
781         if (!PageCgroupUsed(pc))
782                 mem_cgroup_del_lru_list(page, page_lru(page));
783         spin_unlock_irqrestore(&zone->lru_lock, flags);
784 }
785
786 static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
787 {
788         unsigned long flags;
789         struct zone *zone = page_zone(page);
790         struct page_cgroup *pc = lookup_page_cgroup(page);
791
792         spin_lock_irqsave(&zone->lru_lock, flags);
793         /* link when the page is linked to LRU but page_cgroup isn't */
794         if (PageLRU(page) && !PageCgroupAcctLRU(pc))
795                 mem_cgroup_add_lru_list(page, page_lru(page));
796         spin_unlock_irqrestore(&zone->lru_lock, flags);
797 }
798
799
800 void mem_cgroup_move_lists(struct page *page,
801                            enum lru_list from, enum lru_list to)
802 {
803         if (mem_cgroup_disabled())
804                 return;
805         mem_cgroup_del_lru_list(page, from);
806         mem_cgroup_add_lru_list(page, to);
807 }
808
809 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
810 {
811         int ret;
812         struct mem_cgroup *curr = NULL;
813
814         task_lock(task);
815         rcu_read_lock();
816         curr = try_get_mem_cgroup_from_mm(task->mm);
817         rcu_read_unlock();
818         task_unlock(task);
819         if (!curr)
820                 return 0;
821         /*
822          * We should check use_hierarchy of "mem", not "curr", because checking
823          * use_hierarchy of "curr" here would make this function return true if
824          * hierarchy is enabled in "curr" and "curr" is a child of "mem" in the
825          * *cgroup* hierarchy (even if use_hierarchy is disabled in "mem").
826          */
827         if (mem->use_hierarchy)
828                 ret = css_is_ancestor(&curr->css, &mem->css);
829         else
830                 ret = (curr == mem);
831         css_put(&curr->css);
832         return ret;
833 }
834
835 /*
836  * prev_priority control: this will be used in the memory reclaim path.
837  */
838 int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
839 {
840         int prev_priority;
841
842         spin_lock(&mem->reclaim_param_lock);
843         prev_priority = mem->prev_priority;
844         spin_unlock(&mem->reclaim_param_lock);
845
846         return prev_priority;
847 }
848
849 void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
850 {
851         spin_lock(&mem->reclaim_param_lock);
852         if (priority < mem->prev_priority)
853                 mem->prev_priority = priority;
854         spin_unlock(&mem->reclaim_param_lock);
855 }
856
857 void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
858 {
859         spin_lock(&mem->reclaim_param_lock);
860         mem->prev_priority = priority;
861         spin_unlock(&mem->reclaim_param_lock);
862 }
863
864 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
865 {
866         unsigned long active;
867         unsigned long inactive;
868         unsigned long gb;
869         unsigned long inactive_ratio;
870
871         inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
872         active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
873
874         gb = (inactive + active) >> (30 - PAGE_SHIFT);
875         if (gb)
876                 inactive_ratio = int_sqrt(10 * gb);
877         else
878                 inactive_ratio = 1;
879
880         if (present_pages) {
881                 present_pages[0] = inactive;
882                 present_pages[1] = active;
883         }
884
885         return inactive_ratio;
886 }
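/*
 * Example: with 4KB pages, a cgroup holding 3GB of anon pages
 * (inactive + active) gives gb == 3 and inactive_ratio == int_sqrt(30) == 5,
 * so inactive anon is considered low when inactive * 5 < active
 * (see mem_cgroup_inactive_anon_is_low()).
 */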
887
888 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
889 {
890         unsigned long active;
891         unsigned long inactive;
892         unsigned long present_pages[2];
893         unsigned long inactive_ratio;
894
895         inactive_ratio = calc_inactive_ratio(memcg, present_pages);
896
897         inactive = present_pages[0];
898         active = present_pages[1];
899
900         if (inactive * inactive_ratio < active)
901                 return 1;
902
903         return 0;
904 }
905
906 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
907 {
908         unsigned long active;
909         unsigned long inactive;
910
911         inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
912         active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
913
914         return (active > inactive);
915 }
916
917 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
918                                        struct zone *zone,
919                                        enum lru_list lru)
920 {
921         int nid = zone->zone_pgdat->node_id;
922         int zid = zone_idx(zone);
923         struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
924
925         return MEM_CGROUP_ZSTAT(mz, lru);
926 }
927
928 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
929                                                       struct zone *zone)
930 {
931         int nid = zone->zone_pgdat->node_id;
932         int zid = zone_idx(zone);
933         struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
934
935         return &mz->reclaim_stat;
936 }
937
938 struct zone_reclaim_stat *
939 mem_cgroup_get_reclaim_stat_from_page(struct page *page)
940 {
941         struct page_cgroup *pc;
942         struct mem_cgroup_per_zone *mz;
943
944         if (mem_cgroup_disabled())
945                 return NULL;
946
947         pc = lookup_page_cgroup(page);
948         /*
949          * The Used bit is set without atomic ops, but after an smp_wmb().
950          * Insert an smp_rmb() here to make pc->mem_cgroup visible.
951          */
952         smp_rmb();
953         if (!PageCgroupUsed(pc))
954                 return NULL;
955
956         mz = page_cgroup_zoneinfo(pc);
957         if (!mz)
958                 return NULL;
959
960         return &mz->reclaim_stat;
961 }
962
963 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
964                                         struct list_head *dst,
965                                         unsigned long *scanned, int order,
966                                         int mode, struct zone *z,
967                                         struct mem_cgroup *mem_cont,
968                                         int active, int file)
969 {
970         unsigned long nr_taken = 0;
971         struct page *page;
972         unsigned long scan;
973         LIST_HEAD(pc_list);
974         struct list_head *src;
975         struct page_cgroup *pc, *tmp;
976         int nid = z->zone_pgdat->node_id;
977         int zid = zone_idx(z);
978         struct mem_cgroup_per_zone *mz;
979         int lru = LRU_FILE * file + active;
980         int ret;
981
982         BUG_ON(!mem_cont);
983         mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
984         src = &mz->lists[lru];
985
986         scan = 0;
987         list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
988                 if (scan >= nr_to_scan)
989                         break;
990
991                 page = pc->page;
992                 if (unlikely(!PageCgroupUsed(pc)))
993                         continue;
994                 if (unlikely(!PageLRU(page)))
995                         continue;
996
997                 scan++;
998                 ret = __isolate_lru_page(page, mode, file);
999                 switch (ret) {
1000                 case 0:
1001                         list_move(&page->lru, dst);
1002                         mem_cgroup_del_lru(page);
1003                         nr_taken++;
1004                         break;
1005                 case -EBUSY:
1006                         /* we don't affect global LRU but rotate in our LRU */
1007                         mem_cgroup_rotate_lru_list(page, page_lru(page));
1008                         break;
1009                 default:
1010                         break;
1011                 }
1012         }
1013
1014         *scanned = scan;
1015         return nr_taken;
1016 }
1017
1018 #define mem_cgroup_from_res_counter(counter, member)    \
1019         container_of(counter, struct mem_cgroup, member)
1020
1021 static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
1022 {
1023         if (do_swap_account) {
1024                 if (res_counter_check_under_limit(&mem->res) &&
1025                         res_counter_check_under_limit(&mem->memsw))
1026                         return true;
1027         } else
1028                 if (res_counter_check_under_limit(&mem->res))
1029                         return true;
1030         return false;
1031 }
1032
1033 static unsigned int get_swappiness(struct mem_cgroup *memcg)
1034 {
1035         struct cgroup *cgrp = memcg->css.cgroup;
1036         unsigned int swappiness;
1037
1038         /* root ? */
1039         if (cgrp->parent == NULL)
1040                 return vm_swappiness;
1041
1042         spin_lock(&memcg->reclaim_param_lock);
1043         swappiness = memcg->swappiness;
1044         spin_unlock(&memcg->reclaim_param_lock);
1045
1046         return swappiness;
1047 }
1048
1049 static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
1050 {
1051         int *val = data;
1052         (*val)++;
1053         return 0;
1054 }
1055
1056 /**
1057  * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1058  * @memcg: The memory cgroup that went over limit
1059  * @p: Task that is going to be killed
1060  *
1061  * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1062  * enabled
1063  */
1064 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1065 {
1066         struct cgroup *task_cgrp;
1067         struct cgroup *mem_cgrp;
1068         /*
1069          * Need a buffer in BSS, can't rely on allocations. The code relies
1070          * on the assumption that OOM is serialized for the memory controller.
1071          * If this assumption is broken, revisit this code.
1072          */
1073         static char memcg_name[PATH_MAX];
1074         int ret;
1075
1076         if (!memcg || !p)
1077                 return;
1078
1079
1080         rcu_read_lock();
1081
1082         mem_cgrp = memcg->css.cgroup;
1083         task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1084
1085         ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1086         if (ret < 0) {
1087                 /*
1088                  * Unfortunately, we are unable to convert to a useful name,
1089                  * but we'll still print out the usage information.
1090                  */
1091                 rcu_read_unlock();
1092                 goto done;
1093         }
1094         rcu_read_unlock();
1095
1096         printk(KERN_INFO "Task in %s killed", memcg_name);
1097
1098         rcu_read_lock();
1099         ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1100         if (ret < 0) {
1101                 rcu_read_unlock();
1102                 goto done;
1103         }
1104         rcu_read_unlock();
1105
1106         /*
1107          * Continues from above, so we don't need a KERN_ level.
1108          */
1109         printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1110 done:
1111
1112         printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1113                 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1114                 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1115                 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1116         printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1117                 "failcnt %llu\n",
1118                 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1119                 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1120                 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1121 }
1122
1123 /*
1124  * This function returns the number of memcgs under the hierarchy tree.
1125  * Returns 1 (the self count) if there are no children.
1126  */
1127 static int mem_cgroup_count_children(struct mem_cgroup *mem)
1128 {
1129         int num = 0;
1130         mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
1131         return num;
1132 }
1133
1134 /*
1135  * Visit the first child (need not be the first child as per the ordering
1136  * of the cgroup list, since we track last_scanned_child) of @mem and use
1137  * that to reclaim free pages from.
1138  */
1139 static struct mem_cgroup *
1140 mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1141 {
1142         struct mem_cgroup *ret = NULL;
1143         struct cgroup_subsys_state *css;
1144         int nextid, found;
1145
1146         if (!root_mem->use_hierarchy) {
1147                 css_get(&root_mem->css);
1148                 ret = root_mem;
1149         }
1150
1151         while (!ret) {
1152                 rcu_read_lock();
1153                 nextid = root_mem->last_scanned_child + 1;
1154                 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
1155                                    &found);
1156                 if (css && css_tryget(css))
1157                         ret = container_of(css, struct mem_cgroup, css);
1158
1159                 rcu_read_unlock();
1160                 /* Updates scanning parameter */
1161                 spin_lock(&root_mem->reclaim_param_lock);
1162                 if (!css) {
1163                         /* this means start scan from ID:1 */
1164                         root_mem->last_scanned_child = 0;
1165                 } else
1166                         root_mem->last_scanned_child = found;
1167                 spin_unlock(&root_mem->reclaim_param_lock);
1168         }
1169
1170         return ret;
1171 }
1172
1173 /*
1174  * Scan the hierarchy if needed to reclaim memory. We remember the last child
1175  * we reclaimed from, so that we don't end up penalizing one child extensively
1176  * based on its position in the children list.
1177  *
1178  * root_mem is the original ancestor that we've been reclaiming from.
1179  *
1180  * We give up and return to the caller when we visit root_mem twice.
1181  * (other groups can be removed while we're walking....)
1182  *
1183  * If shrink==true, this returns immediately to avoid freeing too much.
1184  */
1185 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1186                                                 struct zone *zone,
1187                                                 gfp_t gfp_mask,
1188                                                 unsigned long reclaim_options)
1189 {
1190         struct mem_cgroup *victim;
1191         int ret, total = 0;
1192         int loop = 0;
1193         bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1194         bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1195         bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1196         unsigned long excess = mem_cgroup_get_excess(root_mem);
1197
1198         /* If memsw_is_minimum==1, swapping out is of no use. */
1199         if (root_mem->memsw_is_minimum)
1200                 noswap = true;
1201
1202         while (1) {
1203                 victim = mem_cgroup_select_victim(root_mem);
1204                 if (victim == root_mem) {
1205                         loop++;
1206                         if (loop >= 1)
1207                                 drain_all_stock_async();
1208                         if (loop >= 2) {
1209                                 /*
1210                                  * If we have not been able to reclaim
1211                                  * anything, it might be because there are
1212                                  * no reclaimable pages under this hierarchy
1213                                  */
1214                                 if (!check_soft || !total) {
1215                                         css_put(&victim->css);
1216                                         break;
1217                                 }
1218                                 /*
1219                                  * We want to do more targeted reclaim.
1220                                  * excess >> 2 is not too large, so we don't
1221                                  * reclaim too much, nor too small, so we don't
1222                                  * keep coming back to reclaim from this cgroup
1223                                  */
1224                                 if (total >= (excess >> 2) ||
1225                                         (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
1226                                         css_put(&victim->css);
1227                                         break;
1228                                 }
1229                         }
1230                 }
1231                 if (!mem_cgroup_local_usage(victim)) {
1232                         /* this cgroup's local usage == 0 */
1233                         css_put(&victim->css);
1234                         continue;
1235                 }
1236                 /* we use swappiness of local cgroup */
1237                 if (check_soft)
1238                         ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1239                                 noswap, get_swappiness(victim), zone,
1240                                 zone->zone_pgdat->node_id);
1241                 else
1242                         ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1243                                                 noswap, get_swappiness(victim));
1244                 css_put(&victim->css);
1245                 /*
1246                  * When shrinking usage, we can't tell here whether we should
1247                  * stop or reclaim more; that depends on the caller.
1248                  * last_scanned_child is enough to keep fairness across the tree.
1249                  */
1250                 if (shrink)
1251                         return ret;
1252                 total += ret;
1253                 if (check_soft) {
1254                         if (res_counter_check_under_soft_limit(&root_mem->res))
1255                                 return total;
1256                 } else if (mem_cgroup_check_under_limit(root_mem))
1257                         return 1 + total;
1258         }
1259         return total;
1260 }
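/*
 * Return value: with MEM_CGROUP_RECLAIM_SHRINK the result of the last reclaim
 * pass is returned as-is; with MEM_CGROUP_RECLAIM_SOFT the total number of
 * reclaimed pages is returned; otherwise the loop returns 1 + total as soon
 * as root_mem is back under its limit, or whatever was reclaimed if it
 * gives up.
 */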
1261
1262 static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
1263 {
1264         int *val = (int *)data;
1265         int x;
1266         /*
1267          * Logically, we can stop scanning immediately when we find
1268          * a memcg is already locked. But considering unlock ops and
1269          * creation/removal of memcgs, scanning them all is the simpler operation.
1270          */
1271         x = atomic_inc_return(&mem->oom_lock);
1272         *val = max(x, *val);
1273         return 0;
1274 }
1275 /*
1276  * Check whether the OOM killer is already running under our hierarchy.
1277  * If someone is already running it, return false.
1278  */
1279 static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1280 {
1281         int lock_count = 0;
1282
1283         mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
1284
1285         if (lock_count == 1)
1286                 return true;
1287         return false;
1288 }
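/*
 * mem_cgroup_oom_lock_cb() records the maximum post-increment value of
 * oom_lock seen anywhere in the subtree, so lock_count == 1 means no other
 * task already held the lock on any memcg in this hierarchy.
 */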
1289
1290 static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
1291 {
1292         /*
1293          * When a new child is created while the hierarchy is under oom,
1294          * mem_cgroup_oom_lock() may not be called. We have to use
1295          * atomic_add_unless() here.
1296          */
1297         atomic_add_unless(&mem->oom_lock, -1, 0);
1298         return 0;
1299 }
1300
1301 static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1302 {
1303         mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
1304 }
1305
1306 static DEFINE_MUTEX(memcg_oom_mutex);
1307 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1308
1309 struct oom_wait_info {
1310         struct mem_cgroup *mem;
1311         wait_queue_t    wait;
1312 };
1313
1314 static int memcg_oom_wake_function(wait_queue_t *wait,
1315         unsigned mode, int sync, void *arg)
1316 {
1317         struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
1318         struct oom_wait_info *oom_wait_info;
1319
1320         oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1321
1322         if (oom_wait_info->mem == wake_mem)
1323                 goto wakeup;
1324         /* if no hierarchy, no match */
1325         if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
1326                 return 0;
1327         /*
1328          * Both oom_wait_info->mem and wake_mem are stable under us,
1329          * so we can use css_is_ancestor() without worrying about RCU.
1330          */
1331         if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
1332             !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
1333                 return 0;
1334
1335 wakeup:
1336         return autoremove_wake_function(wait, mode, sync, arg);
1337 }
1338
1339 static void memcg_wakeup_oom(struct mem_cgroup *mem)
1340 {
1341         /* for filtering, pass "mem" as argument. */
1342         __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
1343 }
1344
1345 /*
1346  * Try to call the OOM killer. Returns false if we should exit the memory-reclaim loop.
1347  */
1348 bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1349 {
1350         struct oom_wait_info owait;
1351         bool locked;
1352
1353         owait.mem = mem;
1354         owait.wait.flags = 0;
1355         owait.wait.func = memcg_oom_wake_function;
1356         owait.wait.private = current;
1357         INIT_LIST_HEAD(&owait.wait.task_list);
1358
1359         /* At first, try to OOM lock hierarchy under mem.*/
1360         mutex_lock(&memcg_oom_mutex);
1361         locked = mem_cgroup_oom_lock(mem);
1362         /*
1363          * Even if signal_pending(), we can't quit charge() loop without
1364          * accounting, so UNINTERRUPTIBLE would be appropriate. But SIGKILL
1365          * under OOM is always welcome, so use TASK_KILLABLE here.
1366          */
1367         if (!locked)
1368                 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1369         else
1370                 mem_cgroup_oom_notify(mem);
1371         mutex_unlock(&memcg_oom_mutex);
1372
1373         if (locked)
1374                 mem_cgroup_out_of_memory(mem, mask);
1375         else {
1376                 schedule();
1377                 finish_wait(&memcg_oom_waitq, &owait.wait);
1378         }
1379         mutex_lock(&memcg_oom_mutex);
1380         mem_cgroup_oom_unlock(mem);
1381         memcg_wakeup_oom(mem);
1382         mutex_unlock(&memcg_oom_mutex);
1383
1384         if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1385                 return false;
1386         /* Give chance to dying process */
1387         schedule_timeout(1);
1388         return true;
1389 }
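/*
 * In short: the first task to take oom_lock on the whole hierarchy notifies
 * the eventfd listeners and calls mem_cgroup_out_of_memory(); everyone else
 * sleeps in TASK_KILLABLE on memcg_oom_waitq until the lock holder wakes the
 * hierarchy up again via memcg_wakeup_oom().
 */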
1390
1391 /*
1392  * Currently used to update mapped file statistics, but the routine can be
1393  * generalized to update other statistics as well.
1394  */
1395 void mem_cgroup_update_file_mapped(struct page *page, int val)
1396 {
1397         struct mem_cgroup *mem;
1398         struct page_cgroup *pc;
1399
1400         pc = lookup_page_cgroup(page);
1401         if (unlikely(!pc))
1402                 return;
1403
1404         lock_page_cgroup(pc);
1405         mem = pc->mem_cgroup;
1406         if (!mem || !PageCgroupUsed(pc))
1407                 goto done;
1408
1409         /*
1410          * Preemption is already disabled. We can use __this_cpu_xxx
1411          */
1412         if (val > 0) {
1413                 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1414                 SetPageCgroupFileMapped(pc);
1415         } else {
1416                 __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1417                 ClearPageCgroupFileMapped(pc);
1418         }
1419
1420 done:
1421         unlock_page_cgroup(pc);
1422 }
1423
1424 /*
1425  * size of first charge trial. "32" comes from vmscan.c's magic value.
1426  * TODO: it may be necessary to use bigger numbers on big iron.
1427  */
1428 #define CHARGE_SIZE     (32 * PAGE_SIZE)
1429 struct memcg_stock_pcp {
1430         struct mem_cgroup *cached; /* this never be root cgroup */
1431         int charge;
1432         struct work_struct work;
1433 };
1434 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1435 static atomic_t memcg_drain_count;
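/*
 * Rough picture of the stock machinery: each cpu keeps a small stock of
 * pre-charged pages for a single memcg. consume_stock() hands out PAGE_SIZE
 * at a time from that stock, refill_stock() tops it up, and
 * drain_stock()/drain_all_stock_*() return whatever is left to the
 * res_counter (e.g. before hierarchical reclaim or on cpu hotplug).
 */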
1436
1437 /*
1438  * Try to consume stocked charge on this cpu. On success, PAGE_SIZE is consumed
1439  * from the local stock and true is returned. If the stock is 0 or holds charges
1440  * from a cgroup which is not the current target, false is returned and the
1441  * stock will have to be refilled.
1442  */
1443 static bool consume_stock(struct mem_cgroup *mem)
1444 {
1445         struct memcg_stock_pcp *stock;
1446         bool ret = true;
1447
1448         stock = &get_cpu_var(memcg_stock);
1449         if (mem == stock->cached && stock->charge)
1450                 stock->charge -= PAGE_SIZE;
1451         else /* need to call res_counter_charge */
1452                 ret = false;
1453         put_cpu_var(memcg_stock);
1454         return ret;
1455 }
1456
1457 /*
1458  * Return the stock cached in the percpu area to the res_counter and reset the cached information.
1459  */
1460 static void drain_stock(struct memcg_stock_pcp *stock)
1461 {
1462         struct mem_cgroup *old = stock->cached;
1463
1464         if (stock->charge) {
1465                 res_counter_uncharge(&old->res, stock->charge);
1466                 if (do_swap_account)
1467                         res_counter_uncharge(&old->memsw, stock->charge);
1468         }
1469         stock->cached = NULL;
1470         stock->charge = 0;
1471 }
1472
1473 /*
1474  * This must be called with preemption disabled, or by a thread
1475  * which is pinned to the local cpu.
1476  */
1477 static void drain_local_stock(struct work_struct *dummy)
1478 {
1479         struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
1480         drain_stock(stock);
1481 }
1482
1483 /*
1484  * Cache charges (val), already taken from the res_counter, in the local
1485  * per-cpu area. They will be consumed later by consume_stock().
1486  */
1487 static void refill_stock(struct mem_cgroup *mem, int val)
1488 {
1489         struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
1490
1491         if (stock->cached != mem) { /* reset if necessary */
1492                 drain_stock(stock);
1493                 stock->cached = mem;
1494         }
1495         stock->charge += val;
1496         put_cpu_var(memcg_stock);
1497 }
1498
1499 /*
1500  * Tries to drain stocked charges on other cpus. This function is asynchronous
1501  * and just queues a work item per cpu to drain locally on each cpu. The caller
1502  * can expect some charges to come back to the res_counter later, but cannot
1503  * wait for that.
1504  */
1505 static void drain_all_stock_async(void)
1506 {
1507         int cpu;
1508         /* This function schedules "drain" asynchronously.
1509          * The result of "drain" is not directly handled by callers, so
1510          * if someone is already draining, we don't have to drain again.
1511          * Anyway, the WORK_STRUCT_PENDING check in queue_work_on() will catch
1512          * any race; we just do a loose check here.
1513          */
1514         if (atomic_read(&memcg_drain_count))
1515                 return;
1516         /* Notify other cpus that system-wide "drain" is running */
1517         atomic_inc(&memcg_drain_count);
1518         get_online_cpus();
1519         for_each_online_cpu(cpu) {
1520                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1521                 schedule_work_on(cpu, &stock->work);
1522         }
1523         put_online_cpus();
1524         atomic_dec(&memcg_drain_count);
1525         /* We don't wait for flush_work */
1526 }
1527
1528 /* This is a synchronous drain interface. */
1529 static void drain_all_stock_sync(void)
1530 {
1531         /* called when force_empty is called */
1532         atomic_inc(&memcg_drain_count);
1533         schedule_on_each_cpu(drain_local_stock);
1534         atomic_dec(&memcg_drain_count);
1535 }
1536
1537 static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
1538                                         unsigned long action,
1539                                         void *hcpu)
1540 {
1541         int cpu = (unsigned long)hcpu;
1542         struct memcg_stock_pcp *stock;
1543
1544         if (action != CPU_DEAD)
1545                 return NOTIFY_OK;
1546         stock = &per_cpu(memcg_stock, cpu);
1547         drain_stock(stock);
1548         return NOTIFY_OK;
1549 }
1550
1551 /*
1552  * Unlike the exported interface, an "oom" parameter is added. If oom==true,
1553  * the oom-killer can be invoked.
1554  */
1555 static int __mem_cgroup_try_charge(struct mm_struct *mm,
1556                         gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
1557 {
1558         struct mem_cgroup *mem, *mem_over_limit;
1559         int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1560         struct res_counter *fail_res;
1561         int csize = CHARGE_SIZE;
1562
1563         /*
1564          * Unlike the global VM's OOM kill, we are not in a system-level
1565          * memory shortage. So, allow a dying process to go ahead, in
1566          * addition to a MEMDIE process.
1567          */
1568         if (unlikely(test_thread_flag(TIF_MEMDIE)
1569                      || fatal_signal_pending(current)))
1570                 goto bypass;
1571
1572         /*
1573          * We always charge the cgroup the mm_struct belongs to.
1574          * The mm_struct's mem_cgroup changes on task migration if the
1575          * thread group leader migrates. It's possible that mm is not
1576          * set, if so charge the init_mm (happens for pagecache usage).
1577          */
1578         mem = *memcg;
1579         if (likely(!mem)) {
1580                 mem = try_get_mem_cgroup_from_mm(mm);
1581                 *memcg = mem;
1582         } else {
1583                 css_get(&mem->css);
1584         }
1585         if (unlikely(!mem))
1586                 return 0;
1587
1588         VM_BUG_ON(css_is_removed(&mem->css));
1589         if (mem_cgroup_is_root(mem))
1590                 goto done;
1591
1592         while (1) {
1593                 int ret = 0;
1594                 unsigned long flags = 0;
1595
1596                 if (consume_stock(mem))
1597                         goto done;
1598
1599                 ret = res_counter_charge(&mem->res, csize, &fail_res);
1600                 if (likely(!ret)) {
1601                         if (!do_swap_account)
1602                                 break;
1603                         ret = res_counter_charge(&mem->memsw, csize, &fail_res);
1604                         if (likely(!ret))
1605                                 break;
1606                         /* mem+swap counter fails */
1607                         res_counter_uncharge(&mem->res, csize);
1608                         flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1609                         mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1610                                                                         memsw);
1611                 } else
1612                         /* mem counter fails */
1613                         mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1614                                                                         res);
1615
1616                 /* reduce request size and retry */
1617                 if (csize > PAGE_SIZE) {
1618                         csize = PAGE_SIZE;
1619                         continue;
1620                 }
1621                 if (!(gfp_mask & __GFP_WAIT))
1622                         goto nomem;
1623
1624                 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1625                                                 gfp_mask, flags);
1626                 if (ret)
1627                         continue;
1628
1629                 /*
1630                  * try_to_free_mem_cgroup_pages() might not give us a full
1631                  * picture of reclaim. Some pages are reclaimed and might be
1632                  * moved to swap cache or just unmapped from the cgroup.
1633                  * Check the limit again to see if the reclaim reduced the
1634                  * current usage of the cgroup before giving up.
1635                  */
1637                 if (mem_cgroup_check_under_limit(mem_over_limit))
1638                         continue;
1639
1640                 /* try to avoid oom while someone is moving charge */
1641                 if (mc.moving_task && current != mc.moving_task) {
1642                         struct mem_cgroup *from, *to;
1643                         bool do_continue = false;
1644                         /*
1645                          * There is a small race in which "from" or "to" can
1646                          * be freed by rmdir, so we use css_tryget().
1647                          */
1648                         from = mc.from;
1649                         to = mc.to;
1650                         if (from && css_tryget(&from->css)) {
1651                                 if (mem_over_limit->use_hierarchy)
1652                                         do_continue = css_is_ancestor(
1653                                                         &from->css,
1654                                                         &mem_over_limit->css);
1655                                 else
1656                                         do_continue = (from == mem_over_limit);
1657                                 css_put(&from->css);
1658                         }
1659                         if (!do_continue && to && css_tryget(&to->css)) {
1660                                 if (mem_over_limit->use_hierarchy)
1661                                         do_continue = css_is_ancestor(
1662                                                         &to->css,
1663                                                         &mem_over_limit->css);
1664                                 else
1665                                         do_continue = (to == mem_over_limit);
1666                                 css_put(&to->css);
1667                         }
1668                         if (do_continue) {
1669                                 DEFINE_WAIT(wait);
1670                                 prepare_to_wait(&mc.waitq, &wait,
1671                                                         TASK_INTERRUPTIBLE);
1672                                 /* moving charge context might have finished. */
1673                                 if (mc.moving_task)
1674                                         schedule();
1675                                 finish_wait(&mc.waitq, &wait);
1676                                 continue;
1677                         }
1678                 }
1679
1680                 if (!nr_retries--) {
1681                         if (!oom)
1682                                 goto nomem;
1683                         if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
1684                                 nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1685                                 continue;
1686                         }
1687                         /* When we reach here, the current task is dying. */
1688                         css_put(&mem->css);
1689                         goto bypass;
1690                 }
1691         }
1692         if (csize > PAGE_SIZE)
1693                 refill_stock(mem, csize - PAGE_SIZE);
1694 done:
1695         return 0;
1696 nomem:
1697         css_put(&mem->css);
1698         return -ENOMEM;
1699 bypass:
1700         *memcg = NULL;
1701         return 0;
1702 }
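
/*
 * Summary of the retry loop above: try the per-cpu stock first, then
 * charge CHARGE_SIZE from res (and memsw when swap accounting is on).
 * On failure, shrink the request to PAGE_SIZE, run hierarchical reclaim
 * against the cgroup that hit its limit, possibly wait for a concurrent
 * "move charge" to finish, and only after MEM_CGROUP_RECLAIM_RETRIES
 * failed rounds either invoke the memcg OOM handler (oom==true) or
 * return -ENOMEM.
 */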
1703
1704 /*
1705  * Sometimes we have to undo a charge we got by try_charge().
1706  * This function is for that: it does the uncharge and puts the css refcount
1707  * gotten by try_charge().
1708  */
1709 static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
1710                                                         unsigned long count)
1711 {
1712         if (!mem_cgroup_is_root(mem)) {
1713                 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
1714                 if (do_swap_account)
1715                         res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
1716                 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
1717                 WARN_ON_ONCE(count > INT_MAX);
1718                 __css_put(&mem->css, (int)count);
1719         }
1720         /* we don't need css_put for root */
1721 }
1722
1723 static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
1724 {
1725         __mem_cgroup_cancel_charge(mem, 1);
1726 }
1727
1728 /*
1729  * A helper function to get a mem_cgroup from an ID. Must be called under
1730  * rcu_read_lock(). The caller must check css_is_removed() or similar if
1731  * that is a concern. (Dropping a refcount from swap can be done against a
1732  * removed memcg.)
1733  */
1734 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
1735 {
1736         struct cgroup_subsys_state *css;
1737
1738         /* ID 0 is unused ID */
1739         if (!id)
1740                 return NULL;
1741         css = css_lookup(&mem_cgroup_subsys, id);
1742         if (!css)
1743                 return NULL;
1744         return container_of(css, struct mem_cgroup, css);
1745 }
1746
1747 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
1748 {
1749         struct mem_cgroup *mem = NULL;
1750         struct page_cgroup *pc;
1751         unsigned short id;
1752         swp_entry_t ent;
1753
1754         VM_BUG_ON(!PageLocked(page));
1755
1756         pc = lookup_page_cgroup(page);
1757         lock_page_cgroup(pc);
1758         if (PageCgroupUsed(pc)) {
1759                 mem = pc->mem_cgroup;
1760                 if (mem && !css_tryget(&mem->css))
1761                         mem = NULL;
1762         } else if (PageSwapCache(page)) {
1763                 ent.val = page_private(page);
1764                 id = lookup_swap_cgroup(ent);
1765                 rcu_read_lock();
1766                 mem = mem_cgroup_lookup(id);
1767                 if (mem && !css_tryget(&mem->css))
1768                         mem = NULL;
1769                 rcu_read_unlock();
1770         }
1771         unlock_page_cgroup(pc);
1772         return mem;
1773 }
1774
1775 /*
1776  * Commit a charge got by __mem_cgroup_try_charge() and make the page_cgroup
1777  * USED. If it is already USED, uncharge and return.
1778  */
1779
1780 static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1781                                      struct page_cgroup *pc,
1782                                      enum charge_type ctype)
1783 {
1784         /* try_charge() can return NULL in *memcg; handle that case here. */
1785         if (!mem)
1786                 return;
1787
1788         lock_page_cgroup(pc);
1789         if (unlikely(PageCgroupUsed(pc))) {
1790                 unlock_page_cgroup(pc);
1791                 mem_cgroup_cancel_charge(mem);
1792                 return;
1793         }
1794
1795         pc->mem_cgroup = mem;
1796         /*
1797          * We access a page_cgroup asynchronously without lock_page_cgroup().
1798          * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
1799          * is accessed after testing the USED bit. To make pc->mem_cgroup
1800          * visible before the USED bit, we need a memory barrier here.
1801          * See mem_cgroup_add_lru_list(), etc.
1802          */
1803         smp_wmb();
1804         switch (ctype) {
1805         case MEM_CGROUP_CHARGE_TYPE_CACHE:
1806         case MEM_CGROUP_CHARGE_TYPE_SHMEM:
1807                 SetPageCgroupCache(pc);
1808                 SetPageCgroupUsed(pc);
1809                 break;
1810         case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1811                 ClearPageCgroupCache(pc);
1812                 SetPageCgroupUsed(pc);
1813                 break;
1814         default:
1815                 break;
1816         }
1817
1818         mem_cgroup_charge_statistics(mem, pc, true);
1819
1820         unlock_page_cgroup(pc);
1821         /*
1822          * "charge_statistics" updated the event counter, so check it now.
1823          * Insert the ancestor (and the ancestor's ancestors) into the
1824          * softlimit RB-tree if they exceed their softlimit.
1825          */
1826         memcg_check_events(mem, pc->page);
1827 }
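
/*
 * Typical caller pattern (a sketch; mem_cgroup_charge_common() below does
 * exactly this):
 *
 *      mem = memcg;
 *      ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
 *      if (ret || !mem)
 *              return ret;
 *      __mem_cgroup_commit_charge(mem, pc, ctype);
 *
 * try_charge() takes the res_counter charge and a css reference; commit
 * marks the page_cgroup USED, or cancels the charge if it already is.
 */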
1828
1829 /**
1830  * __mem_cgroup_move_account - move account of the page
1831  * @pc: page_cgroup of the page.
1832  * @from: mem_cgroup which the page is moved from.
1833  * @to: mem_cgroup which the page is moved to. @from != @to.
1834  * @uncharge: whether we should call uncharge and css_put against @from.
1835  *
1836  * The caller must confirm the following:
1837  * - the page is not on the LRU (isolate_page() is useful.)
1838  * - the pc is locked, used, and ->mem_cgroup points to @from.
1839  *
1840  * This function doesn't do "charge" or css_get to the new cgroup; that should
1841  * be done by the caller (__mem_cgroup_try_charge would be useful). If
1842  * @uncharge is true, this function does "uncharge" from the old cgroup, but it
1843  * doesn't if @uncharge is false, so the caller should do the "uncharge" then.
1844  */
1845
1846 static void __mem_cgroup_move_account(struct page_cgroup *pc,
1847         struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
1848 {
1849         VM_BUG_ON(from == to);
1850         VM_BUG_ON(PageLRU(pc->page));
1851         VM_BUG_ON(!PageCgroupLocked(pc));
1852         VM_BUG_ON(!PageCgroupUsed(pc));
1853         VM_BUG_ON(pc->mem_cgroup != from);
1854
1855         if (PageCgroupFileMapped(pc)) {
1856                 /* Update mapped_file data for mem_cgroup */
1857                 preempt_disable();
1858                 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1859                 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1860                 preempt_enable();
1861         }
1862         mem_cgroup_charge_statistics(from, pc, false);
1863         if (uncharge)
1864                 /* This is not "cancel", but cancel_charge does all we need. */
1865                 mem_cgroup_cancel_charge(from);
1866
1867         /* caller should have done css_get */
1868         pc->mem_cgroup = to;
1869         mem_cgroup_charge_statistics(to, pc, true);
1870         /*
1871          * We charge against "to", which may not have any tasks, so "to"
1872          * can be under rmdir(). But in the current implementation, the
1873          * callers of this function are just force_empty() and move charge,
1874          * so it is guaranteed that "to" is never removed. So, we don't
1875          * check the rmdir status here.
1876          */
1877 }
1878
1879 /*
1880  * check whether the @pc is valid for moving account and call
1881  * __mem_cgroup_move_account()
1882  */
1883 static int mem_cgroup_move_account(struct page_cgroup *pc,
1884                 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
1885 {
1886         int ret = -EINVAL;
1887         lock_page_cgroup(pc);
1888         if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
1889                 __mem_cgroup_move_account(pc, from, to, uncharge);
1890                 ret = 0;
1891         }
1892         unlock_page_cgroup(pc);
1893         /*
1894          * check events
1895          */
1896         memcg_check_events(to, pc->page);
1897         memcg_check_events(from, pc->page);
1898         return ret;
1899 }
1900
1901 /*
1902  * Move charges to the parent cgroup.
1903  */
1904
1905 static int mem_cgroup_move_parent(struct page_cgroup *pc,
1906                                   struct mem_cgroup *child,
1907                                   gfp_t gfp_mask)
1908 {
1909         struct page *page = pc->page;
1910         struct cgroup *cg = child->css.cgroup;
1911         struct cgroup *pcg = cg->parent;
1912         struct mem_cgroup *parent;
1913         int ret;
1914
1915         /* Is ROOT ? */
1916         if (!pcg)
1917                 return -EINVAL;
1918
1919         ret = -EBUSY;
1920         if (!get_page_unless_zero(page))
1921                 goto out;
1922         if (isolate_lru_page(page))
1923                 goto put;
1924
1925         parent = mem_cgroup_from_cont(pcg);
1926         ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
1927         if (ret || !parent)
1928                 goto put_back;
1929
1930         ret = mem_cgroup_move_account(pc, child, parent, true);
1931         if (ret)
1932                 mem_cgroup_cancel_charge(parent);
1933 put_back:
1934         putback_lru_page(page);
1935 put:
1936         put_page(page);
1937 out:
1938         return ret;
1939 }
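
/*
 * Note on the move-to-parent protocol above: the parent is pre-charged
 * with __mem_cgroup_try_charge(), then mem_cgroup_move_account() switches
 * pc->mem_cgroup from the child to the parent with uncharge==true so the
 * child's charge (and css reference) is dropped inside the move; if the
 * move fails, the pre-charged parent is cancelled instead.
 */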
1940
1941 /*
1942  * Charge the memory controller for page usage.
1943  * Return
1944  * 0 if the charge was successful
1945  * < 0 if the cgroup is over its limit
1946  */
1947 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1948                                 gfp_t gfp_mask, enum charge_type ctype,
1949                                 struct mem_cgroup *memcg)
1950 {
1951         struct mem_cgroup *mem;
1952         struct page_cgroup *pc;
1953         int ret;
1954
1955         pc = lookup_page_cgroup(page);
1956         /* can happen at boot */
1957         if (unlikely(!pc))
1958                 return 0;
1959         prefetchw(pc);
1960
1961         mem = memcg;
1962         ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
1963         if (ret || !mem)
1964                 return ret;
1965
1966         __mem_cgroup_commit_charge(mem, pc, ctype);
1967         return 0;
1968 }
1969
1970 int mem_cgroup_newpage_charge(struct page *page,
1971                               struct mm_struct *mm, gfp_t gfp_mask)
1972 {
1973         if (mem_cgroup_disabled())
1974                 return 0;
1975         if (PageCompound(page))
1976                 return 0;
1977         /*
1978          * If the page is already mapped, we don't have to account for it.
1979          * If it is page cache, page->mapping holds an address_space.
1980          * But page->mapping may hold a stale anon_vma pointer; detect that
1981          * with the PageAnon() check. A newly-mapped anon page's
1982          * page->mapping is NULL.
1983          */
1984         if (page_mapped(page) || (page->mapping && !PageAnon(page)))
1985                 return 0;
1986         if (unlikely(!mm))
1987                 mm = &init_mm;
1988         return mem_cgroup_charge_common(page, mm, gfp_mask,
1989                                 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
1990 }
1991
1992 static void
1993 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1994                                         enum charge_type ctype);
1995
1996 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1997                                 gfp_t gfp_mask)
1998 {
1999         struct mem_cgroup *mem = NULL;
2000         int ret;
2001
2002         if (mem_cgroup_disabled())
2003                 return 0;
2004         if (PageCompound(page))
2005                 return 0;
2006         /*
2007          * Corner case handling. This is usually called from
2008          * add_to_page_cache(). But some filesystems (shmem) precharge the
2009          * page before calling it and then call add_to_page_cache() with
2010          * GFP_NOWAIT.
2011          *
2012          * In the GFP_NOWAIT case, the page may be pre-charged before calling
2013          * add_to_page_cache() (see shmem.c). Check for that here and avoid
2014          * charging twice. (It works but has to pay a bit larger cost.)
2015          * And when the page is SwapCache, we should take the swap
2016          * information into account. This is under lock_page() now.
2017          */
2017         if (!(gfp_mask & __GFP_WAIT)) {
2018                 struct page_cgroup *pc;
2019
2020
2021                 pc = lookup_page_cgroup(page);
2022                 if (!pc)
2023                         return 0;
2024                 lock_page_cgroup(pc);
2025                 if (PageCgroupUsed(pc)) {
2026                         unlock_page_cgroup(pc);
2027                         return 0;
2028                 }
2029                 unlock_page_cgroup(pc);
2030         }
2031
2032         if (unlikely(!mm && !mem))
2033                 mm = &init_mm;
2034
2035         if (page_is_file_cache(page))
2036                 return mem_cgroup_charge_common(page, mm, gfp_mask,
2037                                 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
2038
2039         /* shmem */
2040         if (PageSwapCache(page)) {
2041                 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2042                 if (!ret)
2043                         __mem_cgroup_commit_charge_swapin(page, mem,
2044                                         MEM_CGROUP_CHARGE_TYPE_SHMEM);
2045         } else
2046                 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2047                                         MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
2048
2049         return ret;
2050 }
2051
2052 /*
2053  * During swap-in (try_charge -> commit or cancel), the page is locked.
2054  * And when try_charge() returns successfully, one refcount on the memcg,
2055  * not tied to a struct page_cgroup, has been acquired. This refcount will
2056  * be consumed by "commit()" or released by "cancel()".
2057  */
2058 int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2059                                  struct page *page,
2060                                  gfp_t mask, struct mem_cgroup **ptr)
2061 {
2062         struct mem_cgroup *mem;
2063         int ret;
2064
2065         if (mem_cgroup_disabled())
2066                 return 0;
2067
2068         if (!do_swap_account)
2069                 goto charge_cur_mm;
2070         /*
2071          * A racing thread's fault, or swapoff, may have already updated
2072          * the pte, and even removed page from swap cache: in those cases
2073          * do_swap_page()'s pte_same() test will fail; but there's also a
2074          * KSM case which does need to charge the page.
2075          */
2076         if (!PageSwapCache(page))
2077                 goto charge_cur_mm;
2078         mem = try_get_mem_cgroup_from_page(page);
2079         if (!mem)
2080                 goto charge_cur_mm;
2081         *ptr = mem;
2082         ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
2083         /* drop extra refcnt from tryget */
2084         css_put(&mem->css);
2085         return ret;
2086 charge_cur_mm:
2087         if (unlikely(!mm))
2088                 mm = &init_mm;
2089         return __mem_cgroup_try_charge(mm, mask, ptr, true);
2090 }
2091
2092 static void
2093 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2094                                         enum charge_type ctype)
2095 {
2096         struct page_cgroup *pc;
2097
2098         if (mem_cgroup_disabled())
2099                 return;
2100         if (!ptr)
2101                 return;
2102         cgroup_exclude_rmdir(&ptr->css);
2103         pc = lookup_page_cgroup(page);
2104         mem_cgroup_lru_del_before_commit_swapcache(page);
2105         __mem_cgroup_commit_charge(ptr, pc, ctype);
2106         mem_cgroup_lru_add_after_commit_swapcache(page);
2107         /*
2108          * Now the swap is in memory. This means this page may be
2109          * counted both as mem and as swap: a double count.
2110          * Fix it by uncharging from memsw. Basically, this SwapCache is stable
2111          * under lock_page(). But in do_swap_page() (memory.c), reuse_swap_page()
2112          * may call delete_from_swap_cache() before we reach here.
2113          */
2114         if (do_swap_account && PageSwapCache(page)) {
2115                 swp_entry_t ent = {.val = page_private(page)};
2116                 unsigned short id;
2117                 struct mem_cgroup *memcg;
2118
2119                 id = swap_cgroup_record(ent, 0);
2120                 rcu_read_lock();
2121                 memcg = mem_cgroup_lookup(id);
2122                 if (memcg) {
2123                         /*
2124                          * The recorded memcg can be an obsolete one, so
2125                          * avoid calling css_tryget().
2126                          */
2127                         if (!mem_cgroup_is_root(memcg))
2128                                 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2129                         mem_cgroup_swap_statistics(memcg, false);
2130                         mem_cgroup_put(memcg);
2131                 }
2132                 rcu_read_unlock();
2133         }
2134         /*
2135          * At swapin, we may charge against a cgroup which has no tasks.
2136          * So, rmdir()->pre_destroy() can be called while we do this charge.
2137          * In that case, we need to call pre_destroy() again; check it here.
2138          */
2139         cgroup_release_and_wakeup_rmdir(&ptr->css);
2140 }
2141
2142 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
2143 {
2144         __mem_cgroup_commit_charge_swapin(page, ptr,
2145                                         MEM_CGROUP_CHARGE_TYPE_MAPPED);
2146 }
2147
2148 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
2149 {
2150         if (mem_cgroup_disabled())
2151                 return;
2152         if (!mem)
2153                 return;
2154         mem_cgroup_cancel_charge(mem);
2155 }
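
/*
 * Swap-in charge protocol as used from the fault path (sketch):
 *
 *      mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &ptr);
 *      ... map the page ...
 *      mem_cgroup_commit_charge_swapin(page, ptr);     on success
 *      mem_cgroup_cancel_charge_swapin(ptr);           on failure
 *
 * The page is locked across the whole sequence, as noted above.
 */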
2156
2157 static void
2158 __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2159 {
2160         struct memcg_batch_info *batch = NULL;
2161         bool uncharge_memsw = true;
2162         /* If swapout, usage of swap doesn't decrease */
2163         if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2164                 uncharge_memsw = false;
2165         /*
2166          * do_batch > 0 when unmapping pages or during inode invalidate/truncate.
2167          * In those cases, all pages freed continuously can be expected to be in
2168          * the same cgroup and we have a chance to coalesce uncharges.
2169          * But we uncharge one by one if the task is being OOM-killed (TIF_MEMDIE)
2170          * because we want to do the uncharge as soon as possible.
2171          */
2172         if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
2173                 goto direct_uncharge;
2174
2175         batch = &current->memcg_batch;
2176         /*
2177          * Usually, we do css_get() when we remember a memcg pointer.
2178          * But in this case, we keep res->usage until the end of a series of
2179          * uncharges, so it's OK to ignore the memcg's refcount.
2180          */
2181         if (!batch->memcg)
2182                 batch->memcg = mem;
2183         /*
2184          * In the typical case, batch->memcg == mem. This means we can
2185          * merge a series of uncharges into one uncharge of the res_counter.
2186          * If not, we uncharge the res_counter one by one.
2187          */
2188         if (batch->memcg != mem)
2189                 goto direct_uncharge;
2190         /* remember freed charge and uncharge it later */
2191         batch->bytes += PAGE_SIZE;
2192         if (uncharge_memsw)
2193                 batch->memsw_bytes += PAGE_SIZE;
2194         return;
2195 direct_uncharge:
2196         res_counter_uncharge(&mem->res, PAGE_SIZE);
2197         if (uncharge_memsw)
2198                 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
2199         return;
2200 }
2201
2202 /*
2203  * uncharge if !page_mapped(page)
2204  */
2205 static struct mem_cgroup *
2206 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2207 {
2208         struct page_cgroup *pc;
2209         struct mem_cgroup *mem = NULL;
2210         struct mem_cgroup_per_zone *mz;
2211
2212         if (mem_cgroup_disabled())
2213                 return NULL;
2214
2215         if (PageSwapCache(page))
2216                 return NULL;
2217
2218         /*
2219          * Check if our page_cgroup is valid
2220          */
2221         pc = lookup_page_cgroup(page);
2222         if (unlikely(!pc || !PageCgroupUsed(pc)))
2223                 return NULL;
2224
2225         lock_page_cgroup(pc);
2226
2227         mem = pc->mem_cgroup;
2228
2229         if (!PageCgroupUsed(pc))
2230                 goto unlock_out;
2231
2232         switch (ctype) {
2233         case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2234         case MEM_CGROUP_CHARGE_TYPE_DROP:
2235                 if (page_mapped(page))
2236                         goto unlock_out;
2237                 break;
2238         case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
2239                 if (!PageAnon(page)) {  /* Shared memory */
2240                         if (page->mapping && !page_is_file_cache(page))
2241                                 goto unlock_out;
2242                 } else if (page_mapped(page)) /* Anon */
2243                                 goto unlock_out;
2244                 break;
2245         default:
2246                 break;
2247         }
2248
2249         if (!mem_cgroup_is_root(mem))
2250                 __do_uncharge(mem, ctype);
2251         if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2252                 mem_cgroup_swap_statistics(mem, true);
2253         mem_cgroup_charge_statistics(mem, pc, false);
2254
2255         ClearPageCgroupUsed(pc);
2256         /*
2257          * pc->mem_cgroup is not cleared here. It will be accessed when the
2258          * page is freed from the LRU. This is safe because an uncharged
2259          * page is expected not to be reused (it is freed soon). The
2260          * exception is SwapCache, which is handled by special functions.
2261          */
2262
2263         mz = page_cgroup_zoneinfo(pc);
2264         unlock_page_cgroup(pc);
2265
2266         memcg_check_events(mem, page);
2267         /* at swapout, this memcg will be accessed to record to swap */
2268         if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2269                 css_put(&mem->css);
2270
2271         return mem;
2272
2273 unlock_out:
2274         unlock_page_cgroup(pc);
2275         return NULL;
2276 }
2277
2278 void mem_cgroup_uncharge_page(struct page *page)
2279 {
2280         /* early check. */
2281         if (page_mapped(page))
2282                 return;
2283         if (page->mapping && !PageAnon(page))
2284                 return;
2285         __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
2286 }
2287
2288 void mem_cgroup_uncharge_cache_page(struct page *page)
2289 {
2290         VM_BUG_ON(page_mapped(page));
2291         VM_BUG_ON(page->mapping);
2292         __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
2293 }
2294
2295 /*
2296  * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate.
2297  * In those cases, pages are freed continuously and we can expect them to
2298  * be in the same memcg. All of these callers themselves limit the number
2299  * of pages freed at once, so uncharge_start/end() is called properly.
2300  * This may be called multiple (nested) times in a context.
2301  */
2302
2303 void mem_cgroup_uncharge_start(void)
2304 {
2305         current->memcg_batch.do_batch++;
2306         /* We can do nest. */
2307         if (current->memcg_batch.do_batch == 1) {
2308                 current->memcg_batch.memcg = NULL;
2309                 current->memcg_batch.bytes = 0;
2310                 current->memcg_batch.memsw_bytes = 0;
2311         }
2312 }
2313
2314 void mem_cgroup_uncharge_end(void)
2315 {
2316         struct memcg_batch_info *batch = &current->memcg_batch;
2317
2318         if (!batch->do_batch)
2319                 return;
2320
2321         batch->do_batch--;
2322         if (batch->do_batch) /* If stacked, do nothing. */
2323                 return;
2324
2325         if (!batch->memcg)
2326                 return;
2327         /*
2328          * This "batch->memcg" is valid without any css_get/put etc.
2329          * because we hide charges behind us.
2330          */
2331         if (batch->bytes)
2332                 res_counter_uncharge(&batch->memcg->res, batch->bytes);
2333         if (batch->memsw_bytes)
2334                 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
2335         /* forget this pointer (for sanity check) */
2336         batch->memcg = NULL;
2337 }
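
/*
 * Usage sketch for the batching above (the callers are unmap/truncate
 * paths, see the comment before mem_cgroup_uncharge_start()):
 *
 *      mem_cgroup_uncharge_start();
 *      for each page being freed:
 *              mem_cgroup_uncharge_page(page);
 *      mem_cgroup_uncharge_end();
 *
 * start/end pairs may nest; only the outermost end() flushes the
 * accumulated bytes back to the res_counter.
 */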
2338
2339 #ifdef CONFIG_SWAP
2340 /*
2341  * Called after __delete_from_swap_cache() to drop the "page" account.
2342  * The memcg information is recorded in the swap_cgroup of "ent".
2343  */
2344 void
2345 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
2346 {
2347         struct mem_cgroup *memcg;
2348         int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
2349
2350         if (!swapout) /* this was a swap cache but the swap is unused ! */
2351                 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
2352
2353         memcg = __mem_cgroup_uncharge_common(page, ctype);
2354
2355         /* record memcg information */
2356         if (do_swap_account && swapout && memcg) {
2357                 swap_cgroup_record(ent, css_id(&memcg->css));
2358                 mem_cgroup_get(memcg);
2359         }
2360         if (swapout && memcg)
2361                 css_put(&memcg->css);
2362 }
2363 #endif
2364
2365 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2366 /*
2367  * Called from swap_entry_free(). Remove the record in swap_cgroup and
2368  * uncharge the "memsw" account.
2369  */
2370 void mem_cgroup_uncharge_swap(swp_entry_t ent)
2371 {
2372         struct mem_cgroup *memcg;
2373         unsigned short id;
2374
2375         if (!do_swap_account)
2376                 return;
2377
2378         id = swap_cgroup_record(ent, 0);
2379         rcu_read_lock();
2380         memcg = mem_cgroup_lookup(id);
2381         if (memcg) {
2382                 /*
2383                  * We uncharge this because the swap is freed. This memcg
2384                  * can be an obsolete one, so we avoid calling css_tryget().
2385                  */
2386                 if (!mem_cgroup_is_root(memcg))
2387                         res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2388                 mem_cgroup_swap_statistics(memcg, false);
2389                 mem_cgroup_put(memcg);
2390         }
2391         rcu_read_unlock();
2392 }
2393
2394 /**
2395  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2396  * @entry: swap entry to be moved
2397  * @from:  mem_cgroup which the entry is moved from
2398  * @to:  mem_cgroup which the entry is moved to
2399  * @need_fixup: whether we should fixup res_counters and refcounts.
2400  *
2401  * It succeeds only when the swap_cgroup's record for this entry is the same
2402  * as the mem_cgroup's id of @from.
2403  *
2404  * Returns 0 on success, -EINVAL on failure.
2405  *
2406  * The caller must have charged to @to, IOW, called res_counter_charge() about
2407  * both res and memsw, and called css_get().
2408  */
2409 static int mem_cgroup_move_swap_account(swp_entry_t entry,
2410                 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2411 {
2412         unsigned short old_id, new_id;
2413
2414         old_id = css_id(&from->css);
2415         new_id = css_id(&to->css);
2416
2417         if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2418                 mem_cgroup_swap_statistics(from, false);
2419                 mem_cgroup_swap_statistics(to, true);
2420                 /*
2421                  * This function is only called from task migration context now.
2422                  * It postpones res_counter and refcount handling till the end
2423                  * of task migration(mem_cgroup_clear_mc()) for performance
2424                  * improvement. But we cannot postpone mem_cgroup_get(to)
2425                  * because if the process that has been moved to @to does
2426                  * swap-in, the refcount of @to might be decreased to 0.
2427                  */
2428                 mem_cgroup_get(to);
2429                 if (need_fixup) {
2430                         if (!mem_cgroup_is_root(from))
2431                                 res_counter_uncharge(&from->memsw, PAGE_SIZE);
2432                         mem_cgroup_put(from);
2433                         /*
2434                          * we charged both to->res and to->memsw, so we should
2435                          * uncharge to->res.
2436                          */
2437                         if (!mem_cgroup_is_root(to))
2438                                 res_counter_uncharge(&to->res, PAGE_SIZE);
2439                         css_put(&to->css);
2440                 }
2441                 return 0;
2442         }
2443         return -EINVAL;
2444 }
2445 #else
2446 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2447                 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2448 {
2449         return -EINVAL;
2450 }
2451 #endif
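
/*
 * mem_cgroup_move_swap_account() relies on swap_cgroup_cmpxchg(): the
 * swap_cgroup record is switched from @from's css id to @to's only if it
 * still names @from, so the move simply fails with -EINVAL if the entry's
 * owner has already changed underneath us.
 */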
2452
2453 /*
2454  * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
2455  * page belongs to.
2456  */
2457 int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
2458 {
2459         struct page_cgroup *pc;
2460         struct mem_cgroup *mem = NULL;
2461         int ret = 0;
2462
2463         if (mem_cgroup_disabled())
2464                 return 0;
2465
2466         pc = lookup_page_cgroup(page);
2467         lock_page_cgroup(pc);
2468         if (PageCgroupUsed(pc)) {
2469                 mem = pc->mem_cgroup;
2470                 css_get(&mem->css);
2471         }
2472         unlock_page_cgroup(pc);
2473
2474         *ptr = mem;
2475         if (mem) {
2476                 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
2477                 css_put(&mem->css);
2478         }
2479         return ret;
2480 }
2481
2482 /* remove redundant charge if migration failed */
2483 void mem_cgroup_end_migration(struct mem_cgroup *mem,
2484                 struct page *oldpage, struct page *newpage)
2485 {
2486         struct page *target, *unused;
2487         struct page_cgroup *pc;
2488         enum charge_type ctype;
2489
2490         if (!mem)
2491                 return;
2492         cgroup_exclude_rmdir(&mem->css);
2493         /* at migration success, oldpage->mapping is NULL. */
2494         if (oldpage->mapping) {
2495                 target = oldpage;
2496                 unused = NULL;
2497         } else {
2498                 target = newpage;
2499                 unused = oldpage;
2500         }
2501
2502         if (PageAnon(target))
2503                 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
2504         else if (page_is_file_cache(target))
2505                 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
2506         else
2507                 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2508
2509         /* unused page is not on radix-tree now. */
2510         if (unused)
2511                 __mem_cgroup_uncharge_common(unused, ctype);
2512
2513         pc = lookup_page_cgroup(target);
2514         /*
2515          * __mem_cgroup_commit_charge() checks the PCG_USED bit of the
2516          * page_cgroup, so double-counting is effectively avoided.
2517          */
2518         __mem_cgroup_commit_charge(mem, pc, ctype);
2519
2520         /*
2521          * Both oldpage and newpage are still under lock_page(), so we
2522          * don't have to care about races in the radix-tree. But we do
2523          * have to care about whether this page is mapped or not.
2524          *
2525          * There is a case for !page_mapped(): at the start of migration,
2526          * oldpage was mapped, but now it has been zapped. We do know the
2527          * *target* page is not freed/reused under us.
2528          * mem_cgroup_uncharge_page() does all the necessary checks.
2529          */
2530         if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
2531                 mem_cgroup_uncharge_page(target);
2532         /*
2533          * At migration, we may charge against a cgroup which has no tasks.
2534          * So, rmdir()->pre_destroy() can be called while we do this charge.
2535          * In that case, we need to call pre_destroy() again; check it here.
2536          */
2537         cgroup_release_and_wakeup_rmdir(&mem->css);
2538 }
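
/*
 * Migration charging in outline: mem_cgroup_prepare_migration() charges
 * the old page's memcg once more (without OOM) before the copy, and
 * mem_cgroup_end_migration() commits that charge to whichever page ends
 * up holding the data (newpage on success, oldpage on failure) while
 * uncharging the unused one. The rmdir exclusion keeps the memcg alive
 * even if it has no tasks while this is in progress.
 */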
2539
2540 /*
2541  * A call to try to shrink memory usage on charge failure at shmem's swapin.
2542  * Calling hierarchical_reclaim is not enough because we should update
2543  * last_oom_jiffies to prevent pagefault_out_of_memory from invoking a global OOM.
2544  * Moreover, considering the hierarchy, we should reclaim from the mem_over_limit,
2545  * not from the memcg which this page would be charged to.
2546  * try_charge_swapin does all of this work properly.
2547  */
2548 int mem_cgroup_shmem_charge_fallback(struct page *page,
2549                             struct mm_struct *mm,
2550                             gfp_t gfp_mask)
2551 {
2552         struct mem_cgroup *mem = NULL;
2553         int ret;
2554
2555         if (mem_cgroup_disabled())
2556                 return 0;
2557
2558         ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2559         if (!ret)
2560                 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
2561
2562         return ret;
2563 }
2564
2565 static DEFINE_MUTEX(set_limit_mutex);
2566
2567 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2568                                 unsigned long long val)
2569 {
2570         int retry_count;
2571         u64 memswlimit;
2572         int ret = 0;
2573         int children = mem_cgroup_count_children(memcg);
2574         u64 curusage, oldusage;
2575
2576         /*
2577          * To keep hierarchical_reclaim simple, how long we should retry
2578          * depends on the caller. We set our retry count to be a function
2579          * of the number of children we should visit in this loop.
2580          */
2581         retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
2582
2583         oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2584
2585         while (retry_count) {
2586                 if (signal_pending(current)) {
2587                         ret = -EINTR;
2588                         break;
2589                 }
2590                 /*
2591                  * Rather than hiding everything in some function, do this in
2592                  * an open-coded manner so you can see what it really does.
2593                  * We have to guarantee mem->res.limit <= mem->memsw.limit.
2594                  */
2595                 mutex_lock(&set_limit_mutex);
2596                 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
2597                 if (memswlimit < val) {
2598                         ret = -EINVAL;
2599                         mutex_unlock(&set_limit_mutex);
2600                         break;
2601                 }
2602                 ret = res_counter_set_limit(&memcg->res, val);
2603                 if (!ret) {
2604                         if (memswlimit == val)
2605                                 memcg->memsw_is_minimum = true;
2606                         else
2607                                 memcg->memsw_is_minimum = false;
2608                 }
2609                 mutex_unlock(&set_limit_mutex);
2610
2611                 if (!ret)
2612                         break;
2613
2614                 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2615                                                 MEM_CGROUP_RECLAIM_SHRINK);
2616                 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2617                 /* Usage is reduced ? */
2618                 if (curusage >= oldusage)
2619                         retry_count--;
2620                 else
2621                         oldusage = curusage;
2622         }
2623
2624         return ret;
2625 }
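
/*
 * The resize helpers above and below maintain res.limit <= memsw.limit
 * under set_limit_mutex: raising the memory limit above the memsw limit,
 * or lowering the memsw limit below the memory limit, fails with -EINVAL.
 * Each retry round runs hierarchical reclaim and only counts against
 * retry_count when usage did not go down.
 */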
2626
2627 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2628                                         unsigned long long val)
2629 {
2630         int retry_count;
2631         u64 memlimit, oldusage, curusage;
2632         int children = mem_cgroup_count_children(memcg);
2633         int ret = -EBUSY;
2634
2635         /* see mem_cgroup_resize_limit */
2636         retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
2637         oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
2638         while (retry_count) {
2639                 if (signal_pending(current)) {
2640                         ret = -EINTR;
2641                         break;
2642                 }
2643                 /*
2644                  * Rather than hiding everything in some function, do this in
2645                  * an open-coded manner so you can see what it really does.
2646                  * We have to guarantee mem->res.limit <= mem->memsw.limit.
2647                  */
2648                 mutex_lock(&set_limit_mutex);
2649                 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
2650                 if (memlimit > val) {
2651                         ret = -EINVAL;
2652                         mutex_unlock(&set_limit_mutex);
2653                         break;
2654                 }
2655                 ret = res_counter_set_limit(&memcg->memsw, val);
2656                 if (!ret) {
2657                         if (memlimit == val)
2658                                 memcg->memsw_is_minimum = true;
2659                         else
2660                                 memcg->memsw_is_minimum = false;
2661                 }
2662                 mutex_unlock(&set_limit_mutex);
2663
2664                 if (!ret)
2665                         break;
2666
2667                 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2668                                                 MEM_CGROUP_RECLAIM_NOSWAP |
2669                                                 MEM_CGROUP_RECLAIM_SHRINK);
2670                 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
2671                 /* Usage is reduced ? */
2672                 if (curusage >= oldusage)
2673                         retry_count--;
2674                 else
2675                         oldusage = curusage;
2676         }
2677         return ret;
2678 }
2679
2680 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2681                                                 gfp_t gfp_mask, int nid,
2682                                                 int zid)
2683 {
2684         unsigned long nr_reclaimed = 0;
2685         struct mem_cgroup_per_zone *mz, *next_mz = NULL;
2686         unsigned long reclaimed;
2687         int loop = 0;
2688         struct mem_cgroup_tree_per_zone *mctz;
2689         unsigned long long excess;
2690
2691         if (order > 0)
2692                 return 0;
2693
2694         mctz = soft_limit_tree_node_zone(nid, zid);
2695         /*
2696          * This loop can run for a while, especially if mem_cgroups
2697          * continuously keep exceeding their soft limit and putting the
2698          * system under pressure.
2699          */
2700         do {
2701                 if (next_mz)
2702                         mz = next_mz;
2703                 else
2704                         mz = mem_cgroup_largest_soft_limit_node(mctz);
2705                 if (!mz)
2706                         break;
2707
2708                 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
2709                                                 gfp_mask,
2710                                                 MEM_CGROUP_RECLAIM_SOFT);
2711                 nr_reclaimed += reclaimed;
2712                 spin_lock(&mctz->lock);
2713
2714                 /*
2715                  * If we failed to reclaim anything from this memory cgroup,
2716                  * it is time to move on to the next cgroup.
2717                  */
2718                 next_mz = NULL;
2719                 if (!reclaimed) {
2720                         do {
2721                                 /*
2722                                  * Loop until we find yet another one.
2723                                  *
2724                                  * By the time we get the soft_limit lock
2725                                  * again, someone might have added the
2726                                  * group back on the RB tree. Iterate to
2727                                  * make sure we get a different mem.
2728                                  * mem_cgroup_largest_soft_limit_node returns
2729                                  * NULL if no other cgroup is present on
2730                                  * the tree.
2731                                  */
2732                                 next_mz =
2733                                 __mem_cgroup_largest_soft_limit_node(mctz);
2734                                 if (next_mz == mz) {
2735                                         css_put(&next_mz->mem->css);
2736                                         next_mz = NULL;
2737                                 } else /* next_mz == NULL or other memcg */
2738                                         break;
2739                         } while (1);
2740                 }
2741                 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
2742                 excess = res_counter_soft_limit_excess(&mz->mem->res);
2743                 /*
2744                  * One school of thought says that we should not add
2745                  * the node back to the tree if reclaim returns 0.
2746                  * But our reclaim could return 0 simply because, due
2747                  * to priority, we are exposing a smaller subset of
2748                  * memory to reclaim from. Consider this as a longer
2749                  * term TODO.
2750                  */
2751                 /* If excess == 0, no tree ops */
2752                 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
2753                 spin_unlock(&mctz->lock);
2754                 css_put(&mz->mem->css);
2755                 loop++;
2756                 /*
2757                  * Could not reclaim anything and there are no more
2758                  * mem cgroups to try or we seem to be looping without
2759                  * reclaiming anything.
2760                  */
2761                 if (!nr_reclaimed &&
2762                         (next_mz == NULL ||
2763                         loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
2764                         break;
2765         } while (!nr_reclaimed);
2766         if (next_mz)
2767                 css_put(&next_mz->mem->css);
2768         return nr_reclaimed;
2769 }
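
/*
 * Soft limit reclaim above works off the per-node/zone RB-tree of memcgs
 * ordered by how far they exceed their soft limit: pick the worst
 * offender, run hierarchical reclaim against it, re-insert it keyed by
 * its remaining excess (skipping the insert when the excess is zero),
 * and repeat until something is reclaimed or the loop gives up.
 */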
2770
2771 /*
2772  * This routine traverses the page_cgroups in the given list and drops them all.
2773  * *And* this routine doesn't reclaim the pages themselves, it just removes the page_cgroups.
2774  */
2775 static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
2776                                 int node, int zid, enum lru_list lru)
2777 {
2778         struct zone *zone;
2779         struct mem_cgroup_per_zone *mz;
2780         struct page_cgroup *pc, *busy;
2781         unsigned long flags, loop;
2782         struct list_head *list;
2783         int ret = 0;
2784
2785         zone = &NODE_DATA(node)->node_zones[zid];
2786         mz = mem_cgroup_zoneinfo(mem, node, zid);
2787         list = &mz->lists[lru];
2788
2789         loop = MEM_CGROUP_ZSTAT(mz, lru);
2790         /* give some margin against EBUSY etc...*/
2791         loop += 256;
2792         busy = NULL;
2793         while (loop--) {
2794                 ret = 0;
2795                 spin_lock_irqsave(&zone->lru_lock, flags);
2796                 if (list_empty(list)) {
2797                         spin_unlock_irqrestore(&zone->lru_lock, flags);
2798                         break;
2799                 }
2800                 pc = list_entry(list->prev, struct page_cgroup, lru);
2801                 if (busy == pc) {
2802                         list_move(&pc->lru, list);
2803                         busy = NULL;
2804                         spin_unlock_irqrestore(&zone->lru_lock, flags);
2805                         continue;
2806                 }
2807                 spin_unlock_irqrestore(&zone->lru_lock, flags);
2808
2809                 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
2810                 if (ret == -ENOMEM)
2811                         break;
2812
2813                 if (ret == -EBUSY || ret == -EINVAL) {
2814                         /* found lock contention or "pc" is obsolete. */
2815                         busy = pc;
2816                         cond_resched();
2817                 } else
2818                         busy = NULL;
2819         }
2820
2821         if (!ret && !list_empty(list))
2822                 return -EBUSY;
2823         return ret;
2824 }
2825
2826 /*
2827  * Make the mem_cgroup's charge 0 if there is no task.
2828  * This enables deleting this mem_cgroup.
2829  */
2830 static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
2831 {
2832         int ret;
2833         int node, zid, shrink;
2834         int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2835         struct cgroup *cgrp = mem->css.cgroup;
2836
2837         css_get(&mem->css);
2838
2839         shrink = 0;
2840         /* should free all ? */
2841         if (free_all)
2842                 goto try_to_free;
2843 move_account:
2844         do {
2845                 ret = -EBUSY;
2846                 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
2847                         goto out;
2848                 ret = -EINTR;
2849                 if (signal_pending(current))
2850                         goto out;
2851                 /* This is for putting all *used* pages on the LRU. */
2852                 lru_add_drain_all();
2853                 drain_all_stock_sync();
2854                 ret = 0;
2855                 for_each_node_state(node, N_HIGH_MEMORY) {
2856                         for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
2857                                 enum lru_list l;
2858                                 for_each_lru(l) {
2859                                         ret = mem_cgroup_force_empty_list(mem,
2860                                                         node, zid, l);
2861                                         if (ret)
2862                                                 break;
2863                                 }
2864                         }
2865                         if (ret)
2866                                 break;
2867                 }
2868                 /* it seems parent cgroup doesn't have enough mem */
2869                 if (ret == -ENOMEM)
2870                         goto try_to_free;
2871                 cond_resched();
2872         /* "ret" should also be checked to ensure all lists are empty. */
2873         } while (mem->res.usage > 0 || ret);
2874 out:
2875         css_put(&mem->css);
2876         return ret;
2877
2878 try_to_free:
2879         /* returns EBUSY if there is a task or if we come here twice. */
2880         if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
2881                 ret = -EBUSY;
2882                 goto out;
2883         }
2884         /* we call try-to-free pages to make this cgroup empty */
2885         lru_add_drain_all();
2886         /* try to free all pages in this cgroup */
2887         shrink = 1;
2888         while (nr_retries && mem->res.usage > 0) {
2889                 int progress;
2890
2891                 if (signal_pending(current)) {
2892                         ret = -EINTR;
2893                         goto out;
2894                 }
2895                 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
2896                                                 false, get_swappiness(mem));
2897                 if (!progress) {
2898                         nr_retries--;
2899                         /* maybe some writeback is necessary */
2900                         congestion_wait(BLK_RW_ASYNC, HZ/10);
2901                 }
2902
2903         }
2904         lru_add_drain();
2905         /* try move_account...there may be some *locked* pages. */
2906         goto move_account;
2907 }
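
/*
 * force_empty in outline: with no tasks and no children, every used page
 * is first moved to the parent via mem_cgroup_move_parent() (the
 * move_account loop above); if the parent runs out of memory, or on the
 * free_all path, it falls back to try_to_free_mem_cgroup_pages() until
 * res.usage drops to zero or the retries are exhausted.
 */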
2908
2909 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
2910 {
2911         return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
2912 }
2913
2914
2915 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
2916 {
2917         return mem_cgroup_from_cont(cont)->use_hierarchy;
2918 }
2919
2920 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
2921                                         u64 val)
2922 {
2923         int retval = 0;
2924         struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2925         struct cgroup *parent = cont->parent;
2926         struct mem_cgroup *parent_mem = NULL;
2927
2928         if (parent)
2929                 parent_mem = mem_cgroup_from_cont(parent);
2930
2931         cgroup_lock();
2932         /*
2933          * If parent's use_hierarchy is set, we can't make any modifications
2934          * in the child subtrees. If it is unset, then the change can
2935          * occur, provided the current cgroup has no children.
2936          *
2937          * For the root cgroup, parent_mem is NULL; we allow the value to be
2938          * set if there are no children.
2939          */
2940         if ((!parent_mem || !parent_mem->use_hierarchy) &&
2941                                 (val == 1 || val == 0)) {
2942                 if (list_empty(&cont->children))
2943                         mem->use_hierarchy = val;
2944                 else
2945                         retval = -EBUSY;
2946         } else
2947                 retval = -EINVAL;
2948         cgroup_unlock();
2949
2950         return retval;
2951 }
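
/*
 * Editorial note: from userspace this pair backs memory.use_hierarchy.
 * Writing "1" before any child groups exist makes later children account
 * into this group's res_counters as well; once children exist the write
 * fails with -EBUSY, and it fails with -EINVAL if the parent already has
 * use_hierarchy set or if the value is anything other than 0 or 1.
 */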
2952
2953 struct mem_cgroup_idx_data {
2954         s64 val;
2955         enum mem_cgroup_stat_index idx;
2956 };
2957
2958 static int
2959 mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
2960 {
2961         struct mem_cgroup_idx_data *d = data;
2962         d->val += mem_cgroup_read_stat(mem, d->idx);
2963         return 0;
2964 }
2965
2966 static void
2967 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
2968                                 enum mem_cgroup_stat_index idx, s64 *val)
2969 {
2970         struct mem_cgroup_idx_data d;
2971         d.idx = idx;
2972         d.val = 0;
2973         mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
2974         *val = d.val;
2975 }
2976
2977 static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
2978 {
2979         u64 idx_val, val;
2980
2981         if (!mem_cgroup_is_root(mem)) {
2982                 if (!swap)
2983                         return res_counter_read_u64(&mem->res, RES_USAGE);
2984                 else
2985                         return res_counter_read_u64(&mem->memsw, RES_USAGE);
2986         }
2987
2988         mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val);
2989         val = idx_val;
2990         mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val);
2991         val += idx_val;
2992
2993         if (swap) {
2994                 mem_cgroup_get_recursive_idx_stat(mem,
2995                                 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
2996                 val += idx_val;
2997         }
2998
2999         return val << PAGE_SHIFT;
3000 }
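
/*
 * Editorial note: for the root cgroup, usage is reconstructed from the
 * per-cpu statistics rather than read from its res_counter.  The counters
 * are kept in pages, hence the final "<< PAGE_SHIFT": with 4KiB pages,
 * 300 pages of cache plus 700 pages of rss is reported as
 * (300 + 700) << 12 = 4,096,000 bytes; with swap accounting enabled,
 * swapped-out pages are added the same way for the memsw variant.
 */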
3001
3002 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3003 {
3004         struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3005         u64 val;
3006         int type, name;
3007
3008         type = MEMFILE_TYPE(cft->private);
3009         name = MEMFILE_ATTR(cft->private);
3010         switch (type) {
3011         case _MEM:
3012                 if (name == RES_USAGE)
3013                         val = mem_cgroup_usage(mem, false);
3014                 else
3015                         val = res_counter_read_u64(&mem->res, name);
3016                 break;
3017         case _MEMSWAP:
3018                 if (name == RES_USAGE)
3019                         val = mem_cgroup_usage(mem, true);
3020                 else
3021                         val = res_counter_read_u64(&mem->memsw, name);
3022                 break;
3023         default:
3024                 BUG();
3025                 break;
3026         }
3027         return val;
3028 }
3029 /*
3030  * The users of this function are the RES_LIMIT and RES_SOFT_LIMIT
3031  * control files.
3032  */
3033 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3034                             const char *buffer)
3035 {
3036         struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3037         int type, name;
3038         unsigned long long val;
3039         int ret;
3040
3041         type = MEMFILE_TYPE(cft->private);
3042         name = MEMFILE_ATTR(cft->private);
3043         switch (name) {
3044         case RES_LIMIT:
3045                 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
3046                         ret = -EINVAL;
3047                         break;
3048                 }
3049                 /* This function does all the necessary parsing; reuse it */
3050                 ret = res_counter_memparse_write_strategy(buffer, &val);
3051                 if (ret)
3052                         break;
3053                 if (type == _MEM)
3054                         ret = mem_cgroup_resize_limit(memcg, val);
3055                 else
3056                         ret = mem_cgroup_resize_memsw_limit(memcg, val);
3057                 break;
3058         case RES_SOFT_LIMIT:
3059                 ret = res_counter_memparse_write_strategy(buffer, &val);
3060                 if (ret)
3061                         break;
3062                 /*
3063                  * For memsw, soft limits are hard to implement in terms
3064                  * of semantics. For now, we only support soft limits for
3065                  * memory, i.e. control without swap.
3066                  */
3067                 if (type == _MEM)
3068                         ret = res_counter_set_soft_limit(&memcg->res, val);
3069                 else
3070                         ret = -EINVAL;
3071                 break;
3072         default:
3073                 ret = -EINVAL; /* should be BUG() ? */
3074                 break;
3075         }
3076         return ret;
3077 }
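
/*
 * Editorial illustration (not part of the original file): because the limit
 * strings go through res_counter_memparse_write_strategy(), suffixed values
 * such as "512M" or "1G" and "-1" (unlimited) are accepted.  A sketch; the
 * paths used by the caller are assumptions.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int memcg_write_string(const char *file, const char *value)
{
        int fd = open(file, O_WRONLY);

        if (fd < 0)
                return -1;
        if (write(fd, value, strlen(value)) != (ssize_t)strlen(value)) {
                close(fd);
                return -1;
        }
        return close(fd);
}

/*
 * e.g. memcg_write_string("/cgroups/memory/A/memory.limit_in_bytes", "512M");
 *      memcg_write_string("/cgroups/memory/A/memory.soft_limit_in_bytes", "256M");
 *      memcg_write_string("/cgroups/memory/A/memory.limit_in_bytes", "-1");
 */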
3078
3079 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
3080                 unsigned long long *mem_limit, unsigned long long *memsw_limit)
3081 {
3082         struct cgroup *cgroup;
3083         unsigned long long min_limit, min_memsw_limit, tmp;
3084
3085         min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3086         min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3087         cgroup = memcg->css.cgroup;
3088         if (!memcg->use_hierarchy)
3089                 goto out;
3090
3091         while (cgroup->parent) {
3092                 cgroup = cgroup->parent;
3093                 memcg = mem_cgroup_from_cont(cgroup);
3094                 if (!memcg->use_hierarchy)
3095                         break;
3096                 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
3097                 min_limit = min(min_limit, tmp);
3098                 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3099                 min_memsw_limit = min(min_memsw_limit, tmp);
3100         }
3101 out:
3102         *mem_limit = min_limit;
3103         *memsw_limit = min_memsw_limit;
3104         return;
3105 }
3106
3107 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
3108 {
3109         struct mem_cgroup *mem;
3110         int type, name;
3111
3112         mem = mem_cgroup_from_cont(cont);
3113         type = MEMFILE_TYPE(event);
3114         name = MEMFILE_ATTR(event);
3115         switch (name) {
3116         case RES_MAX_USAGE:
3117                 if (type == _MEM)
3118                         res_counter_reset_max(&mem->res);
3119                 else
3120                         res_counter_reset_max(&mem->memsw);
3121                 break;
3122         case RES_FAILCNT:
3123                 if (type == _MEM)
3124                         res_counter_reset_failcnt(&mem->res);
3125                 else
3126                         res_counter_reset_failcnt(&mem->memsw);
3127                 break;
3128         }
3129
3130         return 0;
3131 }
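
/*
 * Editorial note: memory.max_usage_in_bytes, memory.failcnt and their memsw
 * counterparts are "trigger" files; writing any value (a single "0" by
 * convention) lands here and resets the recorded watermark or failure count
 * without touching current usage.
 */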
3132
3133 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
3134                                         struct cftype *cft)
3135 {
3136         return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
3137 }
3138
3139 #ifdef CONFIG_MMU
3140 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3141                                         struct cftype *cft, u64 val)
3142 {
3143         struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3144
3145         if (val >= (1 << NR_MOVE_TYPE))
3146                 return -EINVAL;
3147         /*
3148          * We check this value several times both in can_attach() and
3149          * attach(), so we need cgroup lock to prevent this value from being
3150          * inconsistent.
3151          */
3152         cgroup_lock();
3153         mem->move_charge_at_immigrate = val;
3154         cgroup_unlock();
3155
3156         return 0;
3157 }
3158 #else
3159 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3160                                         struct cftype *cft, u64 val)
3161 {
3162         return -ENOSYS;
3163 }
3164 #endif
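
/*
 * Editorial illustration (not part of the original file): in this version
 * only bit 0 (MOVE_CHARGE_TYPE_ANON) of move_charge_at_immigrate is defined,
 * so the meaningful values are 0 and 1; anything >= (1 << NR_MOVE_TYPE) is
 * rejected with -EINVAL.  A sketch, the path being an assumption:
 */
#include <stdio.h>

static int memcg_move_charge_on_attach(const char *group_dir, int enable)
{
        char path[4096];
        FILE *f;
        int ret;

        snprintf(path, sizeof(path), "%s/memory.move_charge_at_immigrate",
                 group_dir);
        f = fopen(path, "w");
        if (!f)
                return -1;
        /* bit 0: move charges of anonymous pages together with the task */
        ret = fprintf(f, "%d\n", enable ? 1 : 0) < 0 ? -1 : 0;
        if (fclose(f))
                ret = -1;
        return ret;
}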
3165
3166
3167 /* For read statistics */
3168 enum {
3169         MCS_CACHE,
3170         MCS_RSS,
3171         MCS_FILE_MAPPED,
3172         MCS_PGPGIN,
3173         MCS_PGPGOUT,
3174         MCS_SWAP,
3175         MCS_INACTIVE_ANON,
3176         MCS_ACTIVE_ANON,
3177         MCS_INACTIVE_FILE,
3178         MCS_ACTIVE_FILE,
3179         MCS_UNEVICTABLE,
3180         NR_MCS_STAT,
3181 };
3182
3183 struct mcs_total_stat {
3184         s64 stat[NR_MCS_STAT];
3185 };
3186
3187 struct {
3188         char *local_name;
3189         char *total_name;
3190 } memcg_stat_strings[NR_MCS_STAT] = {
3191         {"cache", "total_cache"},
3192         {"rss", "total_rss"},
3193         {"mapped_file", "total_mapped_file"},
3194         {"pgpgin", "total_pgpgin"},
3195         {"pgpgout", "total_pgpgout"},
3196         {"swap", "total_swap"},
3197         {"inactive_anon", "total_inactive_anon"},
3198         {"active_anon", "total_active_anon"},
3199         {"inactive_file", "total_inactive_file"},
3200         {"active_file", "total_active_file"},
3201         {"unevictable", "total_unevictable"}
3202 };
3203
3204
3205 static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
3206 {
3207         struct mcs_total_stat *s = data;
3208         s64 val;
3209
3210         /* per cpu stat */
3211         val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
3212         s->stat[MCS_CACHE] += val * PAGE_SIZE;
3213         val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
3214         s->stat[MCS_RSS] += val * PAGE_SIZE;
3215         val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
3216         s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
3217         val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
3218         s->stat[MCS_PGPGIN] += val;
3219         val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
3220         s->stat[MCS_PGPGOUT] += val;
3221         if (do_swap_account) {
3222                 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3223                 s->stat[MCS_SWAP] += val * PAGE_SIZE;
3224         }
3225
3226         /* per zone stat */
3227         val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
3228         s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
3229         val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
3230         s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
3231         val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
3232         s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
3233         val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
3234         s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
3235         val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
3236         s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
3237         return 0;
3238 }
3239
3240 static void
3241 mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3242 {
3243         mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
3244 }
3245
3246 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
3247                                  struct cgroup_map_cb *cb)
3248 {
3249         struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
3250         struct mcs_total_stat mystat;
3251         int i;
3252
3253         memset(&mystat, 0, sizeof(mystat));
3254         mem_cgroup_get_local_stat(mem_cont, &mystat);
3255
3256         for (i = 0; i < NR_MCS_STAT; i++) {
3257                 if (i == MCS_SWAP && !do_swap_account)
3258                         continue;
3259                 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
3260         }
3261
3262         /* Hierarchical information */
3263         {
3264                 unsigned long long limit, memsw_limit;
3265                 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
3266                 cb->fill(cb, "hierarchical_memory_limit", limit);
3267                 if (do_swap_account)
3268                         cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
3269         }
3270
3271         memset(&mystat, 0, sizeof(mystat));
3272         mem_cgroup_get_total_stat(mem_cont, &mystat);
3273         for (i = 0; i < NR_MCS_STAT; i++) {
3274                 if (i == MCS_SWAP && !do_swap_account)
3275                         continue;
3276                 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
3277         }
3278
3279 #ifdef CONFIG_DEBUG_VM
3280         cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
3281
3282         {
3283                 int nid, zid;
3284                 struct mem_cgroup_per_zone *mz;
3285                 unsigned long recent_rotated[2] = {0, 0};
3286                 unsigned long recent_scanned[2] = {0, 0};
3287
3288                 for_each_online_node(nid)
3289                         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3290                                 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
3291
3292                                 recent_rotated[0] +=
3293                                         mz->reclaim_stat.recent_rotated[0];
3294                                 recent_rotated[1] +=
3295                                         mz->reclaim_stat.recent_rotated[1];
3296                                 recent_scanned[0] +=
3297                                         mz->reclaim_stat.recent_scanned[0];
3298                                 recent_scanned[1] +=
3299                                         mz->reclaim_stat.recent_scanned[1];
3300                         }
3301                 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
3302                 cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
3303                 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
3304                 cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
3305         }
3306 #endif
3307
3308         return 0;
3309 }
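
/*
 * Editorial illustration (not part of the original file): memory.stat is a
 * key/value map.  The plain keys above ("cache", "rss", ...) are local to
 * the group, the "total_*" keys also include the descendants walked by
 * mem_cgroup_get_total_stat(), and hierarchical_memory_limit is the
 * tightest limit along the parent chain.  A sketch that simply dumps it;
 * the path is an assumption.
 */
#include <stdio.h>

static void memcg_dump_stat(const char *group_dir)
{
        char path[4096], line[256];
        FILE *f;

        snprintf(path, sizeof(path), "%s/memory.stat", group_dir);
        f = fopen(path, "r");
        if (!f)
                return;
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);    /* e.g. "cache 4096000" */
        fclose(f);
}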
3310
3311 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
3312 {
3313         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3314
3315         return get_swappiness(memcg);
3316 }
3317
3318 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
3319                                        u64 val)
3320 {
3321         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3322         struct mem_cgroup *parent;
3323
3324         if (val > 100)
3325                 return -EINVAL;
3326
3327         if (cgrp->parent == NULL)
3328                 return -EINVAL;
3329
3330         parent = mem_cgroup_from_cont(cgrp->parent);
3331
3332         cgroup_lock();
3333
3334         /* If under hierarchy, only a hierarchy root with no children can set this value */
3335         if ((parent->use_hierarchy) ||
3336             (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
3337                 cgroup_unlock();
3338                 return -EINVAL;
3339         }
3340
3341         spin_lock(&memcg->reclaim_param_lock);
3342         memcg->swappiness = val;
3343         spin_unlock(&memcg->reclaim_param_lock);
3344
3345         cgroup_unlock();
3346
3347         return 0;
3348 }
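
/*
 * Editorial note: memory.swappiness takes 0..100 and overrides the global
 * swappiness for reclaim inside this group.  As checked above, the root
 * cgroup cannot be tuned, and under use_hierarchy only a hierarchy root
 * that still has no children may change the value.
 */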
3349
3350 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3351 {
3352         struct mem_cgroup_threshold_ary *t;
3353         u64 usage;
3354         int i;
3355
3356         rcu_read_lock();
3357         if (!swap)
3358                 t = rcu_dereference(memcg->thresholds);
3359         else
3360                 t = rcu_dereference(memcg->memsw_thresholds);
3361
3362         if (!t)
3363                 goto unlock;
3364
3365         usage = mem_cgroup_usage(memcg, swap);
3366
3367         /*
3368          * current_threshold points to the threshold just below usage.
3369          * If that is not the case, a threshold was crossed after the
3370          * last call of __mem_cgroup_threshold().
3371          */
3372         i = atomic_read(&t->current_threshold);
3373
3374         /*
3375          * Iterate backward over array of thresholds starting from
3376          * current_threshold and check if a threshold is crossed.
3377          * If none of the thresholds below usage is crossed, we read
3378          * only one element of the array here.
3379          */
3380         for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3381                 eventfd_signal(t->entries[i].eventfd, 1);
3382
3383         /* i = current_threshold + 1 */
3384         i++;
3385
3386         /*
3387          * Iterate forward over array of thresholds starting from
3388          * current_threshold+1 and check if a threshold is crossed.
3389          * If none of the thresholds above usage is crossed, we read
3390          * only one element of the array here.
3391          */
3392         for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3393                 eventfd_signal(t->entries[i].eventfd, 1);
3394
3395         /* Update current_threshold */
3396         atomic_set(&t->current_threshold, i - 1);
3397 unlock:
3398         rcu_read_unlock();
3399 }
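
/*
 * Editorial worked example: with registered thresholds {4M, 8M, 16M} and a
 * usage of 10M, current_threshold indexes the 8M entry.  If usage grows to
 * 20M, the forward loop signals the 16M eventfd and current_threshold moves
 * to the 16M entry; if usage then drops to 6M, the backward loop signals
 * the 16M and 8M eventfds and current_threshold falls back to the 4M entry.
 * Each eventfd is therefore signalled once per crossing of its threshold,
 * in either direction.
 */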
3400
3401 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3402 {
3403         __mem_cgroup_threshold(memcg, false);
3404         if (do_swap_account)
3405                 __mem_cgroup_threshold(memcg, true);
3406 }
3407
3408 static int compare_thresholds(const void *a, const void *b)
3409 {
3410         const struct mem_cgroup_threshold *_a = a;
3411         const struct mem_cgroup_threshold *_b = b;
3412
3413         /*
              * Compare explicitly: thresholds are u64, so returning their
              * difference truncated to int can report the wrong order once
              * two values differ by 2GiB or more.
              */
             if (_a->threshold > _b->threshold)
                     return 1;
             if (_a->threshold < _b->threshold)
                     return -1;
             return 0;
3414 }
3415
3416 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
3417 {
3418         struct mem_cgroup_eventfd_list *ev;
3419
3420         list_for_each_entry(ev, &mem->oom_notify, list)
3421                 eventfd_signal(ev->eventfd, 1);
3422         return 0;
3423 }
3424
3425 static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
3426 {
3427         mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb);
3428 }
3429
3430 static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
3431         struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
3432 {
3433         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3434         struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
3435         int type = MEMFILE_TYPE(cft->private);
3436         u64 threshold, usage;
3437         int size;
3438         int i, ret;
3439
3440         ret = res_counter_memparse_write_strategy(args, &threshold);
3441         if (ret)
3442                 return ret;
3443
3444         mutex_lock(&memcg->thresholds_lock);
3445         if (type == _MEM)
3446                 thresholds = memcg->thresholds;
3447         else if (type == _MEMSWAP)
3448                 thresholds = memcg->memsw_thresholds;
3449         else
3450                 BUG();
3451
3452         usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
3453
3454         /* Check if a threshold was crossed before adding a new one */
3455         if (thresholds)
3456                 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3457
3458         if (thresholds)
3459                 size = thresholds->size + 1;
3460         else
3461                 size = 1;
3462
3463         /* Allocate memory for new array of thresholds */
3464         thresholds_new = kmalloc(sizeof(*thresholds_new) +
3465                         size * sizeof(struct mem_cgroup_threshold),
3466                         GFP_KERNEL);
3467         if (!thresholds_new) {
3468                 ret = -ENOMEM;
3469                 goto unlock;
3470         }
3471         thresholds_new->size = size;
3472
3473         /* Copy thresholds (if any) to new array */
3474         if (thresholds)
3475                 memcpy(thresholds_new->entries, thresholds->entries,
3476                                 thresholds->size *
3477                                 sizeof(struct mem_cgroup_threshold));
3478         /* Add new threshold */
3479         thresholds_new->entries[size - 1].eventfd = eventfd;
3480         thresholds_new->entries[size - 1].threshold = threshold;
3481
3482         /* Sort thresholds. Registering a new threshold isn't time-critical */
3483         sort(thresholds_new->entries, size,
3484                         sizeof(struct mem_cgroup_threshold),
3485                         compare_thresholds, NULL);
3486
3487         /* Find current threshold */
3488         atomic_set(&thresholds_new->current_threshold, -1);
3489         for (i = 0; i < size; i++) {
3490                 if (thresholds_new->entries[i].threshold < usage) {
3491                         /*
3492                          * thresholds_new->current_threshold will not be used
3493                          * until rcu_assign_pointer(), so it's safe to increment
3494                          * it here.
3495                          */
3496                         atomic_inc(&thresholds_new->current_threshold);
3497                 }
3498         }
3499
3500         if (type == _MEM)
3501                 rcu_assign_pointer(memcg->thresholds, thresholds_new);
3502         else
3503                 rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new);
3504
3505         /* Make sure that nobody still uses the old array before we free it */
3506         synchronize_rcu();
3507
3508         kfree(thresholds);
3509 unlock:
3510         mutex_unlock(&memcg->thresholds_lock);
3511
3512         return ret;
3513 }
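
/*
 * Editorial illustration (not part of the original file): registration is
 * driven from userspace through cgroup.event_control, which wires an
 * eventfd to this handler.  A minimal sketch that blocks until usage of the
 * hypothetical group /cgroups/memory/A crosses 32M in either direction:
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
        char cmd[64];
        uint64_t ticks;
        int efd, ufd, cfd;

        efd = eventfd(0, 0);
        ufd = open("/cgroups/memory/A/memory.usage_in_bytes", O_RDONLY);
        cfd = open("/cgroups/memory/A/cgroup.event_control", O_WRONLY);
        if (efd < 0 || ufd < 0 || cfd < 0)
                return 1;

        /* Format: "<event_fd> <fd of memory.usage_in_bytes> <threshold>" */
        snprintf(cmd, sizeof(cmd), "%d %d 32M", efd, ufd);
        if (write(cfd, cmd, strlen(cmd)) < 0)
                return 1;

        /* Each threshold crossing adds one tick to the eventfd counter. */
        if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
                printf("usage crossed 32M (%llu event(s))\n",
                       (unsigned long long)ticks);
        return 0;
}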
3514
3515 static int mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
3516         struct cftype *cft, struct eventfd_ctx *eventfd)
3517 {
3518         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3519         struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
3520         int type = MEMFILE_TYPE(cft->private);
3521         u64 usage;
3522         int size = 0;
3523         int i, j, ret;
3524
3525         mutex_lock(&memcg->thresholds_lock);
3526         if (type == _MEM)
3527                 thresholds = memcg->thresholds;
3528         else if (type == _MEMSWAP)
3529                 thresholds = memcg->memsw_thresholds;
3530         else
3531                 BUG();
3532
3533         /*
3534          * Something went wrong if we are trying to unregister a threshold
3535          * when we don't have any thresholds
3536          */
3537         BUG_ON(!thresholds);
3538
3539         usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
3540
3541         /* Check if a threshold was crossed before removing */
3542         __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3543
3544         /* Calculate the new number of thresholds */
3545         for (i = 0; i < thresholds->size; i++) {
3546                 if (thresholds->entries[i].eventfd != eventfd)
3547                         size++;
3548         }
3549
3550         /* Set thresholds array to NULL if we don't have thresholds */
3551         if (!size) {
3552                 thresholds_new = NULL;
3553                 goto assign;
3554         }
3555
3556         /* Allocate memory for new array of thresholds */
3557         thresholds_new = kmalloc(sizeof(*thresholds_new) +
3558                         size * sizeof(struct mem_cgroup_threshold),
3559                         GFP_KERNEL);
3560         if (!thresholds_new) {
3561                 ret = -ENOMEM;
3562                 goto unlock;
3563         }
3564         thresholds_new->size = size;
3565
3566         /* Copy thresholds and find current threshold */
3567         atomic_set(&thresholds_new->current_threshold, -1);
3568         for (i = 0, j = 0; i < thresholds->size; i++) {
3569                 if (thresholds->entries[i].eventfd == eventfd)
3570                         continue;
3571
3572                 thresholds_new->entries[j] = thresholds->entries[i];
3573                 if (thresholds_new->entries[j].threshold < usage) {
3574                         /*
3575                          * thresholds_new->current_threshold will not be used
3576                          * until rcu_assign_pointer(), so it's safe to increment
3577                          * it here.
3578                          */
3579                         atomic_inc(&thresholds_new->current_threshold);
3580                 }
3581                 j++;
3582         }
3583
3584 assign:
3585         if (type == _MEM)
3586                 rcu_assign_pointer(memcg->thresholds, thresholds_new);
3587         else
3588                 rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new);
3589
3590         /* Make sure that nobody still uses the old array before we free it */
3591         synchronize_rcu();
3592
3593         kfree(thresholds);
3594 unlock:
3595         mutex_unlock(&memcg->thresholds_lock);
3596
3597         return ret;
3598 }
3599
3600 static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
3601         struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
3602 {
3603         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3604         struct mem_cgroup_eventfd_list *event;
3605         int type = MEMFILE_TYPE(cft->private);
3606
3607         BUG_ON(type != _OOM_TYPE);
3608         event = kmalloc(sizeof(*event), GFP_KERNEL);
3609         if (!event)
3610                 return -ENOMEM;
3611
3612         mutex_lock(&memcg_oom_mutex);
3613
3614         event->eventfd = eventfd;
3615         list_add(&event->list, &memcg->oom_notify);
3616
3617         /* already in OOM ? */
3618         if (atomic_read(&memcg->oom_lock))
3619                 eventfd_signal(eventfd, 1);
3620         mutex_unlock(&memcg_oom_mutex);
3621
3622         return 0;
3623 }
3624
3625 static int mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
3626         struct cftype *cft, struct eventfd_ctx *eventfd)
3627 {
3628         struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3629         struct mem_cgroup_eventfd_list *ev, *tmp;
3630         int type = MEMFILE_TYPE(cft->private);
3631
3632         BUG_ON(type != _OOM_TYPE);
3633
3634         mutex_lock(&memcg_oom_mutex);
3635
3636         list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
3637                 if (ev->eventfd == eventfd) {
3638                         list_del(&ev->list);
3639                         kfree(ev);
3640                 }
3641         }
3642
3643         mutex_unlock(&memcg_oom_mutex);
3644
3645         return 0;
3646 }
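
/*
 * Editorial illustration (not part of the original file): OOM notification,
 * the subject of this change, uses the same cgroup.event_control mechanism
 * with memory.oom_control as the control file and no extra argument.  If
 * the group is already under OOM the eventfd is signalled right away (the
 * oom_lock check above).  Paths are assumptions.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
        char cmd[32];
        uint64_t ticks;
        int efd, ofd, cfd;

        efd = eventfd(0, 0);
        ofd = open("/cgroups/memory/A/memory.oom_control", O_RDONLY);
        cfd = open("/cgroups/memory/A/cgroup.event_control", O_WRONLY);
        if (efd < 0 || ofd < 0 || cfd < 0)
                return 1;

        snprintf(cmd, sizeof(cmd), "%d %d", efd, ofd); /* "<event_fd> <oom_fd>" */
        if (write(cfd, cmd, strlen(cmd)) < 0)
                return 1;

        /* Blocks until the group hits its limit and enters OOM. */
        if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
                printf("memcg went OOM %llu time(s)\n",
                       (unsigned long long)ticks);
        return 0;
}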
3647
3648 static struct cftype mem_cgroup_files[] = {
3649         {
3650                 .name = "usage_in_bytes",
3651                 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
3652                 .read_u64 = mem_cgroup_read,
3653                 .register_event = mem_cgroup_usage_register_event,
3654                 .unregister_event = mem_cgroup_usage_unregister_event,
3655         },
3656         {
3657                 .name = "max_usage_in_bytes",
3658                 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
3659                 .trigger = mem_cgroup_reset,
3660                 .read_u64 = mem_cgroup_read,
3661         },
3662         {
3663                 .name = "limit_in_bytes",
3664                 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
3665                 .write_string = mem_cgroup_write,
3666                 .read_u64 = mem_cgroup_read,
3667         },
3668         {
3669                 .name = "soft_limit_in_bytes",
3670                 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
3671                 .write_string = mem_cgroup_write,
3672                 .read_u64 = mem_cgroup_read,
3673         },
3674         {
3675                 .name = "failcnt",
3676                 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
3677                 .trigger = mem_cgroup_reset,
3678                 .read_u64 = mem_cgroup_read,
3679         },
3680         {
3681                 .name = "stat",
3682                 .read_map = mem_control_stat_show,
3683         },
3684         {
3685                 .name = "force_empty",
3686                 .trigger = mem_cgroup_force_empty_write,
3687         },
3688         {
3689                 .name = "use_hierarchy",
3690                 .write_u64 = mem_cgroup_hierarchy_write,
3691                 .read_u64 = mem_cgroup_hierarchy_read,
3692         },
3693         {
3694                 .name = "swappiness",
3695                 .read_u64 = mem_cgroup_swappiness_read,
3696                 .write_u64 = mem_cgroup_swappiness_write,
3697         },
3698         {
3699                 .name = "move_charge_at_immigrate",
3700                 .read_u64 = mem_cgroup_move_charge_read,
3701                 .write_u64 = mem_cgroup_move_charge_write,
3702         },
3703         {
3704                 .name = "oom_control",
3705                 .register_event = mem_cgroup_oom_register_event,
3706                 .unregister_event = mem_cgroup_oom_unregister_event,
3707                 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
3708         },
3709 };
3710
3711 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
3712 static struct cftype memsw_cgroup_files[] = {
3713         {
3714                 .name = "memsw.usage_in_bytes",
3715                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
3716                 .read_u64 = mem_cgroup_read,
3717                 .register_event = mem_cgroup_usage_register_event,
3718                 .unregister_event = mem_cgroup_usage_unregister_event,
3719         },
3720         {
3721                 .name = "memsw.max_usage_in_bytes",
3722                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
3723                 .trigger = mem_cgroup_reset,
3724                 .read_u64 = mem_cgroup_read,
3725         },
3726         {
3727                 .name = "memsw.limit_in_bytes",
3728                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
3729                 .write_string = mem_cgroup_write,
3730                 .read_u64 = mem_cgroup_read,
3731         },
3732         {
3733                 .name = "memsw.failcnt",
3734                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
3735                 .trigger = mem_cgroup_reset,
3736                 .read_u64 = mem_cgroup_read,
3737         },
3738 };
3739
3740 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
3741 {
3742         if (!do_swap_account)
3743                 return 0;
3744         return cgroup_add_files(cont, ss, memsw_cgroup_files,
3745                                 ARRAY_SIZE(memsw_cgroup_files));
3746 };
3747 #else
3748 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
3749 {
3750         return 0;
3751 }
3752 #endif
3753
3754 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
3755 {
3756         struct mem_cgroup_per_node *pn;
3757         struct mem_cgroup_per_zone *mz;
3758         enum lru_list l;
3759         int zone, tmp = node;
3760         /*
3761          * This routine is called for each possible node.
3762          * But it is a BUG to call kmalloc() against an offline node.
3763          *
3764          * TODO: this routine can waste much memory for nodes which will
3765          *       never be onlined. It's better to use memory hotplug callback
3766          *       function.
3767          */
3768         if (!node_state(node, N_NORMAL_MEMORY))
3769                 tmp = -1;
3770         pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
3771         if (!pn)
3772                 return 1;
3773
3774         mem->info.nodeinfo[node] = pn;
3775         memset(pn, 0, sizeof(*pn));
3776
3777         for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3778                 mz = &pn->zoneinfo[zone];
3779                 for_each_lru(l)
3780                         INIT_LIST_HEAD(&mz->lists[l]);
3781                 mz->usage_in_excess = 0;
3782                 mz->on_tree = false;
3783                 mz->mem = mem;
3784         }
3785         return 0;
3786 }
3787
3788 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
3789 {
3790         kfree(mem->info.nodeinfo[node]);
3791 }
3792
3793 static struct mem_cgroup *mem_cgroup_alloc(void)
3794 {
3795         struct mem_cgroup *mem;
3796         int size = sizeof(struct mem_cgroup);
3797
3798         /* Can be very big if MAX_NUMNODES is very big */
3799         if (size < PAGE_SIZE)
3800                 mem = kmalloc(size, GFP_KERNEL);
3801         else
3802                 mem = vmalloc(size);
3803
3804         if (!mem)
3805                 return NULL;
3806
3807         memset(mem, 0, size);
3808         mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
3809         if (!mem->stat) {
3810                 if (size < PAGE_SIZE)
3811                         kfree(mem);
3812                 else
3813                         vfree(mem);
3814                 mem = NULL;
3815         }
3816         return mem;
3817 }
3818
3819 /*
3820  * When destroying a mem_cgroup, references from swap_cgroup can remain.
3821  * (scanning all at force_empty is too costly...)
3822  *
3823  * Instead of clearing all references at force_empty, we remember
3824  * the number of references from swap_cgroup and free mem_cgroup when
3825  * it goes down to 0.
3826  *
3827  * Removal of cgroup itself succeeds regardless of refs from swap.
3828  */
3829
3830 static void __mem_cgroup_free(struct mem_cgroup *mem)
3831 {
3832         int node;
3833
3834         mem_cgroup_remove_from_trees(mem);
3835         free_css_id(&mem_cgroup_subsys, &mem->css);
3836
3837         for_each_node_state(node, N_POSSIBLE)
3838                 free_mem_cgroup_per_zone_info(mem, node);
3839
3840         free_percpu(mem->stat);
3841         if (sizeof(struct mem_cgroup) < PAGE_SIZE)
3842                 kfree(mem);
3843         else
3844                 vfree(mem);
3845 }
3846
3847 static void mem_cgroup_get(struct mem_cgroup *mem)
3848 {
3849         atomic_inc(&mem->refcnt);
3850 }
3851
3852 static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
3853 {
3854         if (atomic_sub_and_test(count, &mem->refcnt)) {
3855                 struct mem_cgroup *parent = parent_mem_cgroup(mem);
3856                 __mem_cgroup_free(mem);
3857                 if (parent)
3858                         mem_cgroup_put(parent);
3859         }
3860 }
3861
3862 static void mem_cgroup_put(struct mem_cgroup *mem)
3863 {
3864         __mem_cgroup_put(mem, 1);
3865 }
3866
3867 /*
3868  * Returns the parent mem_cgroup in the memcg hierarchy when use_hierarchy is enabled.
3869  */
3870 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
3871 {
3872         if (!mem->res.parent)
3873                 return NULL;
3874         return mem_cgroup_from_res_counter(mem->res.parent, res);
3875 }
3876
3877 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
3878 static void __init enable_swap_cgroup(void)
3879 {
3880         if (!mem_cgroup_disabled() && really_do_swap_account)
3881                 do_swap_account = 1;
3882 }
3883 #else
3884 static void __init enable_swap_cgroup(void)
3885 {
3886 }
3887 #endif
3888
3889 static int mem_cgroup_soft_limit_tree_init(void)
3890 {
3891         struct mem_cgroup_tree_per_node *rtpn;
3892         struct mem_cgroup_tree_per_zone *rtpz;
3893         int tmp, node, zone;
3894
3895         for_each_node_state(node, N_POSSIBLE) {
3896                 tmp = node;
3897                 if (!node_state(node, N_NORMAL_MEMORY))
3898                         tmp = -1;
3899                 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
3900                 if (!rtpn)
3901                         return 1;
3902
3903                 soft_limit_tree.rb_tree_per_node[node] = rtpn;
3904
3905                 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3906                         rtpz = &rtpn->rb_tree_per_zone[zone];
3907                         rtpz->rb_root = RB_ROOT;
3908                         spin_lock_init(&rtpz->lock);
3909                 }
3910         }
3911         return 0;
3912 }
3913
3914 static struct cgroup_subsys_state * __ref
3915 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
3916 {
3917         struct mem_cgroup *mem, *parent;
3918         long error = -ENOMEM;
3919         int node;
3920
3921         mem = mem_cgroup_alloc();
3922         if (!mem)
3923                 return ERR_PTR(error);
3924
3925         for_each_node_state(node, N_POSSIBLE)
3926                 if (alloc_mem_cgroup_per_zone_info(mem, node))
3927                         goto free_out;
3928
3929         /* root ? */
3930         if (cont->parent == NULL) {
3931                 int cpu;
3932                 enable_swap_cgroup();
3933                 parent = NULL;
3934                 root_mem_cgroup = mem;
3935                 if (mem_cgroup_soft_limit_tree_init())
3936                         goto free_out;
3937                 for_each_possible_cpu(cpu) {
3938                         struct memcg_stock_pcp *stock =
3939                                                 &per_cpu(memcg_stock, cpu);
3940                         INIT_WORK(&stock->work, drain_local_stock);
3941                 }
3942                 hotcpu_notifier(memcg_stock_cpu_callback, 0);
3943         } else {
3944                 parent = mem_cgroup_from_cont(cont->parent);
3945                 mem->use_hierarchy = parent->use_hierarchy;
3946         }
3947
3948         if (parent && parent->use_hierarchy) {
3949                 res_counter_init(&mem->res, &parent->res);
3950                 res_counter_init(&mem->memsw, &parent->memsw);
3951                 /*
3952                  * We increment refcnt of the parent to ensure that we can
3953                  * safely access it on res_counter_charge/uncharge.
3954                  * This refcnt will be decremented when freeing this
3955                  * mem_cgroup(see mem_cgroup_put).
3956                  */
3957                 mem_cgroup_get(parent);
3958         } else {
3959                 res_counter_init(&mem->res, NULL);
3960                 res_counter_init(&mem->memsw, NULL);
3961         }
3962         mem->last_scanned_child = 0;
3963         spin_lock_init(&mem->reclaim_param_lock);
3964         INIT_LIST_HEAD(&mem->oom_notify);
3965
3966         if (parent)
3967                 mem->swappiness = get_swappiness(parent);
3968         atomic_set(&mem->refcnt, 1);
3969         mem->move_charge_at_immigrate = 0;
3970         mutex_init(&mem->thresholds_lock);
3971         return &mem->css;
3972 free_out:
3973         __mem_cgroup_free(mem);
3974         root_mem_cgroup = NULL;
3975         return ERR_PTR(error);
3976 }
3977
3978 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
3979                                         struct cgroup *cont)
3980 {
3981         struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3982
3983         return mem_cgroup_force_empty(mem, false);
3984 }
3985
3986 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
3987                                 struct cgroup *cont)
3988 {
3989         struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3990
3991         mem_cgroup_put(mem);
3992 }
3993
3994 static int mem_cgroup_populate(struct cgroup_subsys *ss,
3995                                 struct cgroup *cont)
3996 {
3997         int ret;
3998
3999         ret = cgroup_add_files(cont, ss, mem_cgroup_files,
4000                                 ARRAY_SIZE(mem_cgroup_files));
4001
4002         if (!ret)
4003                 ret = register_memsw_files(cont, ss);
4004         return ret;
4005 }
4006
4007 #ifdef CONFIG_MMU
4008 /* Handlers for move charge at task migration. */
4009 #define PRECHARGE_COUNT_AT_ONCE 256
4010 static int mem_cgroup_do_precharge(unsigned long count)
4011 {
4012         int ret = 0;
4013         int batch_count = PRECHARGE_COUNT_AT_ONCE;
4014         struct mem_cgroup *mem = mc.to;
4015
4016         if (mem_cgroup_is_root(mem)) {
4017                 mc.precharge += count;
4018                 /* we don't need css_get for root */
4019                 return ret;
4020         }
4021         /* try to charge at once */
4022         if (count > 1) {
4023                 struct res_counter *dummy;
4024                 /*
4025                  * "mem" cannot be under rmdir() because we've already checked
4026                  * by cgroup_lock_live_cgroup() that it is not removed and we
4027                  * are still under the same cgroup_mutex. So we can postpone
4028                  * css_get().
4029                  */
4030                 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
4031                         goto one_by_one;
4032                 if (do_swap_account && res_counter_charge(&mem->memsw,
4033                                                 PAGE_SIZE * count, &dummy)) {
4034                         res_counter_uncharge(&mem->res, PAGE_SIZE * count);
4035                         goto one_by_one;
4036                 }
4037                 mc.precharge += count;
4038                 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
4039                 WARN_ON_ONCE(count > INT_MAX);
4040                 __css_get(&mem->css, (int)count);
4041                 return ret;
4042         }
4043 one_by_one:
4044         /* fall back to one by one charge */
4045         while (count--) {
4046                 if (signal_pending(current)) {
4047                         ret = -EINTR;
4048                         break;
4049                 }
4050                 if (!batch_count--) {
4051                         batch_count = PRECHARGE_COUNT_AT_ONCE;
4052                         cond_resched();
4053                 }
4054                 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
4055                 if (ret || !mem)
4056                         /* mem_cgroup_clear_mc() will do uncharge later */
4057                         return -ENOMEM;
4058                 mc.precharge++;
4059         }
4060         return ret;
4061 }
4062
4063 /**
4064  * is_target_pte_for_mc - check whether a pte is a valid target for move charge
4065  * @vma: the vma to which the pte to be checked belongs
4066  * @addr: the address corresponding to the pte to be checked
4067  * @ptent: the pte to be checked
4068  * @target: the pointer where the target page or swap entry is stored (can be NULL)
4069  *
4070  * Returns
4071  *   0 (MC_TARGET_NONE): if the pte is not a target for move charge.
4072  *   1 (MC_TARGET_PAGE): if the page corresponding to this pte is a target for
4073  *     move charge. If @target is not NULL, the page is stored in target->page
4074  *     with an extra refcount taken (callers must handle it).
4075  *   2 (MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
4076  *     target for charge migration. If @target is not NULL, the entry is stored
4077  *     in target->ent.
4078  *
4079  * Called with pte lock held.
4080  */
4081 union mc_target {
4082         struct page     *page;
4083         swp_entry_t     ent;
4084 };
4085
4086 enum mc_target_type {
4087         MC_TARGET_NONE, /* not used */
4088         MC_TARGET_PAGE,
4089         MC_TARGET_SWAP,
4090 };
4091
4092 static int is_target_pte_for_mc(struct vm_area_struct *vma,
4093                 unsigned long addr, pte_t ptent, union mc_target *target)
4094 {
4095         struct page *page = NULL;
4096         struct page_cgroup *pc;
4097         int ret = 0;
4098         swp_entry_t ent = { .val = 0 };
4099         int usage_count = 0;
4100         bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON,
4101                                         &mc.to->move_charge_at_immigrate);
4102
4103         if (!pte_present(ptent)) {
4104                 /* TODO: handle swap of shmem/tmpfs */
4105                 if (pte_none(ptent) || pte_file(ptent))
4106                         return 0;
4107                 else if (is_swap_pte(ptent)) {
4108                         ent = pte_to_swp_entry(ptent);
4109                         if (!move_anon || non_swap_entry(ent))
4110                                 return 0;
4111                         usage_count = mem_cgroup_count_swap_user(ent, &page);
4112                 }
4113         } else {
4114                 page = vm_normal_page(vma, addr, ptent);
4115                 if (!page || !page_mapped(page))
4116                         return 0;
4117                 /*
4118                  * TODO: We don't move charges of file(including shmem/tmpfs)
4119                  * pages for now.
4120                  */
4121                 if (!move_anon || !PageAnon(page))
4122                         return 0;
4123                 if (!get_page_unless_zero(page))
4124                         return 0;
4125                 usage_count = page_mapcount(page);
4126         }
4127         if (usage_count > 1) {
4128                 /*
4129                  * TODO: We don't move charges of shared(used by multiple
4130                  * processes) pages for now.
4131                  */
4132                 if (page)
4133                         put_page(page);
4134                 return 0;
4135         }
4136         if (page) {
4137                 pc = lookup_page_cgroup(page);
4138                 /*
4139                  * Do only loose check w/o page_cgroup lock.
4140                  * mem_cgroup_move_account() checks the pc is valid or not under
4141                  * the lock.
4142                  */
4143                 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
4144                         ret = MC_TARGET_PAGE;
4145                         if (target)
4146                                 target->page = page;
4147                 }
4148                 if (!ret || !target)
4149                         put_page(page);
4150         }
4151         /* fall through to the swap entry check */
4152         if (ent.val && do_swap_account && !ret &&
4153                         css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
4154                 ret = MC_TARGET_SWAP;
4155                 if (target)
4156                         target->ent = ent;
4157         }
4158         return ret;
4159 }
4160
4161 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4162                                         unsigned long addr, unsigned long end,
4163                                         struct mm_walk *walk)
4164 {
4165         struct vm_area_struct *vma = walk->private;
4166         pte_t *pte;
4167         spinlock_t *ptl;
4168
4169         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4170         for (; addr != end; pte++, addr += PAGE_SIZE)
4171                 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
4172                         mc.precharge++; /* increment precharge temporarily */
4173         pte_unmap_unlock(pte - 1, ptl);
4174         cond_resched();
4175
4176         return 0;
4177 }
4178
4179 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4180 {
4181         unsigned long precharge;
4182         struct vm_area_struct *vma;
4183
4184         down_read(&mm->mmap_sem);
4185         for (vma = mm->mmap; vma; vma = vma->vm_next) {
4186                 struct mm_walk mem_cgroup_count_precharge_walk = {
4187                         .pmd_entry = mem_cgroup_count_precharge_pte_range,
4188                         .mm = mm,
4189                         .private = vma,
4190                 };
4191                 if (is_vm_hugetlb_page(vma))
4192                         continue;
4193                 /* TODO: We don't move charges of shmem/tmpfs pages for now. */
4194                 if (vma->vm_flags & VM_SHARED)
4195                         continue;
4196                 walk_page_range(vma->vm_start, vma->vm_end,
4197                                         &mem_cgroup_count_precharge_walk);
4198         }
4199         up_read(&mm->mmap_sem);
4200
4201         precharge = mc.precharge;
4202         mc.precharge = 0;
4203
4204         return precharge;
4205 }
4206
4207 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4208 {
4209         return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
4210 }
4211
4212 static void mem_cgroup_clear_mc(void)
4213 {
4214         /* we must uncharge all the leftover precharges from mc.to */
4215         if (mc.precharge) {
4216                 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
4217                 mc.precharge = 0;
4218         }
4219         /*
4220          * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
4221          * we must uncharge here.
4222          */
4223         if (mc.moved_charge) {
4224                 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
4225                 mc.moved_charge = 0;
4226         }
4227         /* we must fixup refcnts and charges */
4228         if (mc.moved_swap) {
4229                 WARN_ON_ONCE(mc.moved_swap > INT_MAX);
4230                 /* uncharge swap account from the old cgroup */
4231                 if (!mem_cgroup_is_root(mc.from))
4232                         res_counter_uncharge(&mc.from->memsw,
4233                                                 PAGE_SIZE * mc.moved_swap);
4234                 __mem_cgroup_put(mc.from, mc.moved_swap);
4235
4236                 if (!mem_cgroup_is_root(mc.to)) {
4237                         /*
4238                          * we charged both to->res and to->memsw, so we should
4239                          * uncharge to->res.
4240                          */
4241                         res_counter_uncharge(&mc.to->res,
4242                                                 PAGE_SIZE * mc.moved_swap);
4243                         VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
4244                         __css_put(&mc.to->css, mc.moved_swap);
4245                 }
4246                 /* we've already done mem_cgroup_get(mc.to) */
4247
4248                 mc.moved_swap = 0;
4249         }
4250         mc.from = NULL;
4251         mc.to = NULL;
4252         mc.moving_task = NULL;
4253         wake_up_all(&mc.waitq);
4254 }
4255
4256 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4257                                 struct cgroup *cgroup,
4258                                 struct task_struct *p,
4259                                 bool threadgroup)
4260 {
4261         int ret = 0;
4262         struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
4263
4264         if (mem->move_charge_at_immigrate) {
4265                 struct mm_struct *mm;
4266                 struct mem_cgroup *from = mem_cgroup_from_task(p);
4267
4268                 VM_BUG_ON(from == mem);
4269
4270                 mm = get_task_mm(p);
4271                 if (!mm)
4272                         return 0;
4273                 /* We move charges only when we move an owner of the mm */
4274                 if (mm->owner == p) {
4275                         VM_BUG_ON(mc.from);
4276                         VM_BUG_ON(mc.to);
4277                         VM_BUG_ON(mc.precharge);
4278                         VM_BUG_ON(mc.moved_charge);
4279                         VM_BUG_ON(mc.moved_swap);
4280                         VM_BUG_ON(mc.moving_task);
4281                         mc.from = from;
4282                         mc.to = mem;
4283                         mc.precharge = 0;
4284                         mc.moved_charge = 0;
4285                         mc.moved_swap = 0;
4286                         mc.moving_task = current;
4287
4288                         ret = mem_cgroup_precharge_mc(mm);
4289                         if (ret)
4290                                 mem_cgroup_clear_mc();
4291                 }
4292                 mmput(mm);
4293         }
4294         return ret;
4295 }
4296
4297 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4298                                 struct cgroup *cgroup,
4299                                 struct task_struct *p,
4300                                 bool threadgroup)
4301 {
4302         mem_cgroup_clear_mc();
4303 }
4304
4305 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4306                                 unsigned long addr, unsigned long end,
4307                                 struct mm_walk *walk)
4308 {
4309         int ret = 0;
4310         struct vm_area_struct *vma = walk->private;
4311         pte_t *pte;
4312         spinlock_t *ptl;
4313
4314 retry:
4315         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4316         for (; addr != end; addr += PAGE_SIZE) {
4317                 pte_t ptent = *(pte++);
4318                 union mc_target target;
4319                 int type;
4320                 struct page *page;
4321                 struct page_cgroup *pc;
4322                 swp_entry_t ent;
4323
4324                 if (!mc.precharge)
4325                         break;
4326
4327                 type = is_target_pte_for_mc(vma, addr, ptent, &target);
4328                 switch (type) {
4329                 case MC_TARGET_PAGE:
4330                         page = target.page;
4331                         if (isolate_lru_page(page))
4332                                 goto put;
4333                         pc = lookup_page_cgroup(page);
4334                         if (!mem_cgroup_move_account(pc,
4335                                                 mc.from, mc.to, false)) {
4336                                 mc.precharge--;
4337                                 /* we uncharge from mc.from later. */
4338                                 mc.moved_charge++;
4339                         }
4340                         putback_lru_page(page);
4341 put:                    /* is_target_pte_for_mc() gets the page */
4342                         put_page(page);
4343                         break;
4344                 case MC_TARGET_SWAP:
4345                         ent = target.ent;
4346                         if (!mem_cgroup_move_swap_account(ent,
4347                                                 mc.from, mc.to, false)) {
4348                                 mc.precharge--;
4349                                 /* we fixup refcnts and charges later. */
4350                                 mc.moved_swap++;
4351                         }
4352                         break;
4353                 default:
4354                         break;
4355                 }
4356         }
4357         pte_unmap_unlock(pte - 1, ptl);
4358         cond_resched();
4359
4360         if (addr != end) {
4361                 /*
4362                  * We have consumed all the precharges we got in can_attach().
4363                  * We try to charge one by one, but don't do any additional
4364                  * charges to mc.to if we have failed to charge once in the
4365                  * attach() phase.
4366                  */
4367                 ret = mem_cgroup_do_precharge(1);
4368                 if (!ret)
4369                         goto retry;
4370         }
4371
4372         return ret;
4373 }
4374
4375 static void mem_cgroup_move_charge(struct mm_struct *mm)
4376 {
4377         struct vm_area_struct *vma;
4378
4379         lru_add_drain_all();
4380         down_read(&mm->mmap_sem);
4381         for (vma = mm->mmap; vma; vma = vma->vm_next) {
4382                 int ret;
4383                 struct mm_walk mem_cgroup_move_charge_walk = {
4384                         .pmd_entry = mem_cgroup_move_charge_pte_range,
4385                         .mm = mm,
4386                         .private = vma,
4387                 };
4388                 if (is_vm_hugetlb_page(vma))
4389                         continue;
4390                 /* TODO: We don't move charges of shmem/tmpfs pages for now. */
4391                 if (vma->vm_flags & VM_SHARED)
4392                         continue;
4393                 ret = walk_page_range(vma->vm_start, vma->vm_end,
4394                                                 &mem_cgroup_move_charge_walk);
4395                 if (ret)
4396                         /*
4397                          * This means we have consumed all precharges and failed
4398                          * to do an additional charge. Just abandon here.
4399                          */
4400                         break;
4401         }
4402         up_read(&mm->mmap_sem);
4403 }
4404
4405 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4406                                 struct cgroup *cont,
4407                                 struct cgroup *old_cont,
4408                                 struct task_struct *p,
4409                                 bool threadgroup)
4410 {
4411         struct mm_struct *mm;
4412
4413         if (!mc.to)
4414                 /* no need to move charge */
4415                 return;
4416
4417         mm = get_task_mm(p);
4418         if (mm) {
4419                 mem_cgroup_move_charge(mm);
4420                 mmput(mm);
4421         }
4422         mem_cgroup_clear_mc();
4423 }
4424 #else   /* !CONFIG_MMU */
4425 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4426                                 struct cgroup *cgroup,
4427                                 struct task_struct *p,
4428                                 bool threadgroup)
4429 {
4430         return 0;
4431 }
4432 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4433                                 struct cgroup *cgroup,
4434                                 struct task_struct *p,
4435                                 bool threadgroup)
4436 {
4437 }
4438 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4439                                 struct cgroup *cont,
4440                                 struct cgroup *old_cont,
4441                                 struct task_struct *p,
4442                                 bool threadgroup)
4443 {
4444 }
4445 #endif
4446
4447 struct cgroup_subsys mem_cgroup_subsys = {
4448         .name = "memory",
4449         .subsys_id = mem_cgroup_subsys_id,
4450         .create = mem_cgroup_create,
4451         .pre_destroy = mem_cgroup_pre_destroy,
4452         .destroy = mem_cgroup_destroy,
4453         .populate = mem_cgroup_populate,
4454         .can_attach = mem_cgroup_can_attach,
4455         .cancel_attach = mem_cgroup_cancel_attach,
4456         .attach = mem_cgroup_move_task,
4457         .early_init = 0,
4458         .use_id = 1,
4459 };
4460
4461 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4462
4463 static int __init disable_swap_account(char *s)
4464 {
4465         really_do_swap_account = 0;
4466         return 1;
4467 }
4468 __setup("noswapaccount", disable_swap_account);
4469 #endif
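
/*
 * Editorial note: with CONFIG_CGROUP_MEM_RES_CTLR_SWAP, swap accounting is
 * on by default; booting with "noswapaccount" on the kernel command line
 * clears really_do_swap_account, so do_swap_account stays 0 and the
 * memsw.* files are never registered (see register_memsw_files() above).
 */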