mm/memcontrol.c (git blame at commit "memcg: add mem_cgroup_zone_nr_pages()")
1/* memcontrol.c - Memory Controller
2 *
3 * Copyright IBM Corporation, 2007
4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5 *
78fb7466
PE
6 * Copyright 2007 OpenVZ SWsoft Inc
7 * Author: Pavel Emelianov <xemul@openvz.org>
8 *
8cdea7c0
BS
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 */
19
20#include <linux/res_counter.h>
21#include <linux/memcontrol.h>
22#include <linux/cgroup.h>
78fb7466 23#include <linux/mm.h>
d13d1443 24#include <linux/pagemap.h>
d52aa412 25#include <linux/smp.h>
8a9f3ccd 26#include <linux/page-flags.h>
66e1707b 27#include <linux/backing-dev.h>
8a9f3ccd
BS
28#include <linux/bit_spinlock.h>
29#include <linux/rcupdate.h>
8c7c6e34 30#include <linux/mutex.h>
b6ac57d5 31#include <linux/slab.h>
66e1707b
BS
32#include <linux/swap.h>
33#include <linux/spinlock.h>
34#include <linux/fs.h>
d2ceb9b7 35#include <linux/seq_file.h>
33327948 36#include <linux/vmalloc.h>
b69408e8 37#include <linux/mm_inline.h>
52d4b9ac 38#include <linux/page_cgroup.h>
08e552c6 39#include "internal.h"
8cdea7c0 40
8697d331
BS
41#include <asm/uaccess.h>
42
a181b0e8 43struct cgroup_subsys mem_cgroup_subsys __read_mostly;
a181b0e8 44#define MEM_CGROUP_RECLAIM_RETRIES 5
8cdea7c0 45
c077719b
KH
46#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
47/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
48int do_swap_account __read_mostly;
49static int really_do_swap_account __initdata = 1; /* to remember the boot option */
50#else
51#define do_swap_account (0)
52#endif
53
54
d52aa412
KH
55/*
56 * Statistics for memory cgroup.
57 */
58enum mem_cgroup_stat_index {
59 /*
60 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
61 */
62 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
63 MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */
55e462b0
BR
64 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
65 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
d52aa412
KH
66
67 MEM_CGROUP_STAT_NSTATS,
68};
69
70struct mem_cgroup_stat_cpu {
71 s64 count[MEM_CGROUP_STAT_NSTATS];
72} ____cacheline_aligned_in_smp;
73
74struct mem_cgroup_stat {
c8dad2bb 75 struct mem_cgroup_stat_cpu cpustat[0];
d52aa412
KH
76};
77
78/*
79 * For accounting with irqs disabled, there is no need to increment the preempt count.
80 */
addb9efe 81static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
d52aa412
KH
82 enum mem_cgroup_stat_index idx, int val)
83{
addb9efe 84 stat->count[idx] += val;
d52aa412
KH
85}
86
87static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
88 enum mem_cgroup_stat_index idx)
89{
90 int cpu;
91 s64 ret = 0;
92 for_each_possible_cpu(cpu)
93 ret += stat->cpustat[cpu].count[idx];
94 return ret;
95}
96
6d12e2d8
KH
97/*
98 * per-zone information in memory controller.
99 */
6d12e2d8 100struct mem_cgroup_per_zone {
072c56c1
KH
101 /*
102 * The per-cgroup LRU lists; these are protected by the zone's lru_lock.
103 */
b69408e8
CL
104 struct list_head lists[NR_LRU_LISTS];
105 unsigned long count[NR_LRU_LISTS];
6d12e2d8
KH
106};
107/* Macro for accessing counter */
108#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
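/*
 * Usage sketch (illustrative, not from the original source): callers index
 * the per-zone LRU counters by lru_list value, e.g.
 *
 *	MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_ANON) += 1;
 *	nr_pages = MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_FILE);
 */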
109
110struct mem_cgroup_per_node {
111 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
112};
113
114struct mem_cgroup_lru_info {
115 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
116};
117
8cdea7c0
BS
118/*
119 * The memory controller data structure. The memory controller controls both
120 * page cache and RSS per cgroup. We would eventually like to provide
121 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
122 * to help the administrator determine what knobs to tune.
123 *
124 * TODO: Add a water mark for the memory controller. Reclaim will begin when
125 * we hit the water mark. Maybe even add a low water mark, such that
126 * no reclaim occurs from a cgroup at its low water mark; this is
127 * a feature that will be implemented much later in the future.
128 */
129struct mem_cgroup {
130 struct cgroup_subsys_state css;
131 /*
132 * the counter to account for memory usage
133 */
134 struct res_counter res;
8c7c6e34
KH
135 /*
136 * the counter to account for mem+swap usage.
137 */
138 struct res_counter memsw;
78fb7466
PE
139 /*
140 * Per cgroup active and inactive list, similar to the
141 * per zone LRU lists.
78fb7466 142 */
6d12e2d8 143 struct mem_cgroup_lru_info info;
072c56c1 144
6c48a1d0 145 int prev_priority; /* for recording reclaim priority */
6d61ef40
BS
146
147 /*
148 * While reclaiming in a hierarchy, we cache the last child we
149 * reclaimed from. Protected by cgroup_lock()
150 */
151 struct mem_cgroup *last_scanned_child;
18f59ea7
BS
152 /*
153 * Should the accounting and control be hierarchical, per subtree?
154 */
155 bool use_hierarchy;
a636b327 156 unsigned long last_oom_jiffies;
8c7c6e34
KH
157 int obsolete;
158 atomic_t refcnt;
14797e23
KM
159
160 unsigned int inactive_ratio;
161
d52aa412 162 /*
c8dad2bb 163 * statistics. This must be placed at the end of memcg.
d52aa412
KH
164 */
165 struct mem_cgroup_stat stat;
8cdea7c0
BS
166};
167
217bc319
KH
168enum charge_type {
169 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
170 MEM_CGROUP_CHARGE_TYPE_MAPPED,
4f98a2fe 171 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
c05555b5 172 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
d13d1443 173 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
c05555b5
KH
174 NR_CHARGE_TYPE,
175};
176
52d4b9ac
KH
177/* only for here (for easy reading.) */
178#define PCGF_CACHE (1UL << PCG_CACHE)
179#define PCGF_USED (1UL << PCG_USED)
52d4b9ac 180#define PCGF_LOCK (1UL << PCG_LOCK)
c05555b5
KH
181static const unsigned long
182pcg_default_flags[NR_CHARGE_TYPE] = {
08e552c6
KH
183 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
184 PCGF_USED | PCGF_LOCK, /* Anon */
185 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
52d4b9ac 186 0, /* FORCE */
217bc319
KH
187};
188
8c7c6e34
KH
189/* for encoding cft->private value on file */
190#define _MEM (0)
191#define _MEMSWAP (1)
192#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
193#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
194#define MEMFILE_ATTR(val) ((val) & 0xffff)
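/*
 * Worked example (illustrative only): for the memsw limit file defined
 * below, cft->private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT)
 * = (1 << 16) | RES_LIMIT, so MEMFILE_TYPE() recovers _MEMSWAP and
 * MEMFILE_ATTR() recovers RES_LIMIT.
 */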
195
196static void mem_cgroup_get(struct mem_cgroup *mem);
197static void mem_cgroup_put(struct mem_cgroup *mem);
198
c05555b5
KH
199static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
200 struct page_cgroup *pc,
201 bool charge)
d52aa412
KH
202{
203 int val = (charge)? 1 : -1;
204 struct mem_cgroup_stat *stat = &mem->stat;
addb9efe 205 struct mem_cgroup_stat_cpu *cpustat;
08e552c6 206 int cpu = get_cpu();
d52aa412 207
08e552c6 208 cpustat = &stat->cpustat[cpu];
c05555b5 209 if (PageCgroupCache(pc))
addb9efe 210 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
d52aa412 211 else
addb9efe 212 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
55e462b0
BR
213
214 if (charge)
addb9efe 215 __mem_cgroup_stat_add_safe(cpustat,
55e462b0
BR
216 MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
217 else
addb9efe 218 __mem_cgroup_stat_add_safe(cpustat,
55e462b0 219 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
08e552c6 220 put_cpu();
6d12e2d8
KH
221}
222
d5b69e38 223static struct mem_cgroup_per_zone *
6d12e2d8
KH
224mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
225{
6d12e2d8
KH
226 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
227}
228
d5b69e38 229static struct mem_cgroup_per_zone *
6d12e2d8
KH
230page_cgroup_zoneinfo(struct page_cgroup *pc)
231{
232 struct mem_cgroup *mem = pc->mem_cgroup;
233 int nid = page_cgroup_nid(pc);
234 int zid = page_cgroup_zid(pc);
d52aa412 235
54992762
KM
236 if (!mem)
237 return NULL;
238
6d12e2d8
KH
239 return mem_cgroup_zoneinfo(mem, nid, zid);
240}
241
242static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
b69408e8 243 enum lru_list idx)
6d12e2d8
KH
244{
245 int nid, zid;
246 struct mem_cgroup_per_zone *mz;
247 u64 total = 0;
248
249 for_each_online_node(nid)
250 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
251 mz = mem_cgroup_zoneinfo(mem, nid, zid);
252 total += MEM_CGROUP_ZSTAT(mz, idx);
253 }
254 return total;
d52aa412
KH
255}
256
d5b69e38 257static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
8cdea7c0
BS
258{
259 return container_of(cgroup_subsys_state(cont,
260 mem_cgroup_subsys_id), struct mem_cgroup,
261 css);
262}
263
cf475ad2 264struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
78fb7466 265{
31a78f23
BS
266 /*
267 * mm_update_next_owner() may clear mm->owner to NULL
268 * if it races with swapoff, page migration, etc.
269 * So this can be called with p == NULL.
270 */
271 if (unlikely(!p))
272 return NULL;
273
78fb7466
PE
274 return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
275 struct mem_cgroup, css);
276}
277
08e552c6
KH
278/*
279 * The following LRU functions are allowed to be used without PCG_LOCK.
280 * Operations are called by the global LRU routines independently of memcg.
281 * What we have to take care of here is the validity of pc->mem_cgroup.
282 *
283 * Changes to pc->mem_cgroup happen on
284 * 1. charge
285 * 2. moving account
286 * In the typical case, "charge" is done before add-to-lru. The exception is
287 * SwapCache, which is added to the LRU before charge.
288 * If the PCG_USED bit is not set, the page_cgroup is not added to this private LRU.
289 * When moving account, the page is not on the LRU; it is isolated.
290 */
4f98a2fe 291
08e552c6
KH
292void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
293{
294 struct page_cgroup *pc;
295 struct mem_cgroup *mem;
296 struct mem_cgroup_per_zone *mz;
6d12e2d8 297
f8d66542 298 if (mem_cgroup_disabled())
08e552c6
KH
299 return;
300 pc = lookup_page_cgroup(page);
301 /* can happen while we handle swapcache. */
302 if (list_empty(&pc->lru))
303 return;
304 mz = page_cgroup_zoneinfo(pc);
305 mem = pc->mem_cgroup;
b69408e8 306 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
08e552c6
KH
307 list_del_init(&pc->lru);
308 return;
6d12e2d8
KH
309}
310
08e552c6 311void mem_cgroup_del_lru(struct page *page)
6d12e2d8 312{
08e552c6
KH
313 mem_cgroup_del_lru_list(page, page_lru(page));
314}
b69408e8 315
08e552c6
KH
316void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
317{
318 struct mem_cgroup_per_zone *mz;
319 struct page_cgroup *pc;
b69408e8 320
f8d66542 321 if (mem_cgroup_disabled())
08e552c6 322 return;
6d12e2d8 323
08e552c6
KH
324 pc = lookup_page_cgroup(page);
325 smp_rmb();
326 /* unused page is not rotated. */
327 if (!PageCgroupUsed(pc))
328 return;
329 mz = page_cgroup_zoneinfo(pc);
330 list_move(&pc->lru, &mz->lists[lru]);
6d12e2d8
KH
331}
332
08e552c6 333void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
66e1707b 334{
08e552c6
KH
335 struct page_cgroup *pc;
336 struct mem_cgroup_per_zone *mz;
6d12e2d8 337
f8d66542 338 if (mem_cgroup_disabled())
08e552c6
KH
339 return;
340 pc = lookup_page_cgroup(page);
341 /* barrier to sync with "charge" */
342 smp_rmb();
343 if (!PageCgroupUsed(pc))
894bc310 344 return;
b69408e8 345
08e552c6 346 mz = page_cgroup_zoneinfo(pc);
b69408e8 347 MEM_CGROUP_ZSTAT(mz, lru) += 1;
08e552c6
KH
348 list_add(&pc->lru, &mz->lists[lru]);
349}
350/*
351 * To add swapcache to the LRU. Be careful when calling this function:
352 * zone->lru_lock must not be held and irqs must not be disabled.
353 */
354static void mem_cgroup_lru_fixup(struct page *page)
355{
356 if (!isolate_lru_page(page))
357 putback_lru_page(page);
358}
359
360void mem_cgroup_move_lists(struct page *page,
361 enum lru_list from, enum lru_list to)
362{
f8d66542 363 if (mem_cgroup_disabled())
08e552c6
KH
364 return;
365 mem_cgroup_del_lru_list(page, from);
366 mem_cgroup_add_lru_list(page, to);
66e1707b
BS
367}
368
4c4a2214
DR
369int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
370{
371 int ret;
372
373 task_lock(task);
bd845e38 374 ret = task->mm && mm_match_cgroup(task->mm, mem);
4c4a2214
DR
375 task_unlock(task);
376 return ret;
377}
378
58ae83db
KH
379/*
380 * Calculate mapped_ratio under the memory controller. This will be used in
381 * vmscan.c for determining whether we have to reclaim mapped pages.
382 */
383int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
384{
385 long total, rss;
386
387 /*
388 * usage is recorded in bytes. But, here, we assume the number of
389 * physical pages can be represented by "long" on any arch.
390 */
391 total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
392 rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
393 return (int)((rss * 100L) / total);
394}
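/*
 * Worked example (illustrative only): with usage equivalent to 400 charged
 * pages (total = 401 after the +1 guard against division by zero) and
 * rss = 100 pages, the function returns (100 * 100) / 401 = 24, i.e. a
 * mapped ratio of roughly 24%.
 */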
8869b8f6 395
6c48a1d0
KH
396/*
397 * prev_priority control...this will be used in memory reclaim path.
398 */
399int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
400{
401 return mem->prev_priority;
402}
403
404void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
405{
406 if (priority < mem->prev_priority)
407 mem->prev_priority = priority;
408}
409
410void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
411{
412 mem->prev_priority = priority;
413}
414
cc38108e
KH
415/*
416 * Calculate # of pages to be scanned in this priority/zone.
417 * See also vmscan.c
418 *
419 * priority starts from "DEF_PRIORITY" and is decremented in each loop.
420 * (see include/linux/mmzone.h)
421 */
422
b69408e8
CL
423long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
424 int priority, enum lru_list lru)
cc38108e 425{
b69408e8 426 long nr_pages;
cc38108e
KH
427 int nid = zone->zone_pgdat->node_id;
428 int zid = zone_idx(zone);
429 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
430
b69408e8 431 nr_pages = MEM_CGROUP_ZSTAT(mz, lru);
cc38108e 432
b69408e8 433 return (nr_pages >> priority);
cc38108e
KH
434}
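/*
 * Rough worked example (illustrative only): with 1,048,576 pages on the
 * chosen LRU list and priority == DEF_PRIORITY (12), this asks vmscan to
 * look at 1048576 >> 12 = 256 pages; as priority drops towards 0 the scan
 * target grows until the whole list is scanned.
 */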
435
14797e23
KM
436int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
437{
438 unsigned long active;
439 unsigned long inactive;
440
441 inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON);
442 active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON);
443
444 if (inactive * memcg->inactive_ratio < active)
445 return 1;
446
447 return 0;
448}
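/*
 * For illustration (not from the original source): with inactive_ratio == 3
 * (roughly a 1GB limit), 1000 inactive anon pages and 4000 active anon
 * pages, 1000 * 3 < 4000, so the inactive anon list is reported as low and
 * the VM will favour deactivating active anon pages.
 */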
449
a3d8e054
KM
450unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
451 struct zone *zone,
452 enum lru_list lru)
453{
454 int nid = zone->zone_pgdat->node_id;
455 int zid = zone_idx(zone);
456 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
457
458 return MEM_CGROUP_ZSTAT(mz, lru);
459}
460
66e1707b
BS
461unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
462 struct list_head *dst,
463 unsigned long *scanned, int order,
464 int mode, struct zone *z,
465 struct mem_cgroup *mem_cont,
4f98a2fe 466 int active, int file)
66e1707b
BS
467{
468 unsigned long nr_taken = 0;
469 struct page *page;
470 unsigned long scan;
471 LIST_HEAD(pc_list);
472 struct list_head *src;
ff7283fa 473 struct page_cgroup *pc, *tmp;
1ecaab2b
KH
474 int nid = z->zone_pgdat->node_id;
475 int zid = zone_idx(z);
476 struct mem_cgroup_per_zone *mz;
4f98a2fe 477 int lru = LRU_FILE * !!file + !!active;
66e1707b 478
cf475ad2 479 BUG_ON(!mem_cont);
1ecaab2b 480 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
b69408e8 481 src = &mz->lists[lru];
66e1707b 482
ff7283fa
KH
483 scan = 0;
484 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
436c6541 485 if (scan >= nr_to_scan)
ff7283fa 486 break;
08e552c6
KH
487
488 page = pc->page;
52d4b9ac
KH
489 if (unlikely(!PageCgroupUsed(pc)))
490 continue;
436c6541 491 if (unlikely(!PageLRU(page)))
ff7283fa 492 continue;
ff7283fa 493
436c6541 494 scan++;
4f98a2fe 495 if (__isolate_lru_page(page, mode, file) == 0) {
66e1707b
BS
496 list_move(&page->lru, dst);
497 nr_taken++;
498 }
499 }
500
66e1707b
BS
501 *scanned = scan;
502 return nr_taken;
503}
504
6d61ef40
BS
505#define mem_cgroup_from_res_counter(counter, member) \
506 container_of(counter, struct mem_cgroup, member)
507
508/*
509 * This routine finds the DFS walk successor. This routine should be
510 * called with cgroup_mutex held
511 */
512static struct mem_cgroup *
513mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
514{
515 struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
516
517 curr_cgroup = curr->css.cgroup;
518 root_cgroup = root_mem->css.cgroup;
519
520 if (!list_empty(&curr_cgroup->children)) {
521 /*
522 * Walk down to children
523 */
524 mem_cgroup_put(curr);
525 cgroup = list_entry(curr_cgroup->children.next,
526 struct cgroup, sibling);
527 curr = mem_cgroup_from_cont(cgroup);
528 mem_cgroup_get(curr);
529 goto done;
530 }
531
532visit_parent:
533 if (curr_cgroup == root_cgroup) {
534 mem_cgroup_put(curr);
535 curr = root_mem;
536 mem_cgroup_get(curr);
537 goto done;
538 }
539
540 /*
541 * Goto next sibling
542 */
543 if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
544 mem_cgroup_put(curr);
545 cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
546 sibling);
547 curr = mem_cgroup_from_cont(cgroup);
548 mem_cgroup_get(curr);
549 goto done;
550 }
551
552 /*
553 * Go up to next parent and next parent's sibling if need be
554 */
555 curr_cgroup = curr_cgroup->parent;
556 goto visit_parent;
557
558done:
559 root_mem->last_scanned_child = curr;
560 return curr;
561}
562
563/*
564 * Visit the first child (need not be the first child as per the ordering
565 * of the cgroup list, since we track last_scanned_child) of @mem and use
566 * that to reclaim free pages from.
567 */
568static struct mem_cgroup *
569mem_cgroup_get_first_node(struct mem_cgroup *root_mem)
570{
571 struct cgroup *cgroup;
572 struct mem_cgroup *ret;
573 bool obsolete = (root_mem->last_scanned_child &&
574 root_mem->last_scanned_child->obsolete);
575
576 /*
577 * Scan all children under the mem_cgroup mem
578 */
579 cgroup_lock();
580 if (list_empty(&root_mem->css.cgroup->children)) {
581 ret = root_mem;
582 goto done;
583 }
584
585 if (!root_mem->last_scanned_child || obsolete) {
586
587 if (obsolete)
588 mem_cgroup_put(root_mem->last_scanned_child);
589
590 cgroup = list_first_entry(&root_mem->css.cgroup->children,
591 struct cgroup, sibling);
592 ret = mem_cgroup_from_cont(cgroup);
593 mem_cgroup_get(ret);
594 } else
595 ret = mem_cgroup_get_next_node(root_mem->last_scanned_child,
596 root_mem);
597
598done:
599 root_mem->last_scanned_child = ret;
600 cgroup_unlock();
601 return ret;
602}
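/*
 * Traversal sketch (illustrative only, assuming a flat hierarchy
 * root -> {A, B}): successive reclaim rounds visit A, then B, then wrap
 * back to root itself, because last_scanned_child remembers where the
 * previous round stopped.
 */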
603
b85a96c0
DN
604static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
605{
606 if (do_swap_account) {
607 if (res_counter_check_under_limit(&mem->res) &&
608 res_counter_check_under_limit(&mem->memsw))
609 return true;
610 } else
611 if (res_counter_check_under_limit(&mem->res))
612 return true;
613 return false;
614}
615
6d61ef40
BS
616/*
617 * Dance down the hierarchy if needed to reclaim memory. We remember the
618 * last child we reclaimed from, so that we don't end up penalizing
619 * one child extensively based on its position in the children list.
620 *
621 * root_mem is the original ancestor that we've been reclaiming from.
622 */
623static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
624 gfp_t gfp_mask, bool noswap)
625{
626 struct mem_cgroup *next_mem;
627 int ret = 0;
628
629 /*
630 * Reclaim unconditionally and don't check for return value.
631 * We need to reclaim in the current group and down the tree.
632 * One might think about checking for children before reclaiming,
633 * but there might be left over accounting, even after children
634 * have left.
635 */
636 ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap);
b85a96c0 637 if (mem_cgroup_check_under_limit(root_mem))
6d61ef40 638 return 0;
670ec2f1
DN
639 if (!root_mem->use_hierarchy)
640 return ret;
6d61ef40
BS
641
642 next_mem = mem_cgroup_get_first_node(root_mem);
643
644 while (next_mem != root_mem) {
645 if (next_mem->obsolete) {
646 mem_cgroup_put(next_mem);
647 cgroup_lock();
648 next_mem = mem_cgroup_get_first_node(root_mem);
649 cgroup_unlock();
650 continue;
651 }
652 ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap);
b85a96c0 653 if (mem_cgroup_check_under_limit(root_mem))
6d61ef40
BS
654 return 0;
655 cgroup_lock();
656 next_mem = mem_cgroup_get_next_node(next_mem, root_mem);
657 cgroup_unlock();
658 }
659 return ret;
660}
661
a636b327
KH
662bool mem_cgroup_oom_called(struct task_struct *task)
663{
664 bool ret = false;
665 struct mem_cgroup *mem;
666 struct mm_struct *mm;
667
668 rcu_read_lock();
669 mm = task->mm;
670 if (!mm)
671 mm = &init_mm;
672 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
673 if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
674 ret = true;
675 rcu_read_unlock();
676 return ret;
677}
f817ed48
KH
678/*
679 * Unlike the exported interface, an "oom" parameter is added. If oom == true,
680 * the OOM killer can be invoked.
8a9f3ccd 681 */
f817ed48 682static int __mem_cgroup_try_charge(struct mm_struct *mm,
8c7c6e34
KH
683 gfp_t gfp_mask, struct mem_cgroup **memcg,
684 bool oom)
8a9f3ccd 685{
6d61ef40 686 struct mem_cgroup *mem, *mem_over_limit;
7a81b88c 687 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
28dbc4b6 688 struct res_counter *fail_res;
a636b327
KH
689
690 if (unlikely(test_thread_flag(TIF_MEMDIE))) {
691 /* Don't account this! */
692 *memcg = NULL;
693 return 0;
694 }
695
8a9f3ccd 696 /*
3be91277
HD
697 * We always charge the cgroup the mm_struct belongs to.
698 * The mm_struct's mem_cgroup changes on task migration if the
8a9f3ccd
BS
699 * thread group leader migrates. It's possible that mm is not
700 * set, if so charge the init_mm (happens for pagecache usage).
701 */
7a81b88c 702 if (likely(!*memcg)) {
e8589cc1
KH
703 rcu_read_lock();
704 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
31a78f23
BS
705 if (unlikely(!mem)) {
706 rcu_read_unlock();
31a78f23
BS
707 return 0;
708 }
e8589cc1
KH
709 /*
710 * For every charge from the cgroup, increment reference count
711 */
712 css_get(&mem->css);
7a81b88c 713 *memcg = mem;
e8589cc1
KH
714 rcu_read_unlock();
715 } else {
7a81b88c
KH
716 mem = *memcg;
717 css_get(&mem->css);
e8589cc1 718 }
8a9f3ccd 719
8c7c6e34
KH
720 while (1) {
721 int ret;
722 bool noswap = false;
7a81b88c 723
28dbc4b6 724 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
8c7c6e34
KH
725 if (likely(!ret)) {
726 if (!do_swap_account)
727 break;
28dbc4b6
BS
728 ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
729 &fail_res);
8c7c6e34
KH
730 if (likely(!ret))
731 break;
732 /* mem+swap counter fails */
733 res_counter_uncharge(&mem->res, PAGE_SIZE);
734 noswap = true;
6d61ef40
BS
735 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
736 memsw);
737 } else
738 /* mem counter fails */
739 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
740 res);
741
3be91277 742 if (!(gfp_mask & __GFP_WAIT))
7a81b88c 743 goto nomem;
e1a1cd59 744
6d61ef40
BS
745 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
746 noswap);
66e1707b
BS
747
748 /*
8869b8f6
HD
749 * try_to_free_mem_cgroup_pages() might not give us a full
750 * picture of reclaim. Some pages are reclaimed and might be
751 * moved to swap cache or just unmapped from the cgroup.
752 * Check the limit again to see if the reclaim reduced the
753 * current usage of the cgroup before giving up
8c7c6e34 754 *
8869b8f6 755 */
b85a96c0
DN
756 if (mem_cgroup_check_under_limit(mem_over_limit))
757 continue;
3be91277
HD
758
759 if (!nr_retries--) {
a636b327 760 if (oom) {
88700756
KH
761 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
762 mem_over_limit->last_oom_jiffies = jiffies;
a636b327 763 }
7a81b88c 764 goto nomem;
66e1707b 765 }
8a9f3ccd 766 }
7a81b88c
KH
767 return 0;
768nomem:
769 css_put(&mem->css);
770 return -ENOMEM;
771}
8a9f3ccd 772
f817ed48
KH
773/**
774 * mem_cgroup_try_charge - get charge of PAGE_SIZE.
775 * @mm: an mm_struct which is charged against. (when *memcg is NULL)
776 * @gfp_mask: gfp_mask for reclaim.
777 * @memcg: a pointer to memory cgroup which is charged against.
778 *
779 * Charge against the memory cgroup pointed to by *memcg. If *memcg == NULL,
780 * the memory cgroup is looked up from @mm and stored in *memcg.
781 *
782 * Returns 0 on success and -ENOMEM on failure.
783 * This call can invoke the OOM killer.
784 */
785
786int mem_cgroup_try_charge(struct mm_struct *mm,
787 gfp_t mask, struct mem_cgroup **memcg)
788{
789 return __mem_cgroup_try_charge(mm, mask, memcg, true);
790}
791
7a81b88c
KH
792/*
793 * Commit a charge obtained by mem_cgroup_try_charge() and mark the
794 * page_cgroup as USED. If it is already USED, uncharge and return.
795 */
796
797static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
798 struct page_cgroup *pc,
799 enum charge_type ctype)
800{
7a81b88c
KH
801 /* try_charge() can return NULL to *memcg, taking care of it. */
802 if (!mem)
803 return;
52d4b9ac
KH
804
805 lock_page_cgroup(pc);
806 if (unlikely(PageCgroupUsed(pc))) {
807 unlock_page_cgroup(pc);
808 res_counter_uncharge(&mem->res, PAGE_SIZE);
8c7c6e34
KH
809 if (do_swap_account)
810 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
52d4b9ac 811 css_put(&mem->css);
7a81b88c 812 return;
52d4b9ac 813 }
8a9f3ccd 814 pc->mem_cgroup = mem;
08e552c6 815 smp_wmb();
c05555b5 816 pc->flags = pcg_default_flags[ctype];
3be91277 817
08e552c6 818 mem_cgroup_charge_statistics(mem, pc, true);
52d4b9ac 819
52d4b9ac 820 unlock_page_cgroup(pc);
7a81b88c 821}
66e1707b 822
f817ed48
KH
823/**
824 * mem_cgroup_move_account - move account of the page
825 * @pc: page_cgroup of the page.
826 * @from: mem_cgroup which the page is moved from.
827 * @to: mem_cgroup which the page is moved to. @from != @to.
828 *
829 * The caller must confirm the following:
830 * - the page is not on the LRU (isolate_page() is useful.)
831 *
832 * Returns 0 on success,
833 * returns -EBUSY when the lock is busy or "pc" is unstable.
834 *
835 * This function does "uncharge" from the old cgroup but doesn't do "charge" to
836 * the new cgroup; that should be done by the caller.
837 */
838
839static int mem_cgroup_move_account(struct page_cgroup *pc,
840 struct mem_cgroup *from, struct mem_cgroup *to)
841{
842 struct mem_cgroup_per_zone *from_mz, *to_mz;
843 int nid, zid;
844 int ret = -EBUSY;
845
f817ed48 846 VM_BUG_ON(from == to);
08e552c6 847 VM_BUG_ON(PageLRU(pc->page));
f817ed48
KH
848
849 nid = page_cgroup_nid(pc);
850 zid = page_cgroup_zid(pc);
851 from_mz = mem_cgroup_zoneinfo(from, nid, zid);
852 to_mz = mem_cgroup_zoneinfo(to, nid, zid);
853
f817ed48
KH
854 if (!trylock_page_cgroup(pc))
855 return ret;
856
857 if (!PageCgroupUsed(pc))
858 goto out;
859
860 if (pc->mem_cgroup != from)
861 goto out;
862
08e552c6
KH
863 css_put(&from->css);
864 res_counter_uncharge(&from->res, PAGE_SIZE);
865 mem_cgroup_charge_statistics(from, pc, false);
866 if (do_swap_account)
867 res_counter_uncharge(&from->memsw, PAGE_SIZE);
868 pc->mem_cgroup = to;
869 mem_cgroup_charge_statistics(to, pc, true);
870 css_get(&to->css);
871 ret = 0;
f817ed48
KH
872out:
873 unlock_page_cgroup(pc);
874 return ret;
875}
876
877/*
878 * move charges to its parent.
879 */
880
881static int mem_cgroup_move_parent(struct page_cgroup *pc,
882 struct mem_cgroup *child,
883 gfp_t gfp_mask)
884{
08e552c6 885 struct page *page = pc->page;
f817ed48
KH
886 struct cgroup *cg = child->css.cgroup;
887 struct cgroup *pcg = cg->parent;
888 struct mem_cgroup *parent;
f817ed48
KH
889 int ret;
890
891 /* Is ROOT ? */
892 if (!pcg)
893 return -EINVAL;
894
08e552c6 895
f817ed48
KH
896 parent = mem_cgroup_from_cont(pcg);
897
08e552c6 898
f817ed48 899 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
a636b327 900 if (ret || !parent)
f817ed48
KH
901 return ret;
902
08e552c6
KH
903 if (!get_page_unless_zero(page))
904 return -EBUSY;
905
906 ret = isolate_lru_page(page);
907
908 if (ret)
909 goto cancel;
f817ed48 910
f817ed48 911 ret = mem_cgroup_move_account(pc, child, parent);
f817ed48 912
913 /* drop the extra refcount from try_charge() (move_account took its own) */
f817ed48 914 css_put(&parent->css);
08e552c6
KH
915 putback_lru_page(page);
916 if (!ret) {
917 put_page(page);
918 return 0;
8c7c6e34 919 }
08e552c6
KH
920 /* uncharge if move fails */
921cancel:
922 res_counter_uncharge(&parent->res, PAGE_SIZE);
923 if (do_swap_account)
924 res_counter_uncharge(&parent->memsw, PAGE_SIZE);
925 put_page(page);
f817ed48
KH
926 return ret;
927}
928
7a81b88c
KH
929/*
930 * Charge the memory controller for page usage.
931 * Return
932 * 0 if the charge was successful
933 * < 0 if the cgroup is over its limit
934 */
935static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
936 gfp_t gfp_mask, enum charge_type ctype,
937 struct mem_cgroup *memcg)
938{
939 struct mem_cgroup *mem;
940 struct page_cgroup *pc;
941 int ret;
942
943 pc = lookup_page_cgroup(page);
944 /* can happen at boot */
945 if (unlikely(!pc))
946 return 0;
947 prefetchw(pc);
948
949 mem = memcg;
f817ed48 950 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
a636b327 951 if (ret || !mem)
7a81b88c
KH
952 return ret;
953
954 __mem_cgroup_commit_charge(mem, pc, ctype);
8a9f3ccd 955 return 0;
8a9f3ccd
BS
956}
957
7a81b88c
KH
958int mem_cgroup_newpage_charge(struct page *page,
959 struct mm_struct *mm, gfp_t gfp_mask)
217bc319 960{
f8d66542 961 if (mem_cgroup_disabled())
cede86ac 962 return 0;
52d4b9ac
KH
963 if (PageCompound(page))
964 return 0;
69029cd5
KH
965 /*
966 * If already mapped, we don't have to account.
967 * If page cache, page->mapping has address_space.
968 * But page->mapping may hold an out-of-use anon_vma pointer;
969 * detect that with the PageAnon() check. A newly-mapped anon page's
970 * page->mapping is NULL.
971 */
972 if (page_mapped(page) || (page->mapping && !PageAnon(page)))
973 return 0;
974 if (unlikely(!mm))
975 mm = &init_mm;
217bc319 976 return mem_cgroup_charge_common(page, mm, gfp_mask,
e8589cc1 977 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
217bc319
KH
978}
979
e1a1cd59
BS
980int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
981 gfp_t gfp_mask)
8697d331 982{
f8d66542 983 if (mem_cgroup_disabled())
cede86ac 984 return 0;
52d4b9ac
KH
985 if (PageCompound(page))
986 return 0;
accf163e
KH
987 /*
988 * Corner case handling. This is usually called from add_to_page_cache().
989 * But some filesystems (shmem) precharge this page before calling it
990 * and call add_to_page_cache() with GFP_NOWAIT.
991 *
992 * For the GFP_NOWAIT case, the page may be pre-charged before calling
993 * add_to_page_cache(). (See shmem.c.) Check it here and avoid calling
994 * charge twice. (It works but has to pay a slightly larger cost.)
995 */
996 if (!(gfp_mask & __GFP_WAIT)) {
997 struct page_cgroup *pc;
998
52d4b9ac
KH
999
1000 pc = lookup_page_cgroup(page);
1001 if (!pc)
1002 return 0;
1003 lock_page_cgroup(pc);
1004 if (PageCgroupUsed(pc)) {
1005 unlock_page_cgroup(pc);
accf163e
KH
1006 return 0;
1007 }
52d4b9ac 1008 unlock_page_cgroup(pc);
accf163e
KH
1009 }
1010
69029cd5 1011 if (unlikely(!mm))
8697d331 1012 mm = &init_mm;
accf163e 1013
c05555b5
KH
1014 if (page_is_file_cache(page))
1015 return mem_cgroup_charge_common(page, mm, gfp_mask,
e8589cc1 1016 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
c05555b5
KH
1017 else
1018 return mem_cgroup_charge_common(page, mm, gfp_mask,
1019 MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
e8589cc1
KH
1020}
1021
8c7c6e34
KH
1022int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1023 struct page *page,
1024 gfp_t mask, struct mem_cgroup **ptr)
1025{
1026 struct mem_cgroup *mem;
1027 swp_entry_t ent;
1028
f8d66542 1029 if (mem_cgroup_disabled())
8c7c6e34
KH
1030 return 0;
1031
1032 if (!do_swap_account)
1033 goto charge_cur_mm;
1034
1035 /*
1036 * A racing thread's fault, or swapoff, may have already updated
1037 * the pte, and even removed page from swap cache: return success
1038 * to go on to do_swap_page()'s pte_same() test, which should fail.
1039 */
1040 if (!PageSwapCache(page))
1041 return 0;
1042
1043 ent.val = page_private(page);
1044
1045 mem = lookup_swap_cgroup(ent);
1046 if (!mem || mem->obsolete)
1047 goto charge_cur_mm;
1048 *ptr = mem;
1049 return __mem_cgroup_try_charge(NULL, mask, ptr, true);
1050charge_cur_mm:
1051 if (unlikely(!mm))
1052 mm = &init_mm;
1053 return __mem_cgroup_try_charge(mm, mask, ptr, true);
1054}
1055
d13d1443 1056#ifdef CONFIG_SWAP
8c7c6e34 1057
d13d1443
KH
1058int mem_cgroup_cache_charge_swapin(struct page *page,
1059 struct mm_struct *mm, gfp_t mask, bool locked)
1060{
1061 int ret = 0;
1062
f8d66542 1063 if (mem_cgroup_disabled())
d13d1443
KH
1064 return 0;
1065 if (unlikely(!mm))
1066 mm = &init_mm;
1067 if (!locked)
1068 lock_page(page);
1069 /*
1070 * If not locked, the page can be dropped from SwapCache until
1071 * we reach here.
1072 */
1073 if (PageSwapCache(page)) {
8c7c6e34
KH
1074 struct mem_cgroup *mem = NULL;
1075 swp_entry_t ent;
1076
1077 ent.val = page_private(page);
1078 if (do_swap_account) {
1079 mem = lookup_swap_cgroup(ent);
1080 if (mem && mem->obsolete)
1081 mem = NULL;
1082 if (mem)
1083 mm = NULL;
1084 }
d13d1443 1085 ret = mem_cgroup_charge_common(page, mm, mask,
8c7c6e34
KH
1086 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
1087
1088 if (!ret && do_swap_account) {
1089 /* avoid double counting */
1090 mem = swap_cgroup_record(ent, NULL);
1091 if (mem) {
1092 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1093 mem_cgroup_put(mem);
1094 }
1095 }
d13d1443
KH
1096 }
1097 if (!locked)
1098 unlock_page(page);
08e552c6
KH
1099 /* add this page(page_cgroup) to the LRU we want. */
1100 mem_cgroup_lru_fixup(page);
d13d1443
KH
1101
1102 return ret;
1103}
1104#endif
1105
7a81b88c
KH
1106void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1107{
1108 struct page_cgroup *pc;
1109
f8d66542 1110 if (mem_cgroup_disabled())
7a81b88c
KH
1111 return;
1112 if (!ptr)
1113 return;
1114 pc = lookup_page_cgroup(page);
1115 __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
8c7c6e34
KH
1116 /*
1117 * Now swap is on-memory. This means this page may be
1118 * counted both as mem and swap....double count.
1119 * Fix it by uncharging from memsw. This SwapCache is stable
1120 * because we're still under lock_page().
1121 */
1122 if (do_swap_account) {
1123 swp_entry_t ent = {.val = page_private(page)};
1124 struct mem_cgroup *memcg;
1125 memcg = swap_cgroup_record(ent, NULL);
1126 if (memcg) {
1127 /* If memcg is obsolete, memcg can be != ptr */
1128 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1129 mem_cgroup_put(memcg);
1130 }
1131
1132 }
08e552c6
KH
1133 /* add this page(page_cgroup) to the LRU we want. */
1134 mem_cgroup_lru_fixup(page);
7a81b88c
KH
1135}
1136
1137void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1138{
f8d66542 1139 if (mem_cgroup_disabled())
7a81b88c
KH
1140 return;
1141 if (!mem)
1142 return;
1143 res_counter_uncharge(&mem->res, PAGE_SIZE);
8c7c6e34
KH
1144 if (do_swap_account)
1145 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
7a81b88c
KH
1146 css_put(&mem->css);
1147}
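/*
 * Protocol sketch (illustrative only, not part of the original file): how a
 * swap-in path is expected to combine the try/commit/cancel helpers above.
 * The mapping step and the helper names are assumptions, kept under #if 0.
 */
#if 0
static int example_swapin_charge(struct mm_struct *mm, struct page *page,
				 gfp_t gfp_mask)
{
	struct mem_cgroup *ptr = NULL;
	int err;

	err = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &ptr);
	if (err)
		return err;		/* reclaim/OOM was already attempted */

	err = example_map_page(page);	/* hypothetical mapping step */
	if (err) {
		mem_cgroup_cancel_charge_swapin(ptr);	/* undo the charge */
		return err;
	}
	mem_cgroup_commit_charge_swapin(page, ptr);	/* mark page_cgroup used */
	return 0;
}
#endif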
1148
1149
8a9f3ccd 1150/*
69029cd5 1151 * uncharge if !page_mapped(page)
8a9f3ccd 1152 */
8c7c6e34 1153static struct mem_cgroup *
69029cd5 1154__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
8a9f3ccd 1155{
8289546e 1156 struct page_cgroup *pc;
8c7c6e34 1157 struct mem_cgroup *mem = NULL;
072c56c1 1158 struct mem_cgroup_per_zone *mz;
8a9f3ccd 1159
f8d66542 1160 if (mem_cgroup_disabled())
8c7c6e34 1161 return NULL;
4077960e 1162
d13d1443 1163 if (PageSwapCache(page))
8c7c6e34 1164 return NULL;
d13d1443 1165
8697d331 1166 /*
3c541e14 1167 * Check if our page_cgroup is valid
8697d331 1168 */
52d4b9ac
KH
1169 pc = lookup_page_cgroup(page);
1170 if (unlikely(!pc || !PageCgroupUsed(pc)))
8c7c6e34 1171 return NULL;
b9c565d5 1172
52d4b9ac 1173 lock_page_cgroup(pc);
d13d1443 1174
8c7c6e34
KH
1175 mem = pc->mem_cgroup;
1176
d13d1443
KH
1177 if (!PageCgroupUsed(pc))
1178 goto unlock_out;
1179
1180 switch (ctype) {
1181 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1182 if (page_mapped(page))
1183 goto unlock_out;
1184 break;
1185 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
1186 if (!PageAnon(page)) { /* Shared memory */
1187 if (page->mapping && !page_is_file_cache(page))
1188 goto unlock_out;
1189 } else if (page_mapped(page)) /* Anon */
1190 goto unlock_out;
1191 break;
1192 default:
1193 break;
52d4b9ac 1194 }
d13d1443 1195
8c7c6e34
KH
1196 res_counter_uncharge(&mem->res, PAGE_SIZE);
1197 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1198 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1199
08e552c6 1200 mem_cgroup_charge_statistics(mem, pc, false);
52d4b9ac 1201 ClearPageCgroupUsed(pc);
b9c565d5 1202
69029cd5 1203 mz = page_cgroup_zoneinfo(pc);
52d4b9ac 1204 unlock_page_cgroup(pc);
fb59e9f1 1205
a7fe942e
KH
1206 /* at swapout, this memcg will be accessed to record to swap */
1207 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1208 css_put(&mem->css);
6d12e2d8 1209
8c7c6e34 1210 return mem;
d13d1443
KH
1211
1212unlock_out:
1213 unlock_page_cgroup(pc);
8c7c6e34 1214 return NULL;
3c541e14
BS
1215}
1216
69029cd5
KH
1217void mem_cgroup_uncharge_page(struct page *page)
1218{
52d4b9ac
KH
1219 /* early check. */
1220 if (page_mapped(page))
1221 return;
1222 if (page->mapping && !PageAnon(page))
1223 return;
69029cd5
KH
1224 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
1225}
1226
1227void mem_cgroup_uncharge_cache_page(struct page *page)
1228{
1229 VM_BUG_ON(page_mapped(page));
b7abea96 1230 VM_BUG_ON(page->mapping);
69029cd5
KH
1231 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
1232}
1233
8c7c6e34
KH
1234/*
1235 * Called from __delete_from_swap_cache(); drops the "page" account.
1236 * memcg information is recorded in the swap_cgroup of "ent".
1237 */
1238void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
1239{
1240 struct mem_cgroup *memcg;
1241
1242 memcg = __mem_cgroup_uncharge_common(page,
1243 MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
1244 /* record memcg information */
1245 if (do_swap_account && memcg) {
1246 swap_cgroup_record(ent, memcg);
1247 mem_cgroup_get(memcg);
1248 }
a7fe942e
KH
1249 if (memcg)
1250 css_put(&memcg->css);
8c7c6e34
KH
1251}
1252
1253#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
1254/*
1255 * Called from swap_entry_free(). Removes the record in swap_cgroup and
1256 * uncharges the "memsw" account.
1257 */
1258void mem_cgroup_uncharge_swap(swp_entry_t ent)
d13d1443 1259{
8c7c6e34
KH
1260 struct mem_cgroup *memcg;
1261
1262 if (!do_swap_account)
1263 return;
1264
1265 memcg = swap_cgroup_record(ent, NULL);
1266 if (memcg) {
1267 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1268 mem_cgroup_put(memcg);
1269 }
d13d1443 1270}
8c7c6e34 1271#endif
d13d1443 1272
ae41be37 1273/*
01b1ae63
KH
1274 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
1275 * page belongs to.
ae41be37 1276 */
01b1ae63 1277int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
ae41be37
KH
1278{
1279 struct page_cgroup *pc;
e8589cc1 1280 struct mem_cgroup *mem = NULL;
e8589cc1 1281 int ret = 0;
8869b8f6 1282
f8d66542 1283 if (mem_cgroup_disabled())
4077960e
BS
1284 return 0;
1285
52d4b9ac
KH
1286 pc = lookup_page_cgroup(page);
1287 lock_page_cgroup(pc);
1288 if (PageCgroupUsed(pc)) {
e8589cc1
KH
1289 mem = pc->mem_cgroup;
1290 css_get(&mem->css);
e8589cc1 1291 }
52d4b9ac 1292 unlock_page_cgroup(pc);
01b1ae63 1293
e8589cc1 1294 if (mem) {
2c26fdd7 1295 ret = mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem);
e8589cc1
KH
1296 css_put(&mem->css);
1297 }
01b1ae63 1298 *ptr = mem;
e8589cc1 1299 return ret;
ae41be37 1300}
8869b8f6 1301
69029cd5 1302/* remove redundant charge if migration failed*/
01b1ae63
KH
1303void mem_cgroup_end_migration(struct mem_cgroup *mem,
1304 struct page *oldpage, struct page *newpage)
ae41be37 1305{
01b1ae63
KH
1306 struct page *target, *unused;
1307 struct page_cgroup *pc;
1308 enum charge_type ctype;
1309
1310 if (!mem)
1311 return;
1312
1313 /* at migration success, oldpage->mapping is NULL. */
1314 if (oldpage->mapping) {
1315 target = oldpage;
1316 unused = NULL;
1317 } else {
1318 target = newpage;
1319 unused = oldpage;
1320 }
1321
1322 if (PageAnon(target))
1323 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
1324 else if (page_is_file_cache(target))
1325 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
1326 else
1327 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
1328
1329 /* unused page is not on radix-tree now. */
d13d1443 1330 if (unused)
01b1ae63
KH
1331 __mem_cgroup_uncharge_common(unused, ctype);
1332
1333 pc = lookup_page_cgroup(target);
69029cd5 1334 /*
01b1ae63
KH
1335 * __mem_cgroup_commit_charge() checks the PCG_USED bit of the page_cgroup.
1336 * So, double-counting is effectively avoided.
1337 */
1338 __mem_cgroup_commit_charge(mem, pc, ctype);
1339
1340 /*
1341 * Both of oldpage and newpage are still under lock_page().
1342 * Then, we don't have to care about race in radix-tree.
1343 * But we have to be careful about whether this page is mapped or not.
1344 *
1345 * There is a case for !page_mapped(). At the start of
1346 * migration, oldpage was mapped. But now, it's zapped.
1347 * But we know *target* page is not freed/reused under us.
1348 * mem_cgroup_uncharge_page() does all necessary checks.
69029cd5 1349 */
01b1ae63
KH
1350 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
1351 mem_cgroup_uncharge_page(target);
ae41be37 1352}
78fb7466 1353
c9b0ed51
KH
1354/*
1355 * A call to try to shrink memory usage under the specified resource controller.
1356 * This is typically used for page reclaim for shmem, to reduce the side
1357 * effects of page allocation from shmem, which is used by some mem_cgroups.
1358 */
1359int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
1360{
1361 struct mem_cgroup *mem;
1362 int progress = 0;
1363 int retry = MEM_CGROUP_RECLAIM_RETRIES;
1364
f8d66542 1365 if (mem_cgroup_disabled())
cede86ac 1366 return 0;
9623e078
HD
1367 if (!mm)
1368 return 0;
cede86ac 1369
c9b0ed51
KH
1370 rcu_read_lock();
1371 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
31a78f23
BS
1372 if (unlikely(!mem)) {
1373 rcu_read_unlock();
1374 return 0;
1375 }
c9b0ed51
KH
1376 css_get(&mem->css);
1377 rcu_read_unlock();
1378
1379 do {
8c7c6e34 1380 progress = try_to_free_mem_cgroup_pages(mem, gfp_mask, true);
b85a96c0 1381 progress += mem_cgroup_check_under_limit(mem);
c9b0ed51
KH
1382 } while (!progress && --retry);
1383
1384 css_put(&mem->css);
1385 if (!retry)
1386 return -ENOMEM;
1387 return 0;
1388}
1389
14797e23
KM
1390/*
1391 * The inactive anon list should be small enough that the VM never has to
1392 * do too much work, but large enough that each inactive page has a chance
1393 * to be referenced again before it is swapped out.
1394 *
1395 * This calculation is a straightforward port of
1396 * page_alloc.c::setup_per_zone_inactive_ratio(),
1397 * which describes it in more detail.
1398 */
1399static void mem_cgroup_set_inactive_ratio(struct mem_cgroup *memcg)
1400{
1401 unsigned int gb, ratio;
1402
1403 gb = res_counter_read_u64(&memcg->res, RES_LIMIT) >> 30;
1404 if (gb)
1405 ratio = int_sqrt(10 * gb);
1406 else
1407 ratio = 1;
1408
1409 memcg->inactive_ratio = ratio;
1410
1411}
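/*
 * Worked examples (illustrative only): a 1GB limit gives gb = 1 and
 * ratio = int_sqrt(10) = 3; a 10GB limit gives int_sqrt(100) = 10; a
 * 100GB limit gives int_sqrt(1000) = 31. Limits below 1GB fall back to
 * ratio = 1, i.e. inactive and active anon are kept roughly balanced.
 */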
1412
8c7c6e34
KH
1413static DEFINE_MUTEX(set_limit_mutex);
1414
d38d2a75 1415static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
8c7c6e34 1416 unsigned long long val)
628f4235
KH
1417{
1418
1419 int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
1420 int progress;
8c7c6e34 1421 u64 memswlimit;
628f4235
KH
1422 int ret = 0;
1423
8c7c6e34 1424 while (retry_count) {
628f4235
KH
1425 if (signal_pending(current)) {
1426 ret = -EINTR;
1427 break;
1428 }
8c7c6e34
KH
1429 /*
1430 * Rather than hiding all this in some function, I do it in an
1431 * open-coded manner so you can see what it really does.
1432 * We have to guarantee mem->res.limit < mem->memsw.limit.
1433 */
1434 mutex_lock(&set_limit_mutex);
1435 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1436 if (memswlimit < val) {
1437 ret = -EINVAL;
1438 mutex_unlock(&set_limit_mutex);
628f4235
KH
1439 break;
1440 }
8c7c6e34
KH
1441 ret = res_counter_set_limit(&memcg->res, val);
1442 mutex_unlock(&set_limit_mutex);
1443
1444 if (!ret)
1445 break;
1446
bced0520 1447 progress = try_to_free_mem_cgroup_pages(memcg,
2c26fdd7 1448 GFP_KERNEL, false);
8c7c6e34
KH
1449 if (!progress) retry_count--;
1450 }
14797e23
KM
1451
1452 if (!ret)
1453 mem_cgroup_set_inactive_ratio(memcg);
1454
8c7c6e34
KH
1455 return ret;
1456}
1457
1458int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1459 unsigned long long val)
1460{
1461 int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
1462 u64 memlimit, oldusage, curusage;
1463 int ret;
1464
1465 if (!do_swap_account)
1466 return -EINVAL;
1467
1468 while (retry_count) {
1469 if (signal_pending(current)) {
1470 ret = -EINTR;
1471 break;
1472 }
1473 /*
1474 * Rather than hiding all this in some function, I do it in an
1475 * open-coded manner so you can see what it really does.
1476 * We have to guarantee mem->res.limit < mem->memsw.limit.
1477 */
1478 mutex_lock(&set_limit_mutex);
1479 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1480 if (memlimit > val) {
1481 ret = -EINVAL;
1482 mutex_unlock(&set_limit_mutex);
1483 break;
1484 }
1485 ret = res_counter_set_limit(&memcg->memsw, val);
1486 mutex_unlock(&set_limit_mutex);
1487
1488 if (!ret)
1489 break;
1490
1491 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
2c26fdd7 1492 try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, true);
8c7c6e34
KH
1493 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1494 if (curusage >= oldusage)
628f4235
KH
1495 retry_count--;
1496 }
1497 return ret;
1498}
1499
cc847582
KH
1500/*
1501 * This routine traverses the page_cgroups in the given list and drops them all.
1502 * *And* it doesn't reclaim the page itself, it just removes the page_cgroup.
1503 */
f817ed48 1504static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
08e552c6 1505 int node, int zid, enum lru_list lru)
cc847582 1506{
08e552c6
KH
1507 struct zone *zone;
1508 struct mem_cgroup_per_zone *mz;
f817ed48 1509 struct page_cgroup *pc, *busy;
08e552c6 1510 unsigned long flags, loop;
072c56c1 1511 struct list_head *list;
f817ed48 1512 int ret = 0;
072c56c1 1513
08e552c6
KH
1514 zone = &NODE_DATA(node)->node_zones[zid];
1515 mz = mem_cgroup_zoneinfo(mem, node, zid);
b69408e8 1516 list = &mz->lists[lru];
cc847582 1517
f817ed48
KH
1518 loop = MEM_CGROUP_ZSTAT(mz, lru);
1519 /* give some margin against EBUSY etc...*/
1520 loop += 256;
1521 busy = NULL;
1522 while (loop--) {
1523 ret = 0;
08e552c6 1524 spin_lock_irqsave(&zone->lru_lock, flags);
f817ed48 1525 if (list_empty(list)) {
08e552c6 1526 spin_unlock_irqrestore(&zone->lru_lock, flags);
52d4b9ac 1527 break;
f817ed48
KH
1528 }
1529 pc = list_entry(list->prev, struct page_cgroup, lru);
1530 if (busy == pc) {
1531 list_move(&pc->lru, list);
1532 busy = NULL;
08e552c6 1533 spin_unlock_irqrestore(&zone->lru_lock, flags);
f817ed48
KH
1534 continue;
1535 }
08e552c6 1536 spin_unlock_irqrestore(&zone->lru_lock, flags);
f817ed48 1537
2c26fdd7 1538 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
f817ed48 1539 if (ret == -ENOMEM)
52d4b9ac 1540 break;
f817ed48
KH
1541
1542 if (ret == -EBUSY || ret == -EINVAL) {
1543 /* found lock contention or "pc" is obsolete. */
1544 busy = pc;
1545 cond_resched();
1546 } else
1547 busy = NULL;
cc847582 1548 }
08e552c6 1549
f817ed48
KH
1550 if (!ret && !list_empty(list))
1551 return -EBUSY;
1552 return ret;
cc847582
KH
1553}
1554
1555/*
1556 * Make the mem_cgroup's charge 0 if there is no task.
1557 * This enables deleting this mem_cgroup.
1558 */
c1e862c1 1559static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
cc847582 1560{
f817ed48
KH
1561 int ret;
1562 int node, zid, shrink;
1563 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
c1e862c1 1564 struct cgroup *cgrp = mem->css.cgroup;
8869b8f6 1565
cc847582 1566 css_get(&mem->css);
f817ed48
KH
1567
1568 shrink = 0;
c1e862c1
KH
1569 /* should free all ? */
1570 if (free_all)
1571 goto try_to_free;
f817ed48 1572move_account:
1ecaab2b 1573 while (mem->res.usage > 0) {
f817ed48 1574 ret = -EBUSY;
c1e862c1
KH
1575 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
1576 goto out;
1577 ret = -EINTR;
1578 if (signal_pending(current))
cc847582 1579 goto out;
52d4b9ac
KH
1580 /* This is for making all *used* pages to be on LRU. */
1581 lru_add_drain_all();
f817ed48
KH
1582 ret = 0;
1583 for_each_node_state(node, N_POSSIBLE) {
1584 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
b69408e8 1585 enum lru_list l;
f817ed48
KH
1586 for_each_lru(l) {
1587 ret = mem_cgroup_force_empty_list(mem,
08e552c6 1588 node, zid, l);
f817ed48
KH
1589 if (ret)
1590 break;
1591 }
1ecaab2b 1592 }
f817ed48
KH
1593 if (ret)
1594 break;
1595 }
1596 /* it seems parent cgroup doesn't have enough mem */
1597 if (ret == -ENOMEM)
1598 goto try_to_free;
52d4b9ac 1599 cond_resched();
cc847582
KH
1600 }
1601 ret = 0;
1602out:
1603 css_put(&mem->css);
1604 return ret;
f817ed48
KH
1605
1606try_to_free:
c1e862c1
KH
1607 /* returns EBUSY if there is a task or if we come here twice. */
1608 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
f817ed48
KH
1609 ret = -EBUSY;
1610 goto out;
1611 }
c1e862c1
KH
1612 /* we call try-to-free pages to make this cgroup empty */
1613 lru_add_drain_all();
f817ed48
KH
1614 /* try to free all pages in this cgroup */
1615 shrink = 1;
1616 while (nr_retries && mem->res.usage > 0) {
1617 int progress;
c1e862c1
KH
1618
1619 if (signal_pending(current)) {
1620 ret = -EINTR;
1621 goto out;
1622 }
f817ed48 1623 progress = try_to_free_mem_cgroup_pages(mem,
2c26fdd7 1624 GFP_KERNEL, false);
c1e862c1 1625 if (!progress) {
f817ed48 1626 nr_retries--;
c1e862c1
KH
1627 /* maybe some writeback is necessary */
1628 congestion_wait(WRITE, HZ/10);
1629 }
f817ed48
KH
1630
1631 }
08e552c6 1632 lru_add_drain();
f817ed48
KH
1633 /* try move_account...there may be some *locked* pages. */
1634 if (mem->res.usage)
1635 goto move_account;
1636 ret = 0;
1637 goto out;
cc847582
KH
1638}
1639
c1e862c1
KH
1640int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
1641{
1642 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
1643}
1644
1645
18f59ea7
BS
1646static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
1647{
1648 return mem_cgroup_from_cont(cont)->use_hierarchy;
1649}
1650
1651static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
1652 u64 val)
1653{
1654 int retval = 0;
1655 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1656 struct cgroup *parent = cont->parent;
1657 struct mem_cgroup *parent_mem = NULL;
1658
1659 if (parent)
1660 parent_mem = mem_cgroup_from_cont(parent);
1661
1662 cgroup_lock();
1663 /*
1664 * If the parent's use_hierarchy is set, we can't make any modifications
1665 * in the child subtrees. If it is unset, then the change can
1666 * occur, provided the current cgroup has no children.
1667 *
1668 * For the root cgroup, parent_mem is NULL, we allow value to be
1669 * set if there are no children.
1670 */
1671 if ((!parent_mem || !parent_mem->use_hierarchy) &&
1672 (val == 1 || val == 0)) {
1673 if (list_empty(&cont->children))
1674 mem->use_hierarchy = val;
1675 else
1676 retval = -EBUSY;
1677 } else
1678 retval = -EINVAL;
1679 cgroup_unlock();
1680
1681 return retval;
1682}
1683
2c3daa72 1684static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
8cdea7c0 1685{
8c7c6e34
KH
1686 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1687 u64 val = 0;
1688 int type, name;
1689
1690 type = MEMFILE_TYPE(cft->private);
1691 name = MEMFILE_ATTR(cft->private);
1692 switch (type) {
1693 case _MEM:
1694 val = res_counter_read_u64(&mem->res, name);
1695 break;
1696 case _MEMSWAP:
1697 if (do_swap_account)
1698 val = res_counter_read_u64(&mem->memsw, name);
1699 break;
1700 default:
1701 BUG();
1702 break;
1703 }
1704 return val;
8cdea7c0 1705}
628f4235
KH
1706/*
1707 * The user of this function is...
1708 * RES_LIMIT.
1709 */
856c13aa
PM
1710static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
1711 const char *buffer)
8cdea7c0 1712{
628f4235 1713 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
8c7c6e34 1714 int type, name;
628f4235
KH
1715 unsigned long long val;
1716 int ret;
1717
8c7c6e34
KH
1718 type = MEMFILE_TYPE(cft->private);
1719 name = MEMFILE_ATTR(cft->private);
1720 switch (name) {
628f4235
KH
1721 case RES_LIMIT:
1722 /* This function does all necessary parse...reuse it */
1723 ret = res_counter_memparse_write_strategy(buffer, &val);
8c7c6e34
KH
1724 if (ret)
1725 break;
1726 if (type == _MEM)
628f4235 1727 ret = mem_cgroup_resize_limit(memcg, val);
8c7c6e34
KH
1728 else
1729 ret = mem_cgroup_resize_memsw_limit(memcg, val);
628f4235
KH
1730 break;
1731 default:
1732 ret = -EINVAL; /* should be BUG() ? */
1733 break;
1734 }
1735 return ret;
8cdea7c0
BS
1736}
1737
29f2a4da 1738static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
c84872e1
PE
1739{
1740 struct mem_cgroup *mem;
8c7c6e34 1741 int type, name;
c84872e1
PE
1742
1743 mem = mem_cgroup_from_cont(cont);
8c7c6e34
KH
1744 type = MEMFILE_TYPE(event);
1745 name = MEMFILE_ATTR(event);
1746 switch (name) {
29f2a4da 1747 case RES_MAX_USAGE:
8c7c6e34
KH
1748 if (type == _MEM)
1749 res_counter_reset_max(&mem->res);
1750 else
1751 res_counter_reset_max(&mem->memsw);
29f2a4da
PE
1752 break;
1753 case RES_FAILCNT:
8c7c6e34
KH
1754 if (type == _MEM)
1755 res_counter_reset_failcnt(&mem->res);
1756 else
1757 res_counter_reset_failcnt(&mem->memsw);
29f2a4da
PE
1758 break;
1759 }
85cc59db 1760 return 0;
c84872e1
PE
1761}
1762
d2ceb9b7
KH
1763static const struct mem_cgroup_stat_desc {
1764 const char *msg;
1765 u64 unit;
1766} mem_cgroup_stat_desc[] = {
1767 [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
1768 [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
55e462b0
BR
1769 [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, },
1770 [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, },
d2ceb9b7
KH
1771};
1772
c64745cf
PM
1773static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
1774 struct cgroup_map_cb *cb)
d2ceb9b7 1775{
d2ceb9b7
KH
1776 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
1777 struct mem_cgroup_stat *stat = &mem_cont->stat;
1778 int i;
1779
1780 for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
1781 s64 val;
1782
1783 val = mem_cgroup_read_stat(stat, i);
1784 val *= mem_cgroup_stat_desc[i].unit;
c64745cf 1785 cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
d2ceb9b7 1786 }
6d12e2d8
KH
1787 /* showing # of active pages */
1788 {
4f98a2fe
RR
1789 unsigned long active_anon, inactive_anon;
1790 unsigned long active_file, inactive_file;
7b854121 1791 unsigned long unevictable;
4f98a2fe
RR
1792
1793 inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
1794 LRU_INACTIVE_ANON);
1795 active_anon = mem_cgroup_get_all_zonestat(mem_cont,
1796 LRU_ACTIVE_ANON);
1797 inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
1798 LRU_INACTIVE_FILE);
1799 active_file = mem_cgroup_get_all_zonestat(mem_cont,
1800 LRU_ACTIVE_FILE);
7b854121
LS
1801 unevictable = mem_cgroup_get_all_zonestat(mem_cont,
1802 LRU_UNEVICTABLE);
1803
4f98a2fe
RR
1804 cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
1805 cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
1806 cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
1807 cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
7b854121
LS
1808 cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
1809
6d12e2d8 1810 }
d2ceb9b7
KH
1811 return 0;
1812}
1813
c1e862c1 1814
8cdea7c0
BS
1815static struct cftype mem_cgroup_files[] = {
1816 {
0eea1030 1817 .name = "usage_in_bytes",
8c7c6e34 1818 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
2c3daa72 1819 .read_u64 = mem_cgroup_read,
8cdea7c0 1820 },
c84872e1
PE
1821 {
1822 .name = "max_usage_in_bytes",
8c7c6e34 1823 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
29f2a4da 1824 .trigger = mem_cgroup_reset,
c84872e1
PE
1825 .read_u64 = mem_cgroup_read,
1826 },
8cdea7c0 1827 {
0eea1030 1828 .name = "limit_in_bytes",
8c7c6e34 1829 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
856c13aa 1830 .write_string = mem_cgroup_write,
2c3daa72 1831 .read_u64 = mem_cgroup_read,
8cdea7c0
BS
1832 },
1833 {
1834 .name = "failcnt",
8c7c6e34 1835 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
29f2a4da 1836 .trigger = mem_cgroup_reset,
2c3daa72 1837 .read_u64 = mem_cgroup_read,
8cdea7c0 1838 },
d2ceb9b7
KH
1839 {
1840 .name = "stat",
c64745cf 1841 .read_map = mem_control_stat_show,
d2ceb9b7 1842 },
c1e862c1
KH
1843 {
1844 .name = "force_empty",
1845 .trigger = mem_cgroup_force_empty_write,
1846 },
18f59ea7
BS
1847 {
1848 .name = "use_hierarchy",
1849 .write_u64 = mem_cgroup_hierarchy_write,
1850 .read_u64 = mem_cgroup_hierarchy_read,
1851 },
8cdea7c0
BS
1852};
1853
8c7c6e34
KH
1854#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
1855static struct cftype memsw_cgroup_files[] = {
1856 {
1857 .name = "memsw.usage_in_bytes",
1858 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
1859 .read_u64 = mem_cgroup_read,
1860 },
1861 {
1862 .name = "memsw.max_usage_in_bytes",
1863 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
1864 .trigger = mem_cgroup_reset,
1865 .read_u64 = mem_cgroup_read,
1866 },
1867 {
1868 .name = "memsw.limit_in_bytes",
1869 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
1870 .write_string = mem_cgroup_write,
1871 .read_u64 = mem_cgroup_read,
1872 },
1873 {
1874 .name = "memsw.failcnt",
1875 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
1876 .trigger = mem_cgroup_reset,
1877 .read_u64 = mem_cgroup_read,
1878 },
1879};
1880
1881static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
1882{
1883 if (!do_swap_account)
1884 return 0;
1885 return cgroup_add_files(cont, ss, memsw_cgroup_files,
1886 ARRAY_SIZE(memsw_cgroup_files));
 1887}
1888#else
1889static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
1890{
1891 return 0;
1892}
1893#endif
1894
6d12e2d8
KH
1895static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1896{
1897 struct mem_cgroup_per_node *pn;
1ecaab2b 1898 struct mem_cgroup_per_zone *mz;
b69408e8 1899 enum lru_list l;
41e3355d 1900 int zone, tmp = node;
1ecaab2b
KH
 1901	/*
 1902	 * This routine is called for every possible node, but it is a BUG
 1903	 * to call kmalloc() against an offline node.
 1904	 *
 1905	 * TODO: this routine can waste a lot of memory on nodes that will
 1906	 * never be onlined.  It would be better to do this allocation from
 1907	 * a memory hotplug callback.
 1908	 */
41e3355d
KH
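	/*
	 * A node id of -1 means "no preference" here: when the target node
	 * has no normal memory, kmalloc_node() may allocate from any node.
	 */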
1909 if (!node_state(node, N_NORMAL_MEMORY))
1910 tmp = -1;
1911 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
6d12e2d8
KH
1912 if (!pn)
1913 return 1;
1ecaab2b 1914
6d12e2d8
KH
1915 mem->info.nodeinfo[node] = pn;
1916 memset(pn, 0, sizeof(*pn));
1ecaab2b
KH
1917
1918 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
1919 mz = &pn->zoneinfo[zone];
b69408e8
CL
1920 for_each_lru(l)
1921 INIT_LIST_HEAD(&mz->lists[l]);
1ecaab2b 1922 }
6d12e2d8
KH
1923 return 0;
1924}
1925
1ecaab2b
KH
1926static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1927{
1928 kfree(mem->info.nodeinfo[node]);
1929}
1930
c8dad2bb
JB
1931static int mem_cgroup_size(void)
1932{
1933 int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
1934 return sizeof(struct mem_cgroup) + cpustat_size;
1935}
1936
33327948
KH
1937static struct mem_cgroup *mem_cgroup_alloc(void)
1938{
1939 struct mem_cgroup *mem;
c8dad2bb 1940 int size = mem_cgroup_size();
33327948 1941
c8dad2bb
JB
1942 if (size < PAGE_SIZE)
1943 mem = kmalloc(size, GFP_KERNEL);
33327948 1944 else
c8dad2bb 1945 mem = vmalloc(size);
33327948
KH
1946
1947 if (mem)
c8dad2bb 1948 memset(mem, 0, size);
33327948
KH
1949 return mem;
1950}
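mem_cgroup_size() adds one mem_cgroup_stat_cpu slot per possible CPU to the base structure because cpustat[] is declared as a zero-length trailing array, and mem_cgroup_alloc()/mem_cgroup_free() apply the same PAGE_SIZE threshold when choosing kmalloc()/kfree() versus vmalloc()/vfree(). A rough user-space analogue of the trailing-array sizing, with purely illustrative demo_* types:

/* User-space analogue of the trailing per-CPU array sizing used above.
 * The demo_* types are stand-ins, not the kernel structures. */
#include <stdlib.h>
#include <string.h>

struct demo_stat_cpu {
	long long count[4];
};

struct demo_cgroup {
	int other_fields;
	struct demo_stat_cpu cpustat[];	/* sized at allocation time */
};

struct demo_cgroup *demo_cgroup_alloc(int nr_cpus)
{
	size_t size = sizeof(struct demo_cgroup) +
		      nr_cpus * sizeof(struct demo_stat_cpu);
	struct demo_cgroup *cg = malloc(size);

	if (cg)
		memset(cg, 0, size);	/* all counters start at zero */
	return cg;
}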
1951
8c7c6e34
KH
 1952	/*
 1953	 * While a mem_cgroup is being destroyed, references from swap_cgroup
 1954	 * can still remain (scanning them all at force_empty is too costly...).
 1955	 *
 1956	 * Instead of clearing all references at force_empty, we remember
 1957	 * the number of references from swap_cgroup and free the mem_cgroup
 1958	 * only when that count goes down to 0.
 1959	 *
 1960	 * At destruction, mem->obsolete is set to 1 and any swap entry that
 1961	 * still points to this memcg will be ignored at swapin.
 1962	 *
 1963	 * Removal of the cgroup itself succeeds regardless of refs from swap.
 1964	 */
1965
33327948
KH
1966static void mem_cgroup_free(struct mem_cgroup *mem)
1967{
08e552c6
KH
1968 int node;
1969
8c7c6e34
KH
1970 if (atomic_read(&mem->refcnt) > 0)
1971 return;
08e552c6
KH
1972
1973
1974 for_each_node_state(node, N_POSSIBLE)
1975 free_mem_cgroup_per_zone_info(mem, node);
1976
c8dad2bb 1977 if (mem_cgroup_size() < PAGE_SIZE)
33327948
KH
1978 kfree(mem);
1979 else
1980 vfree(mem);
1981}
1982
8c7c6e34
KH
1983static void mem_cgroup_get(struct mem_cgroup *mem)
1984{
1985 atomic_inc(&mem->refcnt);
1986}
1987
1988static void mem_cgroup_put(struct mem_cgroup *mem)
1989{
1990 if (atomic_dec_and_test(&mem->refcnt)) {
1991 if (!mem->obsolete)
1992 return;
1993 mem_cgroup_free(mem);
1994 }
1995}
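Together with the comment above mem_cgroup_free(): swap entries pin the memcg through mem_cgroup_get(), and the structure is only released once pre_destroy has marked it obsolete and the last such reference is dropped. A rough user-space sketch of this refcount-plus-obsolete-flag pattern (demo_* names are illustrative):

/* Illustrative user-space sketch, not the kernel implementation. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

struct demo_memcg {
	atomic_int refcnt;	/* references held by swap entries */
	bool obsolete;		/* set once destruction has started */
};

void demo_get(struct demo_memcg *m)
{
	atomic_fetch_add(&m->refcnt, 1);
}

void demo_put(struct demo_memcg *m)
{
	/* free only when the last reference is dropped *and* the
	 * group has already been marked obsolete */
	if (atomic_fetch_sub(&m->refcnt, 1) == 1 && m->obsolete)
		free(m);
}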
1996
33327948 1997
c077719b
KH
1998#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
1999static void __init enable_swap_cgroup(void)
2000{
f8d66542 2001 if (!mem_cgroup_disabled() && really_do_swap_account)
c077719b
KH
2002 do_swap_account = 1;
2003}
2004#else
2005static void __init enable_swap_cgroup(void)
2006{
2007}
2008#endif
2009
8cdea7c0
BS
2010static struct cgroup_subsys_state *
2011mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2012{
28dbc4b6 2013 struct mem_cgroup *mem, *parent;
6d12e2d8 2014 int node;
8cdea7c0 2015
c8dad2bb
JB
2016 mem = mem_cgroup_alloc();
2017 if (!mem)
2018 return ERR_PTR(-ENOMEM);
78fb7466 2019
6d12e2d8
KH
2020 for_each_node_state(node, N_POSSIBLE)
2021 if (alloc_mem_cgroup_per_zone_info(mem, node))
2022 goto free_out;
c077719b 2023 /* root ? */
28dbc4b6 2024 if (cont->parent == NULL) {
c077719b 2025 enable_swap_cgroup();
28dbc4b6 2026 parent = NULL;
18f59ea7 2027 } else {
28dbc4b6 2028 parent = mem_cgroup_from_cont(cont->parent);
18f59ea7
BS
2029 mem->use_hierarchy = parent->use_hierarchy;
2030 }
28dbc4b6 2031
18f59ea7
BS
2032 if (parent && parent->use_hierarchy) {
2033 res_counter_init(&mem->res, &parent->res);
2034 res_counter_init(&mem->memsw, &parent->memsw);
2035 } else {
2036 res_counter_init(&mem->res, NULL);
2037 res_counter_init(&mem->memsw, NULL);
2038 }
14797e23 2039 mem_cgroup_set_inactive_ratio(mem);
6d61ef40
BS
2040 mem->last_scanned_child = NULL;
2041
8cdea7c0 2042 return &mem->css;
6d12e2d8
KH
2043free_out:
2044 for_each_node_state(node, N_POSSIBLE)
1ecaab2b 2045 free_mem_cgroup_per_zone_info(mem, node);
c8dad2bb 2046 mem_cgroup_free(mem);
2dda81ca 2047 return ERR_PTR(-ENOMEM);
8cdea7c0
BS
2048}
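When the parent has use_hierarchy enabled, the new group's res and memsw counters are initialized with the parent's counters as their parents, so a charge against the child also counts against, and is limited by, every ancestor. The sketch below is a user-space analogue of such a parented counter, not the real res_counter API:

/* User-space analogue: charging a child counter also charges every
 * ancestor, as with hierarchically parented res_counters. */
#include <stdbool.h>
#include <stddef.h>

struct demo_counter {
	unsigned long long usage;
	unsigned long long limit;
	struct demo_counter *parent;	/* NULL for the root */
};

bool demo_charge(struct demo_counter *c, unsigned long long val)
{
	struct demo_counter *cur;

	/* check the whole chain first so a failure changes nothing */
	for (cur = c; cur; cur = cur->parent)
		if (cur->usage + val > cur->limit)
			return false;
	for (cur = c; cur; cur = cur->parent)
		cur->usage += val;
	return true;
}

With use_hierarchy clear, the counters are created with a NULL parent (the else branch above) and the group accounts independently of its ancestors.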
2049
df878fb0
KH
2050static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
2051 struct cgroup *cont)
2052{
2053 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
8c7c6e34 2054 mem->obsolete = 1;
c1e862c1 2055 mem_cgroup_force_empty(mem, false);
df878fb0
KH
2056}
2057
8cdea7c0
BS
2058static void mem_cgroup_destroy(struct cgroup_subsys *ss,
2059 struct cgroup *cont)
2060{
33327948 2061 mem_cgroup_free(mem_cgroup_from_cont(cont));
8cdea7c0
BS
2062}
2063
2064static int mem_cgroup_populate(struct cgroup_subsys *ss,
2065 struct cgroup *cont)
2066{
8c7c6e34
KH
2067 int ret;
2068
2069 ret = cgroup_add_files(cont, ss, mem_cgroup_files,
2070 ARRAY_SIZE(mem_cgroup_files));
2071
2072 if (!ret)
2073 ret = register_memsw_files(cont, ss);
2074 return ret;
8cdea7c0
BS
2075}
2076
67e465a7
BS
2077static void mem_cgroup_move_task(struct cgroup_subsys *ss,
2078 struct cgroup *cont,
2079 struct cgroup *old_cont,
2080 struct task_struct *p)
2081{
67e465a7 2082 /*
f9717d28
NK
 2083	 * FIXME: It would be better to move this process's charges from the
 2084	 * old memcg to the new one, but for now that is just on the TODO list.
67e465a7 2085 */
67e465a7
BS
2086}
2087
8cdea7c0
BS
2088struct cgroup_subsys mem_cgroup_subsys = {
2089 .name = "memory",
2090 .subsys_id = mem_cgroup_subsys_id,
2091 .create = mem_cgroup_create,
df878fb0 2092 .pre_destroy = mem_cgroup_pre_destroy,
8cdea7c0
BS
2093 .destroy = mem_cgroup_destroy,
2094 .populate = mem_cgroup_populate,
67e465a7 2095 .attach = mem_cgroup_move_task,
6d12e2d8 2096 .early_init = 0,
8cdea7c0 2097};
c077719b
KH
2098
2099#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2100
2101static int __init disable_swap_account(char *s)
2102{
2103 really_do_swap_account = 0;
2104 return 1;
2105}
2106__setup("noswapaccount", disable_swap_account);
2107#endif
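/*
 * Note: with CONFIG_CGROUP_MEM_RES_CTLR_SWAP enabled, swap accounting
 * defaults to on (really_do_swap_account starts out as 1).  Booting with
 * "noswapaccount" clears it, so enable_swap_cgroup() leaves
 * do_swap_account at 0 and register_memsw_files() never creates the
 * memsw.* control files.
 */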