/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>

struct cgroup_subsys mem_cgroup_subsys;

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
        struct cgroup_subsys_state css;
        /*
         * the counter to account for memory usage
         */
        struct res_counter res;
        /*
         * Per cgroup active and inactive list, similar to the
         * per zone LRU lists.
         * TODO: Consider making these lists per zone
         */
        struct list_head active_list;
        struct list_head inactive_list;
};

/*
 * We use the lower bit of the page->page_cgroup pointer as a bit spin
 * lock. We need to ensure that page->page_cgroup is at least two
 * byte aligned (based on comments from Nick Piggin).
 */
#define PAGE_CGROUP_LOCK_BIT    0x0
#define PAGE_CGROUP_LOCK        (1 << PAGE_CGROUP_LOCK_BIT)

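/*
 * Worked illustration (editorial sketch; the addresses are made up): a
 * kzalloc()ed struct page_cgroup is at least word aligned, so bit 0 of
 * its address is always clear and can double as the lock bit:
 *
 *      page->page_cgroup == 0xffff880012345678   (bit 0 clear: unlocked)
 *      page->page_cgroup == 0xffff880012345679   (bit 0 set:   locked)
 *
 * page_get_page_cgroup() below masks with ~PAGE_CGROUP_LOCK, recovering
 * the same pointer in either state.
 */
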
/*
 * A page_cgroup page is associated with every page descriptor. The
 * page_cgroup helps us identify information about the cgroup.
 */
struct page_cgroup {
        struct list_head lru;           /* per cgroup LRU list */
        struct page *page;
        struct mem_cgroup *mem_cgroup;
        atomic_t ref_cnt;               /* Helpful when pages move between
                                         * mapped and cached states */
};

static inline
struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
        return container_of(cgroup_subsys_state(cont,
                                mem_cgroup_subsys_id), struct mem_cgroup,
                                css);
}

static inline
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
        return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
                                struct mem_cgroup, css);
}

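/*
 * Both lookup helpers rely on container_of(): struct mem_cgroup embeds
 * its cgroup_subsys_state, so a css pointer handed out by the cgroup
 * core converts back by subtracting the member offset. Equivalent
 * open-coded form (illustrative only, not code from this file):
 *
 *      struct cgroup_subsys_state *css =
 *              task_subsys_state(p, mem_cgroup_subsys_id);
 *      struct mem_cgroup *mem = (struct mem_cgroup *)
 *              ((char *)css - offsetof(struct mem_cgroup, css));
 */
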
void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
{
        struct mem_cgroup *mem;

        mem = mem_cgroup_from_task(p);
        css_get(&mem->css);
        mm->mem_cgroup = mem;
}

void mm_free_cgroup(struct mm_struct *mm)
{
        css_put(&mm->mem_cgroup->css);
}

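/*
 * This pair pins the mem_cgroup for the lifetime of an mm_struct:
 * css_get() in mm_init_cgroup() guarantees the cgroup outlives every mm
 * that points at it, and mm_free_cgroup() drops that reference. The
 * callers are presumably the mm setup/teardown paths in kernel/fork.c
 * (an inference from the pairing; this file does not say).
 */
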
static inline int page_cgroup_locked(struct page *page)
{
        return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
                                        &page->page_cgroup);
}

void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
{
        int locked;

        /*
         * While resetting the page_cgroup we might not hold the
         * page_cgroup lock. free_hot_cold_page() is an example
         * of such a scenario.
         */
        if (pc)
                VM_BUG_ON(!page_cgroup_locked(page));
        locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
        page->page_cgroup = ((unsigned long)pc | locked);
}

struct page_cgroup *page_get_page_cgroup(struct page *page)
{
        return (struct page_cgroup *)
                (page->page_cgroup & ~PAGE_CGROUP_LOCK);
}

void __always_inline lock_page_cgroup(struct page *page)
{
        bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
        VM_BUG_ON(!page_cgroup_locked(page));
}

void __always_inline unlock_page_cgroup(struct page *page)
{
        bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}

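/*
 * Sketch of the intended protocol (hypothetical caller, editorial):
 * every writer of page->page_cgroup takes the bit lock first, so a
 * reader holding the lock sees a stable pointer:
 *
 *      lock_page_cgroup(page);
 *      pc = page_get_page_cgroup(page);
 *      ... inspect pc, or install a replacement ...
 *      page_assign_page_cgroup(page, new_pc);
 *      unlock_page_cgroup(page);
 */
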
/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
{
        struct mem_cgroup *mem;
        struct page_cgroup *pc, *race_pc;

        /*
         * Should page_cgroups go to their own slab?
         * One could optimize the performance of the charging routine
         * by saving a bit in the page_flags and using it as a lock
         * to see if the cgroup page already has a page_cgroup associated
         * with it.
         */
        lock_page_cgroup(page);
        pc = page_get_page_cgroup(page);
        /*
         * The page_cgroup exists and the page has already been accounted.
         */
        if (pc) {
                atomic_inc(&pc->ref_cnt);
                goto done;
        }

        unlock_page_cgroup(page);

        pc = kzalloc(sizeof(struct page_cgroup), GFP_KERNEL);
        if (pc == NULL)
                return -ENOMEM; /* page_cgroup lock was already dropped
                                 * above; unlocking again here would be
                                 * a double unlock */

        rcu_read_lock();
        /*
         * We always charge the cgroup the mm_struct belongs to.
         * The mm_struct's mem_cgroup changes on task migration if the
         * thread group leader migrates. It's possible that mm is not
         * set, if so charge the init_mm (happens for pagecache usage).
         */
        if (!mm)
                mm = &init_mm;

        mem = rcu_dereference(mm->mem_cgroup);
        /*
         * For every charge from the cgroup, increment reference
         * count.
         */
        css_get(&mem->css);
        rcu_read_unlock();

        /*
         * If we created the page_cgroup, we should free it on exceeding
         * the cgroup limit.
         */
        if (res_counter_charge(&mem->res, 1)) {
                css_put(&mem->css);
                goto free_pc;
        }

        lock_page_cgroup(page);
        /*
         * Check if somebody else beat us to allocating the page_cgroup.
         */
        race_pc = page_get_page_cgroup(page);
        if (race_pc) {
                kfree(pc);
                pc = race_pc;
                atomic_inc(&pc->ref_cnt);
                res_counter_uncharge(&mem->res, 1);
                css_put(&mem->css);
                goto done;
        }

        atomic_set(&pc->ref_cnt, 1);
        pc->mem_cgroup = mem;
        pc->page = page;
        page_assign_page_cgroup(page, pc);

done:
        unlock_page_cgroup(page);
        return 0;
free_pc:
        kfree(pc);
        return -ENOMEM;
}

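/*
 * Illustrative caller (hypothetical, editorial; modelled on how a fault
 * handler or page-cache insertion path might drive this API):
 *
 *      if (mem_cgroup_charge(page, mm))
 *              return -ENOMEM;         (cgroup was over its limit)
 *      ... install the page ...
 *      mem_cgroup_uncharge(page_get_page_cgroup(page));   (release path)
 *
 * A successful charge leaves pc->ref_cnt elevated, so every charge must
 * eventually be balanced by exactly one uncharge.
 */
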
/*
 * Uncharging is always a welcome operation; we never complain, we
 * simply uncharge.
 */
void mem_cgroup_uncharge(struct page_cgroup *pc)
{
        struct mem_cgroup *mem;
        struct page *page;

        if (!pc)
                return;

        if (atomic_dec_and_test(&pc->ref_cnt)) {
                page = pc->page;
                lock_page_cgroup(page);
                mem = pc->mem_cgroup;
                css_put(&mem->css);
                page_assign_page_cgroup(page, NULL);
                unlock_page_cgroup(page);
                res_counter_uncharge(&mem->res, 1);
                kfree(pc);
        }
}

static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
                        struct file *file, char __user *userbuf, size_t nbytes,
                        loff_t *ppos)
{
        return res_counter_read(&mem_cgroup_from_cont(cont)->res,
                                cft->private, userbuf, nbytes, ppos);
}

static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
                        struct file *file, const char __user *userbuf,
                        size_t nbytes, loff_t *ppos)
{
        return res_counter_write(&mem_cgroup_from_cont(cont)->res,
                                cft->private, userbuf, nbytes, ppos);
}

static struct cftype mem_cgroup_files[] = {
        {
                .name = "usage",
                .private = RES_USAGE,
                .read = mem_cgroup_read,
        },
        {
                .name = "limit",
                .private = RES_LIMIT,
                .write = mem_cgroup_write,
                .read = mem_cgroup_read,
        },
        {
                .name = "failcnt",
                .private = RES_FAILCNT,
                .read = mem_cgroup_read,
        },
};

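/*
 * These cftype entries become per-cgroup control files once the
 * subsystem is mounted. Hypothetical shell session (the mount point is
 * an assumption; file names follow .name above plus the "memory"
 * subsystem name below):
 *
 *      # mount -t cgroup -o memory none /cgroups
 *      # mkdir /cgroups/g1
 *      # cat /cgroups/g1/memory.usage
 *      # echo 4096 > /cgroups/g1/memory.limit
 *
 * Since mem_cgroup_charge() calls res_counter_charge(&mem->res, 1) per
 * page, usage and limit here are denominated in pages, not bytes.
 */
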
static struct mem_cgroup init_mem_cgroup;

static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
        struct mem_cgroup *mem;

        if (unlikely((cont->parent) == NULL)) {
                mem = &init_mem_cgroup;
                init_mm.mem_cgroup = mem;
        } else
                mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);

        if (mem == NULL)
                return NULL;

        res_counter_init(&mem->res);
        INIT_LIST_HEAD(&mem->active_list);
        INIT_LIST_HEAD(&mem->inactive_list);
        return &mem->css;
}

static void mem_cgroup_destroy(struct cgroup_subsys *ss,
                                struct cgroup *cont)
{
        kfree(mem_cgroup_from_cont(cont));
}

static int mem_cgroup_populate(struct cgroup_subsys *ss,
                                struct cgroup *cont)
{
        return cgroup_add_files(cont, ss, mem_cgroup_files,
                                        ARRAY_SIZE(mem_cgroup_files));
}

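/*
 * Registration note (editorial): .early_init = 1 asks the cgroup core
 * to initialize this subsystem early in boot, before general mm use,
 * which matters because mem_cgroup_create() wires up init_mm.mem_cgroup
 * for the root cgroup. The root cgroup is never removed, so the static
 * init_mem_cgroup is never passed to kfree() by mem_cgroup_destroy().
 */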
struct cgroup_subsys mem_cgroup_subsys = {
        .name = "memory",
        .subsys_id = mem_cgroup_subsys_id,
        .create = mem_cgroup_create,
        .destroy = mem_cgroup_destroy,
        .populate = mem_cgroup_populate,
        .early_init = 1,
};