/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>

struct cgroup_subsys mem_cgroup_subsys;

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
        struct cgroup_subsys_state css;
        /*
         * the counter to account for memory usage
         */
        struct res_counter res;
        /*
         * Per cgroup active and inactive list, similar to the
         * per zone LRU lists.
         * TODO: Consider making these lists per zone
         */
        struct list_head active_list;
        struct list_head inactive_list;
};

/*
 * We use the lower bit of the page->page_cgroup pointer as a bit spin
 * lock. We need to ensure that page->page_cgroup is at least two
 * byte aligned (based on comments from Nick Piggin).
 */
#define PAGE_CGROUP_LOCK_BIT    0x0
#define PAGE_CGROUP_LOCK        (1 << PAGE_CGROUP_LOCK_BIT)

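/*
 * Worked illustration (editorial sketch; the addresses are made up): a
 * kzalloc()ed struct page_cgroup is at least word aligned, so bit 0 of
 * its address is always clear and can double as the lock bit:
 *
 *      page->page_cgroup == 0xffff880012345678   (bit 0 clear: unlocked)
 *      page->page_cgroup == 0xffff880012345679   (bit 0 set:   locked)
 *
 * page_get_page_cgroup() below masks with ~PAGE_CGROUP_LOCK, recovering
 * the same pointer in either state.
 */
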
/*
 * A page_cgroup page is associated with every page descriptor. The
 * page_cgroup helps us identify information about the cgroup.
 */
struct page_cgroup {
        struct list_head lru;           /* per cgroup LRU list */
        struct page *page;
        struct mem_cgroup *mem_cgroup;
        atomic_t ref_cnt;               /* Helpful when pages move between
                                         * mapped and cached states */
};

static inline
struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
        return container_of(cgroup_subsys_state(cont,
                                mem_cgroup_subsys_id), struct mem_cgroup,
                                css);
}

static inline
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
        return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
                                struct mem_cgroup, css);
}

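/*
 * Both lookup helpers rely on container_of(): struct mem_cgroup embeds
 * its cgroup_subsys_state, so a css pointer handed out by the cgroup
 * core converts back by subtracting the member offset. Equivalent
 * open-coded form (illustrative only, not code from this file):
 *
 *      struct cgroup_subsys_state *css =
 *              task_subsys_state(p, mem_cgroup_subsys_id);
 *      struct mem_cgroup *mem = (struct mem_cgroup *)
 *              ((char *)css - offsetof(struct mem_cgroup, css));
 */
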
void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
{
        struct mem_cgroup *mem;

        mem = mem_cgroup_from_task(p);
        css_get(&mem->css);
        mm->mem_cgroup = mem;
}

void mm_free_cgroup(struct mm_struct *mm)
{
        css_put(&mm->mem_cgroup->css);
}

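/*
 * This pair pins the mem_cgroup for the lifetime of an mm_struct:
 * css_get() in mm_init_cgroup() guarantees the cgroup outlives every mm
 * that points at it, and mm_free_cgroup() drops that reference. The
 * callers are presumably the mm setup/teardown paths in kernel/fork.c
 * (an inference from the pairing; this file does not say).
 */
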
static inline int page_cgroup_locked(struct page *page)
{
        return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
                                        &page->page_cgroup);
}

void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
{
        int locked;

        /*
         * While resetting the page_cgroup we might not hold the
         * page_cgroup lock. free_hot_cold_page() is an example
         * of such a scenario.
         */
        if (pc)
                VM_BUG_ON(!page_cgroup_locked(page));
        locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
        page->page_cgroup = ((unsigned long)pc | locked);
}

struct page_cgroup *page_get_page_cgroup(struct page *page)
{
        return (struct page_cgroup *)
                (page->page_cgroup & ~PAGE_CGROUP_LOCK);
}

void __always_inline lock_page_cgroup(struct page *page)
{
        bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
        VM_BUG_ON(!page_cgroup_locked(page));
}

void __always_inline unlock_page_cgroup(struct page *page)
{
        bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}

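/*
 * Sketch of the intended protocol (hypothetical caller, editorial):
 * every writer of page->page_cgroup takes the bit lock first, so a
 * reader holding the lock sees a stable pointer:
 *
 *      lock_page_cgroup(page);
 *      pc = page_get_page_cgroup(page);
 *      ... inspect pc, or install a replacement ...
 *      page_assign_page_cgroup(page, new_pc);
 *      unlock_page_cgroup(page);
 */
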
/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
{
        struct mem_cgroup *mem;
        struct page_cgroup *pc, *race_pc;

        /*
         * Should page_cgroups go to their own slab?
         * One could optimize the performance of the charging routine
         * by saving a bit in the page_flags and using it as a lock
         * to see if the cgroup page already has a page_cgroup associated
         * with it.
         */
        lock_page_cgroup(page);
        pc = page_get_page_cgroup(page);
        /*
         * The page_cgroup exists and the page has already been accounted.
         */
        if (pc) {
                atomic_inc(&pc->ref_cnt);
                goto done;
        }

        unlock_page_cgroup(page);

        pc = kzalloc(sizeof(struct page_cgroup), GFP_KERNEL);
        if (pc == NULL)
                return -ENOMEM; /* page_cgroup lock was already dropped
                                 * above; unlocking again here would be
                                 * a double unlock */

        rcu_read_lock();
        /*
         * We always charge the cgroup the mm_struct belongs to.
         * The mm_struct's mem_cgroup changes on task migration if the
         * thread group leader migrates. It's possible that mm is not
         * set, if so charge the init_mm (happens for pagecache usage).
         */
        if (!mm)
                mm = &init_mm;

        mem = rcu_dereference(mm->mem_cgroup);
        /*
         * For every charge from the cgroup, increment reference
         * count.
         */
        css_get(&mem->css);
        rcu_read_unlock();

        /*
         * If we created the page_cgroup, we should free it on exceeding
         * the cgroup limit.
         */
        if (res_counter_charge(&mem->res, 1)) {
                css_put(&mem->css);
                goto free_pc;
        }

        lock_page_cgroup(page);
        /*
         * Check if somebody else beat us to allocating the page_cgroup.
         */
        race_pc = page_get_page_cgroup(page);
        if (race_pc) {
                kfree(pc);
                pc = race_pc;
                atomic_inc(&pc->ref_cnt);
                res_counter_uncharge(&mem->res, 1);
                css_put(&mem->css);
                goto done;
        }

        atomic_set(&pc->ref_cnt, 1);
        pc->mem_cgroup = mem;
        pc->page = page;
        page_assign_page_cgroup(page, pc);

done:
        unlock_page_cgroup(page);
        return 0;
free_pc:
        kfree(pc);
        return -ENOMEM;
}

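/*
 * Illustrative caller (hypothetical, editorial; modelled on how a fault
 * handler or page-cache insertion path might drive this API):
 *
 *      if (mem_cgroup_charge(page, mm))
 *              return -ENOMEM;         (cgroup was over its limit)
 *      ... install the page ...
 *      mem_cgroup_uncharge(page_get_page_cgroup(page));   (release path)
 *
 * A successful charge leaves pc->ref_cnt elevated, so every charge must
 * eventually be balanced by exactly one uncharge.
 */
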
/*
 * Uncharging is always a welcome operation; we never complain, we
 * simply uncharge.
 */
void mem_cgroup_uncharge(struct page_cgroup *pc)
{
        struct mem_cgroup *mem;
        struct page *page;

        if (!pc)
                return;

        if (atomic_dec_and_test(&pc->ref_cnt)) {
                page = pc->page;
                lock_page_cgroup(page);
                mem = pc->mem_cgroup;
                css_put(&mem->css);
                page_assign_page_cgroup(page, NULL);
                unlock_page_cgroup(page);
                res_counter_uncharge(&mem->res, 1);
                kfree(pc);
        }
}

static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
                        struct file *file, char __user *userbuf, size_t nbytes,
                        loff_t *ppos)
{
        return res_counter_read(&mem_cgroup_from_cont(cont)->res,
                                cft->private, userbuf, nbytes, ppos);
}

static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
                        struct file *file, const char __user *userbuf,
                        size_t nbytes, loff_t *ppos)
{
        return res_counter_write(&mem_cgroup_from_cont(cont)->res,
                                cft->private, userbuf, nbytes, ppos);
}

static struct cftype mem_cgroup_files[] = {
        {
                .name = "usage",
                .private = RES_USAGE,
                .read = mem_cgroup_read,
        },
        {
                .name = "limit",
                .private = RES_LIMIT,
                .write = mem_cgroup_write,
                .read = mem_cgroup_read,
        },
        {
                .name = "failcnt",
                .private = RES_FAILCNT,
                .read = mem_cgroup_read,
        },
};

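/*
 * These cftype entries become per-cgroup control files once the
 * subsystem is mounted. Hypothetical shell session (the mount point is
 * an assumption; file names follow .name above plus the "memory"
 * subsystem name below):
 *
 *      # mount -t cgroup -o memory none /cgroups
 *      # mkdir /cgroups/g1
 *      # cat /cgroups/g1/memory.usage
 *      # echo 4096 > /cgroups/g1/memory.limit
 *
 * Since mem_cgroup_charge() calls res_counter_charge(&mem->res, 1) per
 * page, usage and limit here are denominated in pages, not bytes.
 */
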
static struct mem_cgroup init_mem_cgroup;

static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
        struct mem_cgroup *mem;

        if (unlikely((cont->parent) == NULL)) {
                mem = &init_mem_cgroup;
                init_mm.mem_cgroup = mem;
        } else
                mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);

        if (mem == NULL)
                return NULL;

        res_counter_init(&mem->res);
        INIT_LIST_HEAD(&mem->active_list);
        INIT_LIST_HEAD(&mem->inactive_list);
        return &mem->css;
}

static void mem_cgroup_destroy(struct cgroup_subsys *ss,
                                struct cgroup *cont)
{
        kfree(mem_cgroup_from_cont(cont));
}

static int mem_cgroup_populate(struct cgroup_subsys *ss,
                                struct cgroup *cont)
{
        return cgroup_add_files(cont, ss, mem_cgroup_files,
                                        ARRAY_SIZE(mem_cgroup_files));
}

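/*
 * Registration note (editorial): .early_init = 1 asks the cgroup core
 * to initialize this subsystem early in boot, before general mm use,
 * which matters because mem_cgroup_create() wires up init_mm.mem_cgroup
 * for the root cgroup. The root cgroup is never removed, so the static
 * init_mem_cgroup is never passed to kfree() by mem_cgroup_destroy().
 */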
struct cgroup_subsys mem_cgroup_subsys = {
        .name = "memory",
        .subsys_id = mem_cgroup_subsys_id,
        .create = mem_cgroup_create,
        .destroy = mem_cgroup_destroy,
        .populate = mem_cgroup_populate,
        .early_init = 1,
};