/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a process counter
 *		is used.
 *
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 *		FIXME: memory is allocated starting with the first node
 *		to the last. It would be better if bind would truly restrict
 *		the allocation to memory nodes instead
 *
 * preferred	Try a specific node first before normal fallback.
 *		As a special case node -1 here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non default
 *		process policy.
 *
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
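
/*
 * For illustration only -- a rough userspace sketch, not kernel code: these
 * policies are normally requested through the set_mempolicy()/mbind()
 * system calls or the libnuma wrappers, e.g. approximately:
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	-- nodes 0 and 1
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, 64);
 *
 * with maxnode large enough to cover the highest node bit used.
 */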

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

/* Check that the nodemask contains at least one populated zone */
static int is_valid_nodemask(const nodemask_t *nodemask)
{
	int nd, k;

	/* Check that there is something useful in this mask */
	k = policy_zone;

	for_each_node_mask(nd, *nodemask) {
		struct zone *z;

		for (k = 0; k <= policy_zone; k++) {
			z = &NODE_DATA(nd)->node_zones[k];
			if (z->present_pages > 0)
				return 1;
		}
	}

	return 0;
}

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
}

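/*
 * Map a nodemask given relative to the task's mems_allowed onto the actual
 * allowed nodes: e.g. a relative mask of {0,1} with mems_allowed = {4,6}
 * yields {4,6}; bit N selects the Nth allowed node, wrapping via nodes_fold().
 */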
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}

static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!nodes)
		pol->v.preferred_node = -1;	/* local allocation */
	else if (nodes_empty(*nodes))
		return -EINVAL;			/* no allowed nodes */
	else
		pol->v.preferred_node = first_node(*nodes);
	return 0;
}

static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!is_valid_nodemask(nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

/* Create a new policy */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;
	nodemask_t cpuset_context_nmask;
	int ret;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);
			nodes = NULL;	/* flag local alloc */
		}
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->policy = mode;
	policy->flags = flags;

	if (nodes) {
		/*
		 * cpuset related setup doesn't apply to local allocation
		 */
		cpuset_update_task_memory_state();
		if (flags & MPOL_F_RELATIVE_NODES)
			mpol_relative_nodemask(&cpuset_context_nmask, nodes,
					       &cpuset_current_mems_allowed);
		else
			nodes_and(cpuset_context_nmask, *nodes,
				  cpuset_current_mems_allowed);
		if (mpol_store_user_nodemask(policy))
			policy->w.user_nodemask = *nodes;
		else
			policy->w.cpuset_mems_allowed =
						cpuset_mems_allowed(current);
	}

	ret = mpol_ops[mode].create(policy,
				nodes ? &cpuset_context_nmask : NULL);
	if (ret < 0) {
		kmem_cache_free(policy_cache, policy);
		return ERR_PTR(ret);
	}
	return policy;
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

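/*
 * Rebind an interleave/bind policy when the owning task's cpuset changes:
 * MPOL_F_STATIC_NODES intersects the user's original mask with the new
 * allowed nodes, MPOL_F_RELATIVE_NODES re-folds it onto the new nodes, and
 * the default behaviour remaps node numbers from the old cpuset placement
 * to the new one.
 */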
static void mpol_rebind_nodemask(struct mempolicy *pol,
				 const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
			    *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	pol->v.nodes = tmp;
	if (!node_isset(current->il_next, tmp)) {
		current->il_next = next_node(current->il_next, tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = first_node(tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = numa_node_id();
	}
}

static void mpol_rebind_preferred(struct mempolicy *pol,
				  const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES) {
		int node = first_node(pol->w.user_nodemask);

		if (node_isset(node, *nodes))
			pol->v.preferred_node = node;
		else
			pol->v.preferred_node = -1;
	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
		pol->v.preferred_node = first_node(tmp);
	} else if (pol->v.preferred_node != -1) {
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						   pol->w.cpuset_mems_allowed,
						   *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}
}

/* Migrate a policy to a different set of nodes */
static void mpol_rebind_policy(struct mempolicy *pol,
			       const nodemask_t *newmask)
{
	if (!pol)
		return;
	if (!mpol_store_user_nodemask(pol) &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;
	mpol_ops[pol->policy].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 */

void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;

	down_write(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		mpol_rebind_policy(vma->vm_policy, new);
	up_write(&mm->mmap_sem);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
};

static void gather_stats(struct page *, void *, int pte_dirty);
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

/* Scan through pages checking if pages follow certain conditions. */
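/*
 * With MPOL_MF_INVERT the walk collects the pages that are *not* on the
 * requested nodes (this is how do_mbind() finds pages to migrate); without
 * it, pages that are on the given nodes are gathered/checked.
 */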
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		struct page *page;
		int nid;

		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		/*
		 * The check for PageReserved here is important to avoid
		 * handling zero pages and other pages that may have been
		 * marked special by the system.
		 *
		 * If the PageReserved would not be checked here then f.e.
		 * the location of the zero page could have an influence
		 * on MPOL_MF_STRICT, zero pages would be counted for
		 * the per node stats, and there would be useless attempts
		 * to put zero pages on the migration list.
		 */
		if (PageReserved(page))
			continue;
		nid = page_to_nid(page);
		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
			continue;

		if (flags & MPOL_MF_STATS)
			gather_stats(page, private, pte_dirty(*pte));
		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
			migrate_page_add(page, private, flags);
		else
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}

static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		if (check_pte_range(vma, pmd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (check_pmd_range(vma, pud, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int check_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (check_pud_range(vma, pgd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

/*
 * Check if all pages in a range are on a set of nodes.
 * If pagelist != NULL then isolate pages from the LRU and
 * put them on the pagelist.
 */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		const nodemask_t *nodes, unsigned long flags, void *private)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {

		err = migrate_prep();
		if (err)
			return ERR_PTR(err);
	}

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
			if (!vma->vm_next && vma->vm_end < end)
				return ERR_PTR(-EFAULT);
			if (prev && prev->vm_end < vma->vm_start)
				return ERR_PTR(-EFAULT);
		}
		if (!is_vm_hugetlb_page(vma) &&
		    ((flags & MPOL_MF_STRICT) ||
		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
				vma_migratable(vma)))) {
			unsigned long endvma = vma->vm_end;

			if (endvma > end)
				endvma = end;
			if (vma->vm_start > start)
				start = vma->vm_start;
			err = check_pgd_range(vma, start, endvma, nodes,
						flags, private);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_put(old);
	}
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}

/*
 * Update task->flags PF_MEMPOLICY bit: set iff non-default
 * mempolicy.  Allows more rapid checking of this (combined perhaps
 * with other PF_* flag bits) on memory allocation hot code paths.
 *
 * If called from outside this file, the task 'p' should -only- be
 * a newly forked child not yet visible on the task list, because
 * manipulating the task flags of a visible task is not safe.
 *
 * The above limitation is why this routine has the funny name
 * mpol_fix_fork_child_flag().
 *
 * It is also safe to call this with a task pointer of current,
 * which the static wrapper mpol_set_task_struct_flag() does,
 * for use within this file.
 */

void mpol_fix_fork_child_flag(struct task_struct *p)
{
	if (p->mempolicy)
		p->flags |= PF_MEMPOLICY;
	else
		p->flags &= ~PF_MEMPOLICY;
}

static void mpol_set_task_struct_flag(void)
{
	mpol_fix_fork_child_flag(current);
}

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_put(current->mempolicy);
	current->mempolicy = new;
	mpol_set_task_struct_flag();
	if (new && new->policy == MPOL_INTERLEAVE &&
	    nodes_weight(new->v.nodes))
		current->il_next = first_node(new->v.nodes);
	return 0;
}

/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	switch (p->policy) {
	case MPOL_DEFAULT:
		break;
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of memory_map? */
		if (p->v.preferred_node < 0)
			*nodes = node_states[N_HIGH_MEMORY];
		else
			node_set(p->v.preferred_node, *nodes);
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	cpuset_update_task_memory_state();
	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		*nmask  = cpuset_current_mems_allowed;
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->policy == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		*policy = pol->policy | pol->flags;

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask)
		get_zonemask(pol, nmask);

 out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}

#ifdef CONFIG_MIGRATION
/*
 * page migration
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	/*
	 * Avoid migrating a page that is shared with others.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
		isolate_lru_page(page, pagelist);
}

static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
			   int flags)
{
	nodemask_t nmask;
	LIST_HEAD(pagelist);
	int err = 0;

	nodes_clear(nmask);
	node_set(source, nmask);

	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist))
		err = migrate_pages(&pagelist, new_node_page, dest);

	return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	LIST_HEAD(pagelist);
	int busy = 0;
	int err = 0;
	nodemask_t tmp;

	down_read(&mm->mmap_sem);

	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
	if (err)
		goto out;

/*
 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 * bit in 'tmp', and return that <source, dest> pair for migration.
 * The pair of nodemasks 'to' and 'from' define the map.
 *
 * If no pair of bits is found that way, fallback to picking some
 * pair of 'source' and 'dest' bits that are not the same.  If the
 * 'source' and 'dest' bits are the same, this represents a node
 * that will be migrating to itself, so no pages need move.
 *
 * If no bits are left in 'tmp', or if all remaining bits left
 * in 'tmp' correspond to the same bit in 'to', return false
 * (nothing left to migrate).
 *
 * This lets us pick a pair of nodes to migrate between, such that
 * if possible the dest node is not already occupied by some other
 * source node, minimizing the risk of overloading the memory on a
 * node that would happen if we migrated incoming memory to a node
 * before migrating outgoing memory source that same node.
 *
 * A single scan of tmp is sufficient.  As we go, we remember the
 * most recent <s, d> pair that moved (s != d).  If we find a pair
 * that not only moved, but what's better, moved to an empty slot
 * (d is not set in tmp), then we break out then, with that pair.
 * Otherwise when we finish scanning from_tmp, we at least have the
 * most recent <s, d> pair that moved.  If we get all the way through
 * the scan of tmp without finding any node that moved, much less
 * moved to an empty node, then there is nothing left worth migrating.
 */
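/* Example: from = {0,1}, to = {1,2}.  The scan settles on (1 -> 2) first,
 * since node 2 is not a remaining source, then (0 -> 1), so node 1 is
 * drained before new pages are moved onto it. */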

	tmp = *from_nodes;
	while (!nodes_empty(tmp)) {
		int s,d;
		int source = -1;
		int dest = 0;

		for_each_node_mask(s, tmp) {
			d = node_remap(s, *from_nodes, *to_nodes);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == -1)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			busy += err;
		if (err < 0)
			break;
	}
out:
	up_read(&mm->mmap_sem);
	if (err < 0)
		return err;
	return busy;

}

/*
 * Allocate a new page for page migration based on vma policy.
 * Start assuming that page is mapped by vma pointed to by @private.
 * Search forward from there, if not.  N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	struct vm_area_struct *vma = (struct vm_area_struct *)private;
	unsigned long uninitialized_var(address);

	while (vma) {
		address = page_address_in_vma(page, vma);
		if (address != -EFAULT)
			break;
		vma = vma->vm_next;
	}

	/*
	 * if !vma, alloc_page_vma() will use task or system default policy
	 */
	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
}
#else

static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
}

int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	return -ENOSYS;
}

static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	return NULL;
}
#endif

static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
		 start, start + len, mode, mode_flags,
		 nmask ? nodes_addr(*nmask)[0] : -1);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);

	err = PTR_ERR(vma);
	if (!IS_ERR(vma)) {
		int nr_failed = 0;

		err = mbind_range(vma, start, end, new);

		if (!list_empty(&pagelist))
			nr_failed = migrate_pages(&pagelist, new_vma_page,
						(unsigned long)vma);

		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
			err = -EIO;
	}

	up_write(&mm->mmap_sem);
	mpol_put(new);
	return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
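/*
 * The user-supplied mask may be longer than MAX_NUMNODES: any set bit beyond
 * what the kernel supports is rejected with -EINVAL, while bits past the
 * caller's maxnode in the last copied word are simply cleared via 'endmask'.
 */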
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			unsigned long mode,
			unsigned long __user *nmask, unsigned long maxnode,
			unsigned flags)
{
	nodemask_t nodes;
	int err;
	unsigned short mode_flags;

	mode_flags = mode & MPOL_MODE_FLAGS;
	mode &= ~MPOL_MODE_FLAGS;
	if (mode >= MPOL_MAX)
		return -EINVAL;
	if ((mode_flags & MPOL_F_STATIC_NODES) &&
	    (mode_flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
		unsigned long maxnode)
{
	int err;
	nodemask_t nodes;
	unsigned short flags;

	flags = mode & MPOL_MODE_FLAGS;
	mode &= ~MPOL_MODE_FLAGS;
	if ((unsigned int)mode >= MPOL_MAX)
		return -EINVAL;
	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_set_mempolicy(mode, flags, &nodes);
}

asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
		const unsigned long __user *old_nodes,
		const unsigned long __user *new_nodes)
{
	struct mm_struct *mm;
	struct task_struct *task;
	nodemask_t old;
	nodemask_t new;
	nodemask_t task_nodes;
	int err;

	err = get_nodes(&old, old_nodes, maxnode);
	if (err)
		return err;

	err = get_nodes(&new, new_nodes, maxnode);
	if (err)
		return err;

	/* Find the mm_struct */
	read_lock(&tasklist_lock);
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		read_unlock(&tasklist_lock);
		return -ESRCH;
	}
	mm = get_task_mm(task);
	read_unlock(&tasklist_lock);

	if (!mm)
		return -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	if ((current->euid != task->suid) && (current->euid != task->uid) &&
	    (current->uid != task->suid) && (current->uid != task->uid) &&
	    !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out;
	}

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out;
	}

	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
		err = -EINVAL;
		goto out;
	}

	err = security_task_movememory(task);
	if (err)
		goto out;

	err = do_migrate_pages(mm, &old, &new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
out:
	mmput(mm);
	return err;
}


/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr, unsigned long flags)
{
	int err;
	int uninitialized_var(pval);
	nodemask_t nodes;

	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}

#ifdef CONFIG_COMPAT

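/*
 * The compat wrappers convert the 32-bit nodemask layout into the native
 * one in a scratch buffer obtained with compat_alloc_user_space() and then
 * re-enter the regular syscalls above.
 */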
asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/*
 * get_vma_policy(@task, @vma, @addr)
 * @task - task for fallback if vma policy == default
 * @vma  - virtual memory area whose policy is sought
 * @addr - address in @vma for shared policy lookup
 *
 * Returns effective policy for a VMA at specified address.
 * Falls back to @task or system default policy, as necessary.
 * Returned policy has extra reference count if shared, vma,
 * or some other task's policy [show_numa_maps() can pass
 * @task != current].  It is the caller's responsibility to
 * free the reference in these cases.
 */
static struct mempolicy * get_vma_policy(struct task_struct *task,
		struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;
	int shared_pol = 0;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy) {
			pol = vma->vm_ops->get_policy(vma, addr);
			shared_pol = 1;	/* if pol non-NULL, add ref below */
		} else if (vma->vm_policy &&
				vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	else if (!shared_pol && pol != current->mempolicy)
		mpol_get(pol);	/* vma or other task's policy */
	return pol;
}

/* Return a nodemask representing a mempolicy */
static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
{
	/* Lower zones don't get a nodemask applied for MPOL_BIND */
	if (unlikely(policy->policy == MPOL_BIND) &&
			gfp_zone(gfp) >= policy_zone &&
			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
		return &policy->v.nodes;

	return NULL;
}

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/*
		 * Normally, MPOL_BIND allocations are node-local within the
		 * allowed nodemask. However, if __GFP_THISNODE is set and the
		 * current node isn't part of the mask, we use the zonelist
		 * for the first node in the mask instead.
		 */
		nd = numa_node_id();
		if (unlikely(gfp & __GFP_THISNODE) &&
				unlikely(!node_isset(nd, policy->v.nodes)))
			nd = first_node(policy->v.nodes);
		break;
	case MPOL_INTERLEAVE: /* should not happen */
	case MPOL_DEFAULT:
		nd = numa_node_id();
		break;
	default:
		nd = 0;
		BUG();
	}
	return node_zonelist(nd, gfp);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);
	if (next < MAX_NUMNODES)
		me->il_next = next;
	return nid;
}

/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 */
unsigned slab_node(struct mempolicy *policy)
{
	unsigned short pol = policy ? policy->policy : MPOL_DEFAULT;

	switch (pol) {
	case MPOL_INTERLEAVE:
		return interleave_nodes(policy);

	case MPOL_BIND: {
		/*
		 * Follow bind policy behavior and start allocation at the
		 * first node.
		 */
		struct zonelist *zonelist;
		struct zone *zone;
		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
		(void)first_zones_zonelist(zonelist, highest_zoneidx,
							&policy->v.nodes,
							&zone);
		return zone->node;
	}

	case MPOL_PREFERRED:
		if (policy->v.preferred_node >= 0)
			return policy->v.preferred_node;
		/* Fall through */

	default:
		return numa_node_id();
	}
}

/* Do static interleaving for a VMA with known offset. */
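/* E.g. with three nodes in the mask and off == 7, target = 7 % 3 = 1, so the
   second set node is returned. */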
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target;
	int c;
	int nid = -1;

	if (!nnodes)
		return numa_node_id();
	target = (unsigned int)off % nnodes;
	c = 0;
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
	return nid;
}

/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
		 struct vm_area_struct *vma, unsigned long addr, int shift)
{
	if (vma) {
		unsigned long off;

		/*
		 * for small pages, there is no difference between
		 * shift and PAGE_SHIFT, so the bit-shift is safe.
		 * for huge pages, since vm_pgoff is in units of small
		 * pages, we need to shift off the always 0 bits to get
		 * a useful offset.
		 */
		BUG_ON(shift < PAGE_SHIFT);
		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
		off += (addr - vma->vm_start) >> shift;
		return offset_il_node(pol, vma, off);
	} else
		return interleave_nodes(pol);
}

#ifdef CONFIG_HUGETLBFS
/*
 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
 * @vma = virtual memory area whose policy is sought
 * @addr = address in @vma for shared policy lookup and interleave policy
 * @gfp_flags = for requested zone
 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
 *
 * Returns a zonelist suitable for a huge page allocation.
 * If the effective policy is 'BIND, returns pointer to local node's zonelist,
 * and a pointer to the mempolicy's @nodemask for filtering the zonelist.
 * If it is also a policy for which get_vma_policy() returns an extra
 * reference, we must hold that reference until after the allocation.
 * In that case, return policy via @mpol so hugetlb allocation can drop
 * the reference. For non-'BIND referenced policies, we can/do drop the
 * reference here, so the caller doesn't need to know about the special case
 * for default and current task policy.
 */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
				gfp_t gfp_flags, struct mempolicy **mpol,
				nodemask_t **nodemask)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);
	struct zonelist *zl;

	*mpol = NULL;		/* probably no unref needed */
	*nodemask = NULL;	/* assume !MPOL_BIND */
	if (pol->policy == MPOL_BIND) {
		*nodemask = &pol->v.nodes;
	} else if (pol->policy == MPOL_INTERLEAVE) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
		if (unlikely(pol != &default_policy &&
				pol != current->mempolicy))
			__mpol_put(pol);	/* finished with pol */
		return node_zonelist(nid, gfp_flags);
	}

	zl = zonelist_policy(GFP_HIGHUSER, pol);
	if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
		if (pol->policy != MPOL_BIND)
			__mpol_put(pol);	/* finished with pol */
		else
			*mpol = pol;	/* unref needed after allocation */
	}
	return zl;
}
#endif

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	zl = node_zonelist(nid, gfp);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
	return page;
}

/**
 * alloc_page_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *	%GFP_USER     user allocation.
 *	%GFP_KERNEL   kernel allocations,
 *	%GFP_HIGHMEM  highmem/user allocations,
 *	%GFP_FS       allocation should not call back into a file system.
 *	%GFP_ATOMIC   don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
 * mm_struct of the VMA to prevent it from going away. Should be used for
 * all allocations for pages that will be mapped into
 * user space. Returns NULL when no page can be allocated.
 *
 * Should be called with the mm_sem of the vma held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);
	struct zonelist *zl;

	cpuset_update_task_memory_state();

	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
		if (unlikely(pol != &default_policy &&
				pol != current->mempolicy))
			__mpol_put(pol);	/* finished with pol */
		return alloc_page_interleave(gfp, 0, nid);
	}
	zl = zonelist_policy(gfp, pol);
	if (pol != &default_policy && pol != current->mempolicy) {
		/*
		 * slow path: ref counted policy -- shared or vma
		 */
		struct page *page = __alloc_pages_nodemask(gfp, 0,
						zl, nodemask_policy(gfp, pol));
		__mpol_put(pol);
		return page;
	}
	/*
	 * fast path:  default or task policy
	 */
	return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
}

/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *	%GFP_USER     user allocation,
 *	%GFP_KERNEL   kernel allocation,
 *	%GFP_HIGHMEM  highmem allocation,
 *	%GFP_FS       don't call back into a file system.
 *	%GFP_ATOMIC   don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool. When not in interrupt
 * context, apply the current process NUMA policy.
 * Returns NULL when no page can be allocated.
 *
 * Don't call cpuset_update_task_memory_state() unless
 * 1) it's ok to take cpuset_sem (can WAIT), and
 * 2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if ((gfp & __GFP_WAIT) && !in_interrupt())
		cpuset_update_task_memory_state();
	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
		pol = &default_policy;
	if (pol->policy == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages_nodemask(gfp, order,
			zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);

/*
 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 */

/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	if (current_cpuset_is_being_rebound()) {
		nodemask_t mems = cpuset_mems_allowed(current);
		mpol_rebind_policy(old, &mems);
	}
	*new = *old;
	atomic_set(&new->refcnt, 1);
	return new;
}

static int mpol_match_intent(const struct mempolicy *a,
			     const struct mempolicy *b)
{
	if (a->flags != b->flags)
		return 0;
	if (!mpol_store_user_nodemask(a))
		return 1;
	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return 0;
	if (a->policy != b->policy)
		return 0;
	if (a->policy != MPOL_DEFAULT && !mpol_match_intent(a, b))
		return 0;
	switch (a->policy) {
	case MPOL_DEFAULT:
		return 1;
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		return nodes_equal(a->v.nodes, b->v.nodes);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node;
	default:
		BUG();
		return 0;
	}
}

/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	p->policy = MPOL_DEFAULT;
	kmem_cache_free(policy_cache, p);
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */

/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
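/*
 * After the binary search finds any overlapping node, walk backwards with
 * rb_prev() so the node with the lowest start that still intersects the
 * range is returned.
 */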
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}

/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->policy : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	spin_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	spin_unlock(&sp->lock);
	return pol;
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	pr_debug("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_put(n->policy);
	kmem_cache_free(sn_cache, n);
}

static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
				struct mempolicy *pol)
{
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

	if (!n)
		return NULL;
	n->start = start;
	n->end = end;
	mpol_get(pol);
	n->policy = pol;
	return n;
}

/* Replace a policy range. */
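/*
 * If an existing node spans the whole new range it must be split in two;
 * the allocation for the second half cannot be done under sp->lock, so the
 * lock is dropped, 'new2' is allocated and the scan restarts.
 */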
1742static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1743 unsigned long end, struct sp_node *new)
1744{
1745 struct sp_node *n, *new2 = NULL;
1746
1747restart:
1748 spin_lock(&sp->lock);
1749 n = sp_lookup(sp, start, end);
1750 /* Take care of old policies in the same range. */
1751 while (n && n->start < end) {
1752 struct rb_node *next = rb_next(&n->nd);
1753 if (n->start >= start) {
1754 if (n->end <= end)
1755 sp_delete(sp, n);
1756 else
1757 n->start = end;
1758 } else {
1759 /* Old policy spanning whole new range. */
1760 if (n->end > end) {
1761 if (!new2) {
1762 spin_unlock(&sp->lock);
1763 new2 = sp_alloc(end, n->end, n->policy);
1764 if (!new2)
1765 return -ENOMEM;
1766 goto restart;
1767 }
1768 n->end = start;
1769 sp_insert(sp, new2);
1770 new2 = NULL;
1771 break;
1772 } else
1773 n->end = start;
1774 }
1775 if (!next)
1776 break;
1777 n = rb_entry(next, struct sp_node, nd);
1778 }
1779 if (new)
1780 sp_insert(sp, new);
1781 spin_unlock(&sp->lock);
1782 if (new2) {
f0be3d32 1783 mpol_put(new2->policy);
1784 kmem_cache_free(sn_cache, new2);
1785 }
1786 return 0;
1787}
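/*
 * Illustrative userspace sketch (not part of mempolicy.c): the same range
 * replacement cases as shared_policy_replace(), but on a small sorted array
 * of disjoint [start, end) ranges instead of an rb-tree, and without the
 * locking and allocation-retry logic. struct range, replace_range() and the
 * other names are invented for the example.
 */
#include <stdio.h>
#include <string.h>

struct range { unsigned long start, end; int policy; };

static struct range set[16];
static int nset;

static void insert_range(struct range r)
{
	int i = 0;

	while (i < nset && set[i].start < r.start)
		i++;
	memmove(&set[i + 1], &set[i], (nset - i) * sizeof(set[0]));
	set[i] = r;
	nset++;
}

static void delete_range(int i)
{
	memmove(&set[i], &set[i + 1], (nset - i - 1) * sizeof(set[0]));
	nset--;
}

/* Mirrors the while-loop cases in shared_policy_replace(). */
static void replace_range(unsigned long start, unsigned long end, int policy)
{
	int i = 0;

	while (i < nset && set[i].start < end) {
		if (set[i].end <= start) {	/* entirely before the new range */
			i++;
			continue;
		}
		if (set[i].start >= start) {
			if (set[i].end <= end) {
				delete_range(i);	/* fully covered: drop it */
				continue;
			}
			set[i].start = end;		/* overlaps the tail: trim it */
		} else if (set[i].end > end) {
			/* old range spans the whole new range: split it in two */
			struct range tail = { end, set[i].end, set[i].policy };

			set[i].end = start;
			insert_range(tail);
			break;
		} else {
			set[i].end = start;		/* overlaps the head: trim it */
		}
		i++;
	}
	insert_range((struct range){ start, end, policy });
}

int main(void)
{
	int i;

	insert_range((struct range){ 0, 100, 1 });
	replace_range(40, 60, 2);	/* splits [0,100) into [0,40) and [60,100) */
	for (i = 0; i < nset; i++)
		printf("[%lu,%lu) policy %d\n", set[i].start, set[i].end, set[i].policy);
	return 0;
}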
1788
a3b51e01 1789void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
028fec41 1790 unsigned short flags, nodemask_t *policy_nodes)
1791{
1792 info->root = RB_ROOT;
1793 spin_lock_init(&info->lock);
1794
1795 if (policy != MPOL_DEFAULT) {
1796 struct mempolicy *newpol;
1797
1798 /* Falls back to MPOL_DEFAULT on any error */
028fec41 1799 newpol = mpol_new(policy, flags, policy_nodes);
1800 if (!IS_ERR(newpol)) {
1801 /* Create pseudo-vma that contains just the policy */
1802 struct vm_area_struct pvma;
1803
1804 memset(&pvma, 0, sizeof(struct vm_area_struct));
1805 /* Policy covers entire file */
1806 pvma.vm_end = TASK_SIZE;
1807 mpol_set_shared_policy(info, &pvma, newpol);
f0be3d32 1808 mpol_put(newpol);
1809 }
1810 }
1811}
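/*
 * Worked example (illustrative, assuming 4K pages and an x86_64-style
 * TASK_SIZE of 0x7ffffffff000): the pseudo-vma above has vm_pgoff == 0 and
 * vm_start == 0 from the memset, and vm_end == TASK_SIZE, so vma_pages()
 * is TASK_SIZE >> PAGE_SHIFT == 0x7ffffffff. mpol_set_shared_policy()
 * therefore installs newpol over pgoff [0, 0x7ffffffff), i.e. over every
 * page offset the object can ever contain.
 */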
1812
1813int mpol_set_shared_policy(struct shared_policy *info,
1814 struct vm_area_struct *vma, struct mempolicy *npol)
1815{
1816 int err;
1817 struct sp_node *new = NULL;
1818 unsigned long sz = vma_pages(vma);
1819
028fec41 1820 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1da177e4 1821 vma->vm_pgoff,
1822 sz, npol ? npol->policy : -1,
1823 npol ? npol->flags : -1,
140d5a49 1824 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1825
1826 if (npol) {
1827 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1828 if (!new)
1829 return -ENOMEM;
1830 }
1831 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1832 if (err && new)
1833 kmem_cache_free(sn_cache, new);
1834 return err;
1835}
1836
1837/* Free a backing policy store on inode delete. */
1838void mpol_free_shared_policy(struct shared_policy *p)
1839{
1840 struct sp_node *n;
1841 struct rb_node *next;
1842
1843 if (!p->root.rb_node)
1844 return;
1845 spin_lock(&p->lock);
1846 next = rb_first(&p->root);
1847 while (next) {
1848 n = rb_entry(next, struct sp_node, nd);
1849 next = rb_next(&n->nd);
90c5029e 1850 rb_erase(&n->nd, &p->root);
f0be3d32 1851 mpol_put(n->policy);
1852 kmem_cache_free(sn_cache, n);
1853 }
1854 spin_unlock(&p->lock);
1855}
1856
1857/* assumes fs == KERNEL_DS */
1858void __init numa_policy_init(void)
1859{
1860 nodemask_t interleave_nodes;
1861 unsigned long largest = 0;
1862 int nid, prefer = 0;
1863
1864 policy_cache = kmem_cache_create("numa_policy",
1865 sizeof(struct mempolicy),
20c2df83 1866 0, SLAB_PANIC, NULL);
1867
1868 sn_cache = kmem_cache_create("shared_policy_node",
1869 sizeof(struct sp_node),
20c2df83 1870 0, SLAB_PANIC, NULL);
1da177e4 1871
1872 /*
1873 * Set interleaving policy for system init. Interleaving is only
1874 * enabled across suitably sized nodes (default is >= 16MB); if all
1875 * nodes are smaller than that, fall back to the largest node.
1876 */
1877 nodes_clear(interleave_nodes);
56bbd65d 1878 for_each_node_state(nid, N_HIGH_MEMORY) {
1879 unsigned long total_pages = node_present_pages(nid);
1880
1881 /* Preserve the largest node */
1882 if (largest < total_pages) {
1883 largest = total_pages;
1884 prefer = nid;
1885 }
1886
1887 /* Interleave this node? */
1888 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1889 node_set(nid, interleave_nodes);
1890 }
1891
1892 /* All too small, use the largest */
1893 if (unlikely(nodes_empty(interleave_nodes)))
1894 node_set(prefer, interleave_nodes);
1da177e4 1895
028fec41 1896 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
1897 printk("numa_policy_init: interleaving failed\n");
1898}
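/*
 * Illustrative userspace sketch (not part of mempolicy.c): the node
 * selection rule used above, on a plain array of per-node page counts.
 * A 4K page size and at most 64 nodes are assumed purely for the example.
 */
#include <stdio.h>

#define EX_PAGE_SHIFT	12
#define EX_MIN_BYTES	(16UL << 20)	/* only interleave across nodes >= 16MB */

static unsigned long pick_interleave_mask(const unsigned long *pages, int nr_nodes)
{
	unsigned long mask = 0, largest = 0;
	int nid, prefer = 0;

	for (nid = 0; nid < nr_nodes; nid++) {
		if (pages[nid] > largest) {	/* remember the biggest node */
			largest = pages[nid];
			prefer = nid;
		}
		if ((pages[nid] << EX_PAGE_SHIFT) >= EX_MIN_BYTES)
			mask |= 1UL << nid;
	}
	if (!mask)				/* all too small: use the largest */
		mask = 1UL << prefer;
	return mask;
}

int main(void)
{
	unsigned long pages[] = { 2048, 8192, 1024 };	/* 8MB, 32MB, 4MB */

	/* prints 0x2: only node 1 is big enough to interleave over */
	printf("interleave mask: 0x%lx\n", pick_interleave_mask(pages, 3));
	return 0;
}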
1899
8bccd85f 1900/* Reset policy of current process to default */
1901void numa_default_policy(void)
1902{
028fec41 1903 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1da177e4 1904}
68860ec1 1905
1906/*
1907 * Display pages allocated per node and memory policy via /proc.
1908 */
1909static const char * const policy_types[] =
1910 { "default", "prefer", "bind", "interleave" };
1911
1912/*
1913 * Convert a mempolicy into a string.
1914 * Returns the number of characters in buffer (if positive)
1915 * or an error (negative)
1916 */
1917static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1918{
1919 char *p = buffer;
1920 int l;
1921 nodemask_t nodes;
a3b51e01 1922 unsigned short mode = pol ? pol->policy : MPOL_DEFAULT;
f5b087b5 1923 unsigned short flags = pol ? pol->flags : 0;
1924
1925 switch (mode) {
1926 case MPOL_DEFAULT:
1927 nodes_clear(nodes);
1928 break;
1929
1930 case MPOL_PREFERRED:
1931 nodes_clear(nodes);
1932 node_set(pol->v.preferred_node, nodes);
1933 break;
1934
1935 case MPOL_BIND:
19770b32 1936 /* Fall through */
1937 case MPOL_INTERLEAVE:
1938 nodes = pol->v.nodes;
1939 break;
1940
1941 default:
1942 BUG();
1943 return -EFAULT;
1944 }
1945
1946 l = strlen(policy_types[mode]);
1947 if (buffer + maxlen < p + l + 1)
1948 return -ENOSPC;
1949
1950 strcpy(p, policy_types[mode]);
1951 p += l;
1952
1953 if (flags) {
1954 int need_bar = 0;
1955
1956 if (buffer + maxlen < p + 2)
1957 return -ENOSPC;
1958 *p++ = '=';
1959
1960 if (flags & MPOL_F_STATIC_NODES)
1961 p += sprintf(p, "%sstatic", need_bar++ ? "|" : "");
1962 if (flags & MPOL_F_RELATIVE_NODES)
1963 p += sprintf(p, "%srelative", need_bar++ ? "|" : "");
1964 }
1965
1966 if (!nodes_empty(nodes)) {
1967 if (buffer + maxlen < p + 2)
1968 return -ENOSPC;
1969 *p++ = '=';
1970 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1971 }
1972 return p - buffer;
1973}
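/*
 * For reference, a few strings the function above produces (illustrative
 * examples, assuming policies over nodes 0-3):
 *
 *   "default"                  MPOL_DEFAULT, empty nodemask
 *   "prefer=1"                 MPOL_PREFERRED on node 1
 *   "interleave=0-3"           MPOL_INTERLEAVE over nodes 0-3
 *   "bind=static=0,2"          MPOL_BIND with MPOL_F_STATIC_NODES on nodes 0 and 2
 *   "interleave=relative=0-1"  MPOL_INTERLEAVE with MPOL_F_RELATIVE_NODES
 */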
1974
1975struct numa_maps {
1976 unsigned long pages;
1977 unsigned long anon;
1978 unsigned long active;
1979 unsigned long writeback;
1a75a6c8 1980 unsigned long mapcount_max;
1981 unsigned long dirty;
1982 unsigned long swapcache;
1983 unsigned long node[MAX_NUMNODES];
1984};
1985
397874df 1986static void gather_stats(struct page *page, void *private, int pte_dirty)
1987{
1988 struct numa_maps *md = private;
1989 int count = page_mapcount(page);
1990
1991 md->pages++;
1992 if (pte_dirty || PageDirty(page))
1993 md->dirty++;
1a75a6c8 1994
1995 if (PageSwapCache(page))
1996 md->swapcache++;
1a75a6c8 1997
1998 if (PageActive(page))
1999 md->active++;
2000
2001 if (PageWriteback(page))
2002 md->writeback++;
2003
2004 if (PageAnon(page))
2005 md->anon++;
2006
2007 if (count > md->mapcount_max)
2008 md->mapcount_max = count;
2009
1a75a6c8 2010 md->node[page_to_nid(page)]++;
2011}
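/*
 * Worked example (hypothetical page): a dirty anonymous page on node 1 that
 * is mapped into three address spaces bumps md->pages, md->dirty, md->anon
 * and md->node[1] by one each, and raises md->mapcount_max to at least 3.
 */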
2012
7f709ed0 2013#ifdef CONFIG_HUGETLB_PAGE
2014static void check_huge_range(struct vm_area_struct *vma,
2015 unsigned long start, unsigned long end,
2016 struct numa_maps *md)
2017{
2018 unsigned long addr;
2019 struct page *page;
2020
2021 for (addr = start; addr < end; addr += HPAGE_SIZE) {
2022 pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
2023 pte_t pte;
2024
2025 if (!ptep)
2026 continue;
2027
2028 pte = *ptep;
2029 if (pte_none(pte))
2030 continue;
2031
2032 page = pte_page(pte);
2033 if (!page)
2034 continue;
2035
2036 gather_stats(page, md, pte_dirty(*ptep));
2037 }
2038}
2039#else
2040static inline void check_huge_range(struct vm_area_struct *vma,
2041 unsigned long start, unsigned long end,
2042 struct numa_maps *md)
2043{
2044}
2045#endif
397874df 2046
2047int show_numa_map(struct seq_file *m, void *v)
2048{
99f89551 2049 struct proc_maps_private *priv = m->private;
2050 struct vm_area_struct *vma = v;
2051 struct numa_maps *md;
2052 struct file *file = vma->vm_file;
2053 struct mm_struct *mm = vma->vm_mm;
480eccf9 2054 struct mempolicy *pol;
2055 int n;
2056 char buffer[50];
2057
397874df 2058 if (!mm)
2059 return 0;
2060
2061 md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2062 if (!md)
2063 return 0;
2064
2065 pol = get_vma_policy(priv->task, vma, vma->vm_start);
2066 mpol_to_str(buffer, sizeof(buffer), pol);
2067 /*
2068 * unref shared or other task's mempolicy
2069 */
2070 if (pol != &default_policy && pol != current->mempolicy)
f0be3d32 2071 __mpol_put(pol);
2072
2073 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2074
2075 if (file) {
2076 seq_printf(m, " file=");
c32c2f63 2077 seq_path(m, &file->f_path, "\n\t= ");
2078 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2079 seq_printf(m, " heap");
2080 } else if (vma->vm_start <= mm->start_stack &&
2081 vma->vm_end >= mm->start_stack) {
2082 seq_printf(m, " stack");
2083 }
2084
2085 if (is_vm_hugetlb_page(vma)) {
2086 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2087 seq_printf(m, " huge");
2088 } else {
a57ebfdb 2089 check_pgd_range(vma, vma->vm_start, vma->vm_end,
56bbd65d 2090 &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2091 }
2092
2093 if (!md->pages)
2094 goto out;
1a75a6c8 2095
2096 if (md->anon)
2097 seq_printf(m," anon=%lu",md->anon);
1a75a6c8 2098
2099 if (md->dirty)
2100 seq_printf(m," dirty=%lu",md->dirty);
1a75a6c8 2101
2102 if (md->pages != md->anon && md->pages != md->dirty)
2103 seq_printf(m, " mapped=%lu", md->pages);
1a75a6c8 2104
2105 if (md->mapcount_max > 1)
2106 seq_printf(m, " mapmax=%lu", md->mapcount_max);
1a75a6c8 2107
2108 if (md->swapcache)
2109 seq_printf(m," swapcache=%lu", md->swapcache);
2110
2111 if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2112 seq_printf(m," active=%lu", md->active);
2113
2114 if (md->writeback)
2115 seq_printf(m," writeback=%lu", md->writeback);
2116
56bbd65d 2117 for_each_node_state(n, N_HIGH_MEMORY)
2118 if (md->node[n])
2119 seq_printf(m, " N%d=%lu", n, md->node[n]);
2120out:
2121 seq_putc(m, '\n');
2122 kfree(md);
2123
2124 if (m->count < m->size)
99f89551 2125 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2126 return 0;
2127}
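/*
 * Illustrative example (hypothetical values) of one line this emits in
 * /proc/<pid>/numa_maps on a two-node machine:
 *
 *   7f2c40000000 interleave=0-1 anon=512 dirty=512 active=384 N0=256 N1=256
 *
 * The first field is vma->vm_start, the second the policy string from
 * mpol_to_str(); after that only the counters that apply are printed.
 */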