1/*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2.
7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
19 * for anonymous memory. For process policy a process counter
20 * is used.
21 *
22 * bind Only allocate memory on a specific set of nodes,
23 * no fallback.
24 * FIXME: memory is allocated starting with the first node
25 * to the last. It would be better if bind would truly restrict
26 * the allocation to memory nodes instead.
27 *
28 * preferred Try a specific node first before normal fallback.
29 * As a special case node -1 here means do the allocation
30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non-default
32 * process policy.
33 *
34 * default Allocate on the local node first, or when on a VMA
35 * use the process policy. This is what Linux always did
36 * in a NUMA aware kernel and still does by, ahem, default.
37 *
38 * The process policy is applied for most non-interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
42 *
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that
49 * on systems with highmem, kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
54 */
55
56/* Notebook:
57 fix mmap readahead to honour policy and enable policy for any page cache
58 object
59 statistics for bigpages
60 global policy for page cache? currently it uses process policy. Requires
61 first item above.
62 handle mremap for shared memory (currently ignored for the policy)
63 grows down?
64 make bind policy root only? It can trigger oom much faster and the
65 kernel is not always grateful with that.
66*/
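/*
 * Illustrative userspace sketch (not part of this file): the four policies
 * described above are requested through the set_mempolicy() and mbind()
 * system calls.  This assumes the <numaif.h> wrappers shipped with libnuma
 * and made-up node numbers; treat it as a sketch, not a reference.
 *
 *	#include <numaif.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *
 *	// process policy: interleave new allocations across nodes 0 and 1
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 *	// VMA policy: restrict an existing mapping (addr/length) to the mask
 *	mbind(addr, length, MPOL_BIND, &mask, sizeof(mask) * 8, MPOL_MF_STRICT);
 *
 *	// back to the default local-node policy
 *	set_mempolicy(MPOL_DEFAULT, NULL, 0);
 */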
67
68#include <linux/mempolicy.h>
69#include <linux/mm.h>
70#include <linux/highmem.h>
71#include <linux/hugetlb.h>
72#include <linux/kernel.h>
73#include <linux/sched.h>
74#include <linux/nodemask.h>
75#include <linux/cpuset.h>
76#include <linux/gfp.h>
77#include <linux/slab.h>
78#include <linux/string.h>
79#include <linux/module.h>
80#include <linux/nsproxy.h>
81#include <linux/interrupt.h>
82#include <linux/init.h>
83#include <linux/compat.h>
84#include <linux/swap.h>
85#include <linux/seq_file.h>
86#include <linux/proc_fs.h>
87#include <linux/migrate.h>
88#include <linux/rmap.h>
89#include <linux/security.h>
90#include <linux/syscalls.h>
91
92#include <asm/tlbflush.h>
93#include <asm/uaccess.h>
94
95/* Internal flags */
96#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
97#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
98#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
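/*
 * How MPOL_MF_INVERT is used below: check_pte_range() skips a page when
 * node_isset(page_node, *nodes) == !!(flags & MPOL_MF_INVERT).  Without the
 * flag the walk therefore visits only pages that already sit on one of the
 * requested nodes, which is what migrate_to_node() wants when it passes a
 * mask containing just the source node.  do_mbind() instead passes the
 * target nodemask together with MPOL_MF_INVERT, so the walk visits exactly
 * the pages that are not yet on an allowed node and queues those for
 * migration.
 */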
99
100static struct kmem_cache *policy_cache;
101static struct kmem_cache *sn_cache;
102
103/* Highest zone. A specific allocation for a zone below that is not
104 policied. */
105enum zone_type policy_zone = 0;
106
d42c6997 107struct mempolicy default_policy = {
108 .refcnt = ATOMIC_INIT(1), /* never free it */
109 .policy = MPOL_DEFAULT,
110};
111
112static const struct mempolicy_operations {
113 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
114 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
115} mpol_ops[MPOL_MAX];
116
19770b32 117/* Check that the nodemask contains at least one populated zone */
37012946 118static int is_valid_nodemask(const nodemask_t *nodemask)
1da177e4 119{
19770b32 120 int nd, k;
1da177e4 121
122 /* Check that there is something useful in this mask */
123 k = policy_zone;
124
125 for_each_node_mask(nd, *nodemask) {
126 struct zone *z;
127
128 for (k = 0; k <= policy_zone; k++) {
129 z = &NODE_DATA(nd)->node_zones[k];
130 if (z->present_pages > 0)
131 return 1;
dd942ae3 132 }
8af5e2eb 133 }
134
135 return 0;
136}
137
138static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
139{
140 return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
141}
142
143static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
144 const nodemask_t *rel)
145{
146 nodemask_t tmp;
147 nodes_fold(tmp, *orig, nodes_weight(*rel));
148 nodes_onto(*ret, tmp, *rel);
149}
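/*
 * Worked example for the remap above (node numbers are made up): with a
 * user nodemask of {1,5} and a cpuset allowing nodes {4,5,6} (weight 3),
 * nodes_fold() wraps each user node modulo 3 to give the relative set
 * {1,2}, and nodes_onto() maps those positions onto the allowed nodes, so
 * *ret becomes {5,6}.  An MPOL_F_RELATIVE_NODES policy thus keeps the same
 * relative placement when its cpuset is moved to a different set of nodes.
 */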
150
151static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
152{
153 if (nodes_empty(*nodes))
154 return -EINVAL;
155 pol->v.nodes = *nodes;
156 return 0;
157}
158
159static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
160{
161 if (!nodes)
162 pol->v.preferred_node = -1; /* local allocation */
163 else if (nodes_empty(*nodes))
164 return -EINVAL; /* no allowed nodes */
165 else
166 pol->v.preferred_node = first_node(*nodes);
167 return 0;
168}
169
170static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
171{
172 if (!is_valid_nodemask(nodes))
173 return -EINVAL;
174 pol->v.nodes = *nodes;
175 return 0;
176}
177
1da177e4 178/* Create a new policy */
179static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
180 nodemask_t *nodes)
181{
182 struct mempolicy *policy;
f5b087b5 183 nodemask_t cpuset_context_nmask;
184 int localalloc = 0;
185 int ret;
1da177e4 186
187 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
188 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
140d5a49 189
1da177e4 190 if (mode == MPOL_DEFAULT)
191 return NULL;
192 if (!nodes || nodes_empty(*nodes)) {
193 if (mode != MPOL_PREFERRED)
194 return ERR_PTR(-EINVAL);
195 localalloc = 1; /* special case: no mode flags */
196 }
197 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
198 if (!policy)
199 return ERR_PTR(-ENOMEM);
200 atomic_set(&policy->refcnt, 1);
1da177e4 201 policy->policy = mode;
202
203 if (!localalloc) {
204 policy->flags = flags;
205 cpuset_update_task_memory_state();
206 if (flags & MPOL_F_RELATIVE_NODES)
207 mpol_relative_nodemask(&cpuset_context_nmask, nodes,
208 &cpuset_current_mems_allowed);
209 else
210 nodes_and(cpuset_context_nmask, *nodes,
211 cpuset_current_mems_allowed);
212 if (mpol_store_user_nodemask(policy))
213 policy->w.user_nodemask = *nodes;
214 else
215 policy->w.cpuset_mems_allowed =
216 cpuset_mems_allowed(current);
217 }
218
219 ret = mpol_ops[mode].create(policy,
220 localalloc ? NULL : &cpuset_context_nmask);
221 if (ret < 0) {
222 kmem_cache_free(policy_cache, policy);
223 return ERR_PTR(ret);
224 }
1da177e4 225 return policy;
226}
227
228static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
229{
230}
231
232static void mpol_rebind_nodemask(struct mempolicy *pol,
233 const nodemask_t *nodes)
234{
235 nodemask_t tmp;
236
237 if (pol->flags & MPOL_F_STATIC_NODES)
238 nodes_and(tmp, pol->w.user_nodemask, *nodes);
239 else if (pol->flags & MPOL_F_RELATIVE_NODES)
240 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
241 else {
242 nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
243 *nodes);
244 pol->w.cpuset_mems_allowed = *nodes;
245 }
f5b087b5 246
247 pol->v.nodes = tmp;
248 if (!node_isset(current->il_next, tmp)) {
249 current->il_next = next_node(current->il_next, tmp);
250 if (current->il_next >= MAX_NUMNODES)
251 current->il_next = first_node(tmp);
252 if (current->il_next >= MAX_NUMNODES)
253 current->il_next = numa_node_id();
254 }
255}
256
257static void mpol_rebind_preferred(struct mempolicy *pol,
258 const nodemask_t *nodes)
259{
260 nodemask_t tmp;
261
262 /*
263 * check MPOL_F_STATIC_NODES first, as preferred_node == -1 may be
264 * a temporary, "fallback" state for this policy.
265 */
266 if (pol->flags & MPOL_F_STATIC_NODES) {
267 int node = first_node(pol->w.user_nodemask);
268
269 if (node_isset(node, *nodes))
270 pol->v.preferred_node = node;
271 else
272 pol->v.preferred_node = -1;
273 } else if (pol->v.preferred_node == -1) {
274 return; /* no remap required for explicit local alloc */
275 } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
276 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
277 pol->v.preferred_node = first_node(tmp);
278 } else {
279 pol->v.preferred_node = node_remap(pol->v.preferred_node,
280 pol->w.cpuset_mems_allowed,
281 *nodes);
282 pol->w.cpuset_mems_allowed = *nodes;
283 }
284}
285
286/* Migrate a policy to a different set of nodes */
287static void mpol_rebind_policy(struct mempolicy *pol,
288 const nodemask_t *newmask)
289{
290 if (!pol)
291 return;
292 if (!mpol_store_user_nodemask(pol) &&
293 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
294 return;
37012946 295 mpol_ops[pol->policy].rebind(pol, newmask);
296}
297
298/*
299 * Wrapper for mpol_rebind_policy() that just requires task
300 * pointer, and updates task mempolicy.
301 */
302
303void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
304{
305 mpol_rebind_policy(tsk->mempolicy, new);
306}
307
308/*
309 * Rebind each vma in mm to new nodemask.
310 *
311 * Call holding a reference to mm. Takes mm->mmap_sem during call.
312 */
313
314void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
315{
316 struct vm_area_struct *vma;
317
318 down_write(&mm->mmap_sem);
319 for (vma = mm->mmap; vma; vma = vma->vm_next)
320 mpol_rebind_policy(vma->vm_policy, new);
321 up_write(&mm->mmap_sem);
322}
323
324static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
325 [MPOL_DEFAULT] = {
326 .rebind = mpol_rebind_default,
327 },
328 [MPOL_INTERLEAVE] = {
329 .create = mpol_new_interleave,
330 .rebind = mpol_rebind_nodemask,
331 },
332 [MPOL_PREFERRED] = {
333 .create = mpol_new_preferred,
334 .rebind = mpol_rebind_preferred,
335 },
336 [MPOL_BIND] = {
337 .create = mpol_new_bind,
338 .rebind = mpol_rebind_nodemask,
339 },
340};
341
397874df 342static void gather_stats(struct page *, void *, int pte_dirty);
fc301289
CL
343static void migrate_page_add(struct page *page, struct list_head *pagelist,
344 unsigned long flags);
1a75a6c8 345
38e35860 346/* Scan through pages checking if pages follow certain conditions. */
b5810039 347static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
dc9aa5b9
CL
348 unsigned long addr, unsigned long end,
349 const nodemask_t *nodes, unsigned long flags,
38e35860 350 void *private)
1da177e4 351{
91612e0d
HD
352 pte_t *orig_pte;
353 pte_t *pte;
705e87c0 354 spinlock_t *ptl;
941150a3 355
705e87c0 356 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
91612e0d 357 do {
6aab341e 358 struct page *page;
25ba77c1 359 int nid;
91612e0d
HD
360
361 if (!pte_present(*pte))
1da177e4 362 continue;
6aab341e
LT
363 page = vm_normal_page(vma, addr, *pte);
364 if (!page)
1da177e4 365 continue;
366 /*
367 * The check for PageReserved here is important to avoid
368 * handling zero pages and other pages that may have been
369 * marked special by the system.
370 *
371 * If PageReserved were not checked here then e.g.
372 * the location of the zero page could have an influence
373 * on MPOL_MF_STRICT, zero pages would be counted for
374 * the per node stats, and there would be useless attempts
375 * to put zero pages on the migration list.
376 */
f4598c8b
CL
377 if (PageReserved(page))
378 continue;
6aab341e 379 nid = page_to_nid(page);
38e35860
CL
380 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
381 continue;
382
1a75a6c8 383 if (flags & MPOL_MF_STATS)
397874df 384 gather_stats(page, private, pte_dirty(*pte));
053837fc 385 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
fc301289 386 migrate_page_add(page, private, flags);
38e35860
CL
387 else
388 break;
91612e0d 389 } while (pte++, addr += PAGE_SIZE, addr != end);
705e87c0 390 pte_unmap_unlock(orig_pte, ptl);
91612e0d
HD
391 return addr != end;
392}
393
b5810039 394static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
dc9aa5b9
CL
395 unsigned long addr, unsigned long end,
396 const nodemask_t *nodes, unsigned long flags,
38e35860 397 void *private)
91612e0d
HD
398{
399 pmd_t *pmd;
400 unsigned long next;
401
402 pmd = pmd_offset(pud, addr);
403 do {
404 next = pmd_addr_end(addr, end);
405 if (pmd_none_or_clear_bad(pmd))
406 continue;
dc9aa5b9 407 if (check_pte_range(vma, pmd, addr, next, nodes,
38e35860 408 flags, private))
91612e0d
HD
409 return -EIO;
410 } while (pmd++, addr = next, addr != end);
411 return 0;
412}
413
b5810039 414static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
dc9aa5b9
CL
415 unsigned long addr, unsigned long end,
416 const nodemask_t *nodes, unsigned long flags,
38e35860 417 void *private)
91612e0d
HD
418{
419 pud_t *pud;
420 unsigned long next;
421
422 pud = pud_offset(pgd, addr);
423 do {
424 next = pud_addr_end(addr, end);
425 if (pud_none_or_clear_bad(pud))
426 continue;
dc9aa5b9 427 if (check_pmd_range(vma, pud, addr, next, nodes,
38e35860 428 flags, private))
91612e0d
HD
429 return -EIO;
430 } while (pud++, addr = next, addr != end);
431 return 0;
432}
433
b5810039 434static inline int check_pgd_range(struct vm_area_struct *vma,
dc9aa5b9
CL
435 unsigned long addr, unsigned long end,
436 const nodemask_t *nodes, unsigned long flags,
38e35860 437 void *private)
91612e0d
HD
438{
439 pgd_t *pgd;
440 unsigned long next;
441
b5810039 442 pgd = pgd_offset(vma->vm_mm, addr);
91612e0d
HD
443 do {
444 next = pgd_addr_end(addr, end);
445 if (pgd_none_or_clear_bad(pgd))
446 continue;
dc9aa5b9 447 if (check_pud_range(vma, pgd, addr, next, nodes,
38e35860 448 flags, private))
91612e0d
HD
449 return -EIO;
450 } while (pgd++, addr = next, addr != end);
451 return 0;
1da177e4
LT
452}
453
454/*
455 * Check if all pages in a range are on a set of nodes.
456 * If pagelist != NULL then isolate pages from the LRU and
457 * put them on the pagelist.
458 */
1da177e4
LT
459static struct vm_area_struct *
460check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
38e35860 461 const nodemask_t *nodes, unsigned long flags, void *private)
1da177e4
LT
462{
463 int err;
464 struct vm_area_struct *first, *vma, *prev;
465
90036ee5 466 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
90036ee5 467
b20a3503
CL
468 err = migrate_prep();
469 if (err)
470 return ERR_PTR(err);
90036ee5 471 }
053837fc 472
1da177e4
LT
473 first = find_vma(mm, start);
474 if (!first)
475 return ERR_PTR(-EFAULT);
476 prev = NULL;
477 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
dc9aa5b9
CL
478 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
479 if (!vma->vm_next && vma->vm_end < end)
480 return ERR_PTR(-EFAULT);
481 if (prev && prev->vm_end < vma->vm_start)
482 return ERR_PTR(-EFAULT);
483 }
484 if (!is_vm_hugetlb_page(vma) &&
485 ((flags & MPOL_MF_STRICT) ||
486 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
487 vma_migratable(vma)))) {
5b952b3c 488 unsigned long endvma = vma->vm_end;
dc9aa5b9 489
5b952b3c
AK
490 if (endvma > end)
491 endvma = end;
492 if (vma->vm_start > start)
493 start = vma->vm_start;
dc9aa5b9 494 err = check_pgd_range(vma, start, endvma, nodes,
38e35860 495 flags, private);
1da177e4
LT
496 if (err) {
497 first = ERR_PTR(err);
498 break;
499 }
500 }
501 prev = vma;
502 }
503 return first;
504}
505
506/* Apply policy to a single VMA */
507static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
508{
509 int err = 0;
510 struct mempolicy *old = vma->vm_policy;
511
140d5a49 512 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
1da177e4
LT
513 vma->vm_start, vma->vm_end, vma->vm_pgoff,
514 vma->vm_ops, vma->vm_file,
515 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
516
517 if (vma->vm_ops && vma->vm_ops->set_policy)
518 err = vma->vm_ops->set_policy(vma, new);
519 if (!err) {
520 mpol_get(new);
521 vma->vm_policy = new;
522 mpol_free(old);
523 }
524 return err;
525}
526
527/* Step 2: apply policy to a range and do splits. */
528static int mbind_range(struct vm_area_struct *vma, unsigned long start,
529 unsigned long end, struct mempolicy *new)
530{
531 struct vm_area_struct *next;
532 int err;
533
534 err = 0;
535 for (; vma && vma->vm_start < end; vma = next) {
536 next = vma->vm_next;
537 if (vma->vm_start < start)
538 err = split_vma(vma->vm_mm, vma, start, 1);
539 if (!err && vma->vm_end > end)
540 err = split_vma(vma->vm_mm, vma, end, 0);
541 if (!err)
542 err = policy_vma(vma, new);
543 if (err)
544 break;
545 }
546 return err;
547}
548
c61afb18
PJ
549/*
550 * Update task->flags PF_MEMPOLICY bit: set iff non-default
551 * mempolicy. Allows more rapid checking of this (combined perhaps
552 * with other PF_* flag bits) on memory allocation hot code paths.
553 *
554 * If called from outside this file, the task 'p' should -only- be
555 * a newly forked child not yet visible on the task list, because
556 * manipulating the task flags of a visible task is not safe.
557 *
558 * The above limitation is why this routine has the funny name
559 * mpol_fix_fork_child_flag().
560 *
561 * It is also safe to call this with a task pointer of current,
562 * which the static wrapper mpol_set_task_struct_flag() does,
563 * for use within this file.
564 */
565
566void mpol_fix_fork_child_flag(struct task_struct *p)
567{
568 if (p->mempolicy)
569 p->flags |= PF_MEMPOLICY;
570 else
571 p->flags &= ~PF_MEMPOLICY;
572}
573
574static void mpol_set_task_struct_flag(void)
575{
576 mpol_fix_fork_child_flag(current);
577}
578
1da177e4 579/* Set the process memory policy */
028fec41
DR
580static long do_set_mempolicy(unsigned short mode, unsigned short flags,
581 nodemask_t *nodes)
1da177e4 582{
1da177e4 583 struct mempolicy *new;
1da177e4 584
028fec41 585 new = mpol_new(mode, flags, nodes);
1da177e4
LT
586 if (IS_ERR(new))
587 return PTR_ERR(new);
588 mpol_free(current->mempolicy);
589 current->mempolicy = new;
c61afb18 590 mpol_set_task_struct_flag();
f5b087b5
DR
591 if (new && new->policy == MPOL_INTERLEAVE &&
592 nodes_weight(new->v.nodes))
dfcd3c0d 593 current->il_next = first_node(new->v.nodes);
1da177e4
LT
594 return 0;
595}
596
597/* Fill a zone bitmap for a policy */
dfcd3c0d 598static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
1da177e4 599{
dfcd3c0d 600 nodes_clear(*nodes);
1da177e4 601 switch (p->policy) {
1da177e4
LT
602 case MPOL_DEFAULT:
603 break;
19770b32
MG
604 case MPOL_BIND:
605 /* Fall through */
1da177e4 606 case MPOL_INTERLEAVE:
dfcd3c0d 607 *nodes = p->v.nodes;
1da177e4
LT
608 break;
609 case MPOL_PREFERRED:
56bbd65d 610 /* or use current node instead of memory_map? */
1da177e4 611 if (p->v.preferred_node < 0)
56bbd65d 612 *nodes = node_states[N_HIGH_MEMORY];
1da177e4 613 else
dfcd3c0d 614 node_set(p->v.preferred_node, *nodes);
1da177e4
LT
615 break;
616 default:
617 BUG();
618 }
619}
620
621static int lookup_node(struct mm_struct *mm, unsigned long addr)
622{
623 struct page *p;
624 int err;
625
626 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
627 if (err >= 0) {
628 err = page_to_nid(p);
629 put_page(p);
630 }
631 return err;
632}
633
1da177e4 634/* Retrieve NUMA policy */
dbcb0f19
AB
635static long do_get_mempolicy(int *policy, nodemask_t *nmask,
636 unsigned long addr, unsigned long flags)
1da177e4 637{
8bccd85f 638 int err;
1da177e4
LT
639 struct mm_struct *mm = current->mm;
640 struct vm_area_struct *vma = NULL;
641 struct mempolicy *pol = current->mempolicy;
642
cf2a473c 643 cpuset_update_task_memory_state();
754af6f5
LS
644 if (flags &
645 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
1da177e4 646 return -EINVAL;
754af6f5
LS
647
648 if (flags & MPOL_F_MEMS_ALLOWED) {
649 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
650 return -EINVAL;
651 *policy = 0; /* just so it's initialized */
652 *nmask = cpuset_current_mems_allowed;
653 return 0;
654 }
655
1da177e4
LT
656 if (flags & MPOL_F_ADDR) {
657 down_read(&mm->mmap_sem);
658 vma = find_vma_intersection(mm, addr, addr+1);
659 if (!vma) {
660 up_read(&mm->mmap_sem);
661 return -EFAULT;
662 }
663 if (vma->vm_ops && vma->vm_ops->get_policy)
664 pol = vma->vm_ops->get_policy(vma, addr);
665 else
666 pol = vma->vm_policy;
667 } else if (addr)
668 return -EINVAL;
669
670 if (!pol)
671 pol = &default_policy;
672
673 if (flags & MPOL_F_NODE) {
674 if (flags & MPOL_F_ADDR) {
675 err = lookup_node(mm, addr);
676 if (err < 0)
677 goto out;
8bccd85f 678 *policy = err;
1da177e4
LT
679 } else if (pol == current->mempolicy &&
680 pol->policy == MPOL_INTERLEAVE) {
8bccd85f 681 *policy = current->il_next;
1da177e4
LT
682 } else {
683 err = -EINVAL;
684 goto out;
685 }
686 } else
028fec41 687 *policy = pol->policy | pol->flags;
1da177e4
LT
688
689 if (vma) {
690 up_read(&current->mm->mmap_sem);
691 vma = NULL;
692 }
693
1da177e4 694 err = 0;
8bccd85f
CL
695 if (nmask)
696 get_zonemask(pol, nmask);
1da177e4
LT
697
698 out:
699 if (vma)
700 up_read(&current->mm->mmap_sem);
701 return err;
702}
703
b20a3503 704#ifdef CONFIG_MIGRATION
705/*
706 * page migration
707 */
708static void migrate_page_add(struct page *page, struct list_head *pagelist,
709 unsigned long flags)
710{
711 /*
fc301289 712 * Avoid migrating a page that is shared with others.
6ce3c4c0 713 */
714 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
715 isolate_lru_page(page, pagelist);
7e2ab150 716}
6ce3c4c0 717
742755a1 718static struct page *new_node_page(struct page *page, unsigned long node, int **x)
95a402c3 719{
769848c0 720 return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
721}
722
723/*
724 * Migrate pages from one node to a target node.
725 * Returns error or the number of pages not migrated.
726 */
727static int migrate_to_node(struct mm_struct *mm, int source, int dest,
728 int flags)
729{
730 nodemask_t nmask;
731 LIST_HEAD(pagelist);
732 int err = 0;
733
734 nodes_clear(nmask);
735 node_set(source, nmask);
6ce3c4c0 736
737 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
738 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
739
aaa994b3 740 if (!list_empty(&pagelist))
741 err = migrate_pages(&pagelist, new_node_page, dest);
742
7e2ab150 743 return err;
744}
745
746/*
747 * Move pages between the two nodesets so as to preserve the physical
748 * layout as much as possible.
749 *
750 * Returns the number of pages that could not be moved.
751 */
752int do_migrate_pages(struct mm_struct *mm,
753 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
754{
755 LIST_HEAD(pagelist);
756 int busy = 0;
757 int err = 0;
758 nodemask_t tmp;
39743889 759
7e2ab150 760 down_read(&mm->mmap_sem);
39743889 761
762 err = migrate_vmas(mm, from_nodes, to_nodes, flags);
763 if (err)
764 goto out;
765
766/*
767 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
768 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
769 * bit in 'tmp', and return that <source, dest> pair for migration.
770 * The pair of nodemasks 'to' and 'from' define the map.
771 *
772 * If no pair of bits is found that way, fallback to picking some
773 * pair of 'source' and 'dest' bits that are not the same. If the
774 * 'source' and 'dest' bits are the same, this represents a node
775 * that will be migrating to itself, so no pages need move.
776 *
777 * If no bits are left in 'tmp', or if all remaining bits left
778 * in 'tmp' correspond to the same bit in 'to', return false
779 * (nothing left to migrate).
780 *
781 * This lets us pick a pair of nodes to migrate between, such that
782 * if possible the dest node is not already occupied by some other
783 * source node, minimizing the risk of overloading the memory on a
784 * node that would happen if we migrated incoming memory to a node
785 * before migrating outgoing memory off that same node.
786 *
787 * A single scan of tmp is sufficient. As we go, we remember the
788 * most recent <s, d> pair that moved (s != d). If we find a pair
789 * that not only moved, but what's better, moved to an empty slot
790 * (d is not set in tmp), then we break out then, with that pair.
791 * Otherwise when we finish scanning 'tmp', we at least have the
792 * most recent <s, d> pair that moved. If we get all the way through
793 * the scan of tmp without finding any node that moved, much less
794 * moved to an empty node, then there is nothing left worth migrating.
795 */
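/*
 * Worked example of the scan below (node numbers are illustrative): with
 * from_nodes = {0,1} and to_nodes = {1,2}, node_remap() maps 0 -> 1 and
 * 1 -> 2.  The first pass over tmp = {0,1} remembers <0,1> but keeps
 * scanning because dest 1 is still a source; it then finds <1,2>, whose
 * dest is not in tmp, and breaks out.  Node 1 is therefore drained into
 * node 2 first, and node 0 is migrated into node 1 on the next pass,
 * matching the "move to an empty slot first" strategy described above.
 */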
796
797 tmp = *from_nodes;
798 while (!nodes_empty(tmp)) {
799 int s,d;
800 int source = -1;
801 int dest = 0;
802
803 for_each_node_mask(s, tmp) {
804 d = node_remap(s, *from_nodes, *to_nodes);
805 if (s == d)
806 continue;
807
808 source = s; /* Node moved. Memorize */
809 dest = d;
810
811 /* dest not in remaining from nodes? */
812 if (!node_isset(dest, tmp))
813 break;
814 }
815 if (source == -1)
816 break;
817
818 node_clear(source, tmp);
819 err = migrate_to_node(mm, source, dest, flags);
820 if (err > 0)
821 busy += err;
822 if (err < 0)
823 break;
39743889 824 }
7b2259b3 825out:
39743889 826 up_read(&mm->mmap_sem);
827 if (err < 0)
828 return err;
829 return busy;
830
831}
832
833/*
834 * Allocate a new page for page migration based on vma policy.
835 * Start assuming that page is mapped by vma pointed to by @private.
836 * Search forward from there, if not. N.B., this assumes that the
837 * list of pages handed to migrate_pages()--which is how we get here--
838 * is in virtual address order.
839 */
742755a1 840static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
841{
842 struct vm_area_struct *vma = (struct vm_area_struct *)private;
3ad33b24 843 unsigned long uninitialized_var(address);
95a402c3 844
845 while (vma) {
846 address = page_address_in_vma(page, vma);
847 if (address != -EFAULT)
848 break;
849 vma = vma->vm_next;
850 }
851
852 /*
853 * if !vma, alloc_page_vma() will use task or system default policy
854 */
855 return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
95a402c3 856}
b20a3503
CL
857#else
858
859static void migrate_page_add(struct page *page, struct list_head *pagelist,
860 unsigned long flags)
861{
39743889
CL
862}
863
b20a3503
CL
864int do_migrate_pages(struct mm_struct *mm,
865 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
866{
867 return -ENOSYS;
868}
95a402c3 869
69939749 870static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
95a402c3
CL
871{
872 return NULL;
873}
b20a3503
CL
874#endif
875
dbcb0f19 876static long do_mbind(unsigned long start, unsigned long len,
028fec41
DR
877 unsigned short mode, unsigned short mode_flags,
878 nodemask_t *nmask, unsigned long flags)
6ce3c4c0
CL
879{
880 struct vm_area_struct *vma;
881 struct mm_struct *mm = current->mm;
882 struct mempolicy *new;
883 unsigned long end;
884 int err;
885 LIST_HEAD(pagelist);
886
a3b51e01
DR
887 if (flags & ~(unsigned long)(MPOL_MF_STRICT |
888 MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
6ce3c4c0 889 return -EINVAL;
74c00241 890 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
6ce3c4c0
CL
891 return -EPERM;
892
893 if (start & ~PAGE_MASK)
894 return -EINVAL;
895
896 if (mode == MPOL_DEFAULT)
897 flags &= ~MPOL_MF_STRICT;
898
899 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
900 end = start + len;
901
902 if (end < start)
903 return -EINVAL;
904 if (end == start)
905 return 0;
906
028fec41 907 new = mpol_new(mode, mode_flags, nmask);
6ce3c4c0
CL
908 if (IS_ERR(new))
909 return PTR_ERR(new);
910
911 /*
912 * If we are using the default policy then operation
913 * on discontinuous address spaces is okay after all
914 */
915 if (!new)
916 flags |= MPOL_MF_DISCONTIG_OK;
917
028fec41
DR
918 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
919 start, start + len, mode, mode_flags,
920 nmask ? nodes_addr(*nmask)[0] : -1);
6ce3c4c0
CL
921
922 down_write(&mm->mmap_sem);
923 vma = check_range(mm, start, end, nmask,
924 flags | MPOL_MF_INVERT, &pagelist);
925
926 err = PTR_ERR(vma);
927 if (!IS_ERR(vma)) {
928 int nr_failed = 0;
929
930 err = mbind_range(vma, start, end, new);
7e2ab150 931
6ce3c4c0 932 if (!list_empty(&pagelist))
95a402c3
CL
933 nr_failed = migrate_pages(&pagelist, new_vma_page,
934 (unsigned long)vma);
6ce3c4c0
CL
935
936 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
937 err = -EIO;
938 }
b20a3503 939
6ce3c4c0
CL
940 up_write(&mm->mmap_sem);
941 mpol_free(new);
942 return err;
943}
944
945/*
946 * User space interface with variable sized bitmaps for nodelists.
947 */
948
949/* Copy a node mask from user space. */
39743889 950static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
951 unsigned long maxnode)
952{
953 unsigned long k;
954 unsigned long nlongs;
955 unsigned long endmask;
956
957 --maxnode;
958 nodes_clear(*nodes);
959 if (maxnode == 0 || !nmask)
960 return 0;
a9c930ba 961 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
636f13c1 962 return -EINVAL;
963
964 nlongs = BITS_TO_LONGS(maxnode);
965 if ((maxnode % BITS_PER_LONG) == 0)
966 endmask = ~0UL;
967 else
968 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
969
970 /* When the user specified more nodes than supported just check
971 if the non supported part is all zero. */
972 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
973 if (nlongs > PAGE_SIZE/sizeof(long))
974 return -EINVAL;
975 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
976 unsigned long t;
977 if (get_user(t, nmask + k))
978 return -EFAULT;
979 if (k == nlongs - 1) {
980 if (t & endmask)
981 return -EINVAL;
982 } else if (t)
983 return -EINVAL;
984 }
985 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
986 endmask = ~0UL;
987 }
988
989 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
990 return -EFAULT;
991 nodes_addr(*nodes)[nlongs-1] &= endmask;
992 return 0;
993}
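/*
 * Worked example (illustrative values): a caller passing maxnode = 17 ends
 * up with maxnode = 16 after the decrement, so nlongs = 1 and
 * endmask = 0xffff; one long is copied from user space and only node bits
 * 0-15 survive the final "&= endmask".  When the user mask is wider than
 * MAX_NUMNODES, the excess words must be all zero, as checked above.
 */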
994
995/* Copy a kernel node mask to user space */
996static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
997 nodemask_t *nodes)
998{
999 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1000 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1001
1002 if (copy > nbytes) {
1003 if (copy > PAGE_SIZE)
1004 return -EINVAL;
1005 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1006 return -EFAULT;
1007 copy = nbytes;
1008 }
1009 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1010}
1011
1012asmlinkage long sys_mbind(unsigned long start, unsigned long len,
1013 unsigned long mode,
1014 unsigned long __user *nmask, unsigned long maxnode,
1015 unsigned flags)
1016{
1017 nodemask_t nodes;
1018 int err;
028fec41 1019 unsigned short mode_flags;
8bccd85f 1020
028fec41
DR
1021 mode_flags = mode & MPOL_MODE_FLAGS;
1022 mode &= ~MPOL_MODE_FLAGS;
a3b51e01
DR
1023 if (mode >= MPOL_MAX)
1024 return -EINVAL;
4c50bc01
DR
1025 if ((mode_flags & MPOL_F_STATIC_NODES) &&
1026 (mode_flags & MPOL_F_RELATIVE_NODES))
1027 return -EINVAL;
8bccd85f
CL
1028 err = get_nodes(&nodes, nmask, maxnode);
1029 if (err)
1030 return err;
028fec41 1031 return do_mbind(start, len, mode, mode_flags, &nodes, flags);
8bccd85f
CL
1032}
1033
1034/* Set the process memory policy */
1035asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
1036 unsigned long maxnode)
1037{
1038 int err;
1039 nodemask_t nodes;
028fec41 1040 unsigned short flags;
8bccd85f 1041
028fec41
DR
1042 flags = mode & MPOL_MODE_FLAGS;
1043 mode &= ~MPOL_MODE_FLAGS;
1044 if ((unsigned int)mode >= MPOL_MAX)
8bccd85f 1045 return -EINVAL;
4c50bc01
DR
1046 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1047 return -EINVAL;
8bccd85f
CL
1048 err = get_nodes(&nodes, nmask, maxnode);
1049 if (err)
1050 return err;
028fec41 1051 return do_set_mempolicy(mode, flags, &nodes);
8bccd85f
CL
1052}
1053
39743889
CL
1054asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
1055 const unsigned long __user *old_nodes,
1056 const unsigned long __user *new_nodes)
1057{
1058 struct mm_struct *mm;
1059 struct task_struct *task;
1060 nodemask_t old;
1061 nodemask_t new;
1062 nodemask_t task_nodes;
1063 int err;
1064
1065 err = get_nodes(&old, old_nodes, maxnode);
1066 if (err)
1067 return err;
1068
1069 err = get_nodes(&new, new_nodes, maxnode);
1070 if (err)
1071 return err;
1072
1073 /* Find the mm_struct */
1074 read_lock(&tasklist_lock);
228ebcbe 1075 task = pid ? find_task_by_vpid(pid) : current;
39743889
CL
1076 if (!task) {
1077 read_unlock(&tasklist_lock);
1078 return -ESRCH;
1079 }
1080 mm = get_task_mm(task);
1081 read_unlock(&tasklist_lock);
1082
1083 if (!mm)
1084 return -EINVAL;
1085
1086 /*
1087 * Check if this process has the right to modify the specified
1088 * process. The right exists if the process has administrative
7f927fcc 1089 * capabilities, superuser privileges or the same
39743889
CL
1090 * userid as the target process.
1091 */
1092 if ((current->euid != task->suid) && (current->euid != task->uid) &&
1093 (current->uid != task->suid) && (current->uid != task->uid) &&
74c00241 1094 !capable(CAP_SYS_NICE)) {
39743889
CL
1095 err = -EPERM;
1096 goto out;
1097 }
1098
1099 task_nodes = cpuset_mems_allowed(task);
1100 /* Is the user allowed to access the target nodes? */
74c00241 1101 if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
39743889
CL
1102 err = -EPERM;
1103 goto out;
1104 }
1105
37b07e41 1106 if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
3b42d28b
CL
1107 err = -EINVAL;
1108 goto out;
1109 }
1110
86c3a764
DQ
1111 err = security_task_movememory(task);
1112 if (err)
1113 goto out;
1114
511030bc 1115 err = do_migrate_pages(mm, &old, &new,
74c00241 1116 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
39743889
CL
1117out:
1118 mmput(mm);
1119 return err;
1120}
1121
1122
8bccd85f
CL
1123/* Retrieve NUMA policy */
1124asmlinkage long sys_get_mempolicy(int __user *policy,
1125 unsigned long __user *nmask,
1126 unsigned long maxnode,
1127 unsigned long addr, unsigned long flags)
1128{
dbcb0f19
AB
1129 int err;
1130 int uninitialized_var(pval);
8bccd85f
CL
1131 nodemask_t nodes;
1132
1133 if (nmask != NULL && maxnode < MAX_NUMNODES)
1134 return -EINVAL;
1135
1136 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1137
1138 if (err)
1139 return err;
1140
1141 if (policy && put_user(pval, policy))
1142 return -EFAULT;
1143
1144 if (nmask)
1145 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1146
1147 return err;
1148}
1149
1da177e4
LT
1150#ifdef CONFIG_COMPAT
1151
1152asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1153 compat_ulong_t __user *nmask,
1154 compat_ulong_t maxnode,
1155 compat_ulong_t addr, compat_ulong_t flags)
1156{
1157 long err;
1158 unsigned long __user *nm = NULL;
1159 unsigned long nr_bits, alloc_size;
1160 DECLARE_BITMAP(bm, MAX_NUMNODES);
1161
1162 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1163 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1164
1165 if (nmask)
1166 nm = compat_alloc_user_space(alloc_size);
1167
1168 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1169
1170 if (!err && nmask) {
1171 err = copy_from_user(bm, nm, alloc_size);
1172 /* ensure entire bitmap is zeroed */
1173 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1174 err |= compat_put_bitmap(nmask, bm, nr_bits);
1175 }
1176
1177 return err;
1178}
1179
1180asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1181 compat_ulong_t maxnode)
1182{
1183 long err = 0;
1184 unsigned long __user *nm = NULL;
1185 unsigned long nr_bits, alloc_size;
1186 DECLARE_BITMAP(bm, MAX_NUMNODES);
1187
1188 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1189 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1190
1191 if (nmask) {
1192 err = compat_get_bitmap(bm, nmask, nr_bits);
1193 nm = compat_alloc_user_space(alloc_size);
1194 err |= copy_to_user(nm, bm, alloc_size);
1195 }
1196
1197 if (err)
1198 return -EFAULT;
1199
1200 return sys_set_mempolicy(mode, nm, nr_bits+1);
1201}
1202
1203asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1204 compat_ulong_t mode, compat_ulong_t __user *nmask,
1205 compat_ulong_t maxnode, compat_ulong_t flags)
1206{
1207 long err = 0;
1208 unsigned long __user *nm = NULL;
1209 unsigned long nr_bits, alloc_size;
dfcd3c0d 1210 nodemask_t bm;
1da177e4
LT
1211
1212 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1213 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1214
1215 if (nmask) {
dfcd3c0d 1216 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1da177e4 1217 nm = compat_alloc_user_space(alloc_size);
dfcd3c0d 1218 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1da177e4
LT
1219 }
1220
1221 if (err)
1222 return -EFAULT;
1223
1224 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1225}
1226
1227#endif
1228
1229/*
1230 * get_vma_policy(@task, @vma, @addr)
1231 * @task - task for fallback if vma policy == default
1232 * @vma - virtual memory area whose policy is sought
1233 * @addr - address in @vma for shared policy lookup
1234 *
1235 * Returns effective policy for a VMA at specified address.
1236 * Falls back to @task or system default policy, as necessary.
1237 * Returned policy has extra reference count if shared, vma,
1238 * or some other task's policy [show_numa_maps() can pass
1239 * @task != current]. It is the caller's responsibility to
1240 * free the reference in these cases.
1241 */
1242static struct mempolicy * get_vma_policy(struct task_struct *task,
1243 struct vm_area_struct *vma, unsigned long addr)
1da177e4 1244{
6e21c8f1 1245 struct mempolicy *pol = task->mempolicy;
480eccf9 1246 int shared_pol = 0;
1da177e4
LT
1247
1248 if (vma) {
480eccf9 1249 if (vma->vm_ops && vma->vm_ops->get_policy) {
8bccd85f 1250 pol = vma->vm_ops->get_policy(vma, addr);
480eccf9
LS
1251 shared_pol = 1; /* if pol non-NULL, add ref below */
1252 } else if (vma->vm_policy &&
1da177e4
LT
1253 vma->vm_policy->policy != MPOL_DEFAULT)
1254 pol = vma->vm_policy;
1255 }
1256 if (!pol)
1257 pol = &default_policy;
480eccf9
LS
1258 else if (!shared_pol && pol != current->mempolicy)
1259 mpol_get(pol); /* vma or other task's policy */
1da177e4
LT
1260 return pol;
1261}
1262
19770b32
MG
1263/* Return a nodemask representing a mempolicy */
1264static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
1265{
1266 /* Lower zones don't get a nodemask applied for MPOL_BIND */
1267 if (unlikely(policy->policy == MPOL_BIND) &&
1268 gfp_zone(gfp) >= policy_zone &&
1269 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1270 return &policy->v.nodes;
1271
1272 return NULL;
1273}
1274
1da177e4 1275/* Return a zonelist representing a mempolicy */
dd0fc66f 1276static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1da177e4
LT
1277{
1278 int nd;
1279
1280 switch (policy->policy) {
1281 case MPOL_PREFERRED:
1282 nd = policy->v.preferred_node;
1283 if (nd < 0)
1284 nd = numa_node_id();
1285 break;
1286 case MPOL_BIND:
1287 /*
1288 * Normally, MPOL_BIND allocations are node-local within the
1289 * allowed nodemask. However, if __GFP_THISNODE is set and the
1290 * current node isn't part of the mask, we use the zonelist for
1291 * the first node in the mask instead.
1292 */
1293 nd = numa_node_id();
1294 if (unlikely(gfp & __GFP_THISNODE) &&
1295 unlikely(!node_isset(nd, policy->v.nodes)))
1296 nd = first_node(policy->v.nodes);
1297 break;
1da177e4
LT
1298 case MPOL_INTERLEAVE: /* should not happen */
1299 case MPOL_DEFAULT:
1300 nd = numa_node_id();
1301 break;
1302 default:
1303 nd = 0;
1304 BUG();
1305 }
0e88460d 1306 return node_zonelist(nd, gfp);
1da177e4
LT
1307}
1308
1309/* Do dynamic interleaving for a process */
1310static unsigned interleave_nodes(struct mempolicy *policy)
1311{
1312 unsigned nid, next;
1313 struct task_struct *me = current;
1314
1315 nid = me->il_next;
dfcd3c0d 1316 next = next_node(nid, policy->v.nodes);
1da177e4 1317 if (next >= MAX_NUMNODES)
dfcd3c0d 1318 next = first_node(policy->v.nodes);
f5b087b5
DR
1319 if (next < MAX_NUMNODES)
1320 me->il_next = next;
1da177e4
LT
1321 return nid;
1322}
1323
dc85da15
CL
1324/*
1325 * Depending on the memory policy provide a node from which to allocate the
1326 * next slab entry.
1327 */
1328unsigned slab_node(struct mempolicy *policy)
1329{
a3b51e01 1330 unsigned short pol = policy ? policy->policy : MPOL_DEFAULT;
765c4507
CL
1331
1332 switch (pol) {
dc85da15
CL
1333 case MPOL_INTERLEAVE:
1334 return interleave_nodes(policy);
1335
dd1a239f 1336 case MPOL_BIND: {
dc85da15
CL
1337 /*
1338 * Follow bind policy behavior and start allocation at the
1339 * first node.
1340 */
19770b32
MG
1341 struct zonelist *zonelist;
1342 struct zone *zone;
1343 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1344 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1345 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1346 &policy->v.nodes,
1347 &zone);
1348 return zone->node;
dd1a239f 1349 }
dc85da15
CL
1350
1351 case MPOL_PREFERRED:
1352 if (policy->v.preferred_node >= 0)
1353 return policy->v.preferred_node;
1354 /* Fall through */
1355
1356 default:
1357 return numa_node_id();
1358 }
1359}
1360
1361/* Do static interleaving for a VMA with known offset. */
1362static unsigned offset_il_node(struct mempolicy *pol,
1363 struct vm_area_struct *vma, unsigned long off)
1364{
dfcd3c0d 1365 unsigned nnodes = nodes_weight(pol->v.nodes);
f5b087b5 1366 unsigned target;
1367 int c;
1368 int nid = -1;
1369
1370 if (!nnodes)
1371 return numa_node_id();
1372 target = (unsigned int)off % nnodes;
1373 c = 0;
1374 do {
dfcd3c0d 1375 nid = next_node(nid, pol->v.nodes);
1376 c++;
1377 } while (c <= target);
1378 return nid;
1379}
1380
1381/* Determine a node number for interleave */
1382static inline unsigned interleave_nid(struct mempolicy *pol,
1383 struct vm_area_struct *vma, unsigned long addr, int shift)
1384{
1385 if (vma) {
1386 unsigned long off;
1387
1388 /*
1389 * for small pages, there is no difference between
1390 * shift and PAGE_SHIFT, so the bit-shift is safe.
1391 * for huge pages, since vm_pgoff is in units of small
1392 * pages, we need to shift off the always 0 bits to get
1393 * a useful offset.
1394 */
1395 BUG_ON(shift < PAGE_SHIFT);
1396 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1397 off += (addr - vma->vm_start) >> shift;
1398 return offset_il_node(pol, vma, off);
1399 } else
1400 return interleave_nodes(pol);
1401}
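/*
 * Worked example for the two helpers above (node numbers illustrative):
 * with an interleave policy over nodes {0,2,5} and a mapping of small
 * pages starting at vm_pgoff 0, page offset 7 gives target = 7 % 3 = 1
 * and the walk over the nodemask returns node 2; offsets 0, 3, 6, ... map
 * to node 0, offsets 1, 4, 7, ... to node 2 and offsets 2, 5, 8, ... to
 * node 5.  Because the choice is derived from the offset into the object,
 * the placement is stable across faults and processes, unlike
 * interleave_nodes() above, which advances the per-task il_next counter.
 */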
1402
1403#ifdef CONFIG_HUGETLBFS
1404/*
1405 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1406 * @vma = virtual memory area whose policy is sought
1407 * @addr = address in @vma for shared policy lookup and interleave policy
1408 * @gfp_flags = for requested zone
1409 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1410 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1411 *
1412 * Returns a zonelist suitable for a huge page allocation.
1413 * If the effective policy is MPOL_BIND, returns pointer to local node's zonelist,
1414 * and a pointer to the mempolicy's @nodemask for filtering the zonelist.
1415 * If it is also a policy for which get_vma_policy() returns an extra
1416 * reference, we must hold that reference until after the allocation.
1417 * In that case, return policy via @mpol so hugetlb allocation can drop
1418 * the reference. For non-MPOL_BIND referenced policies, we can/do drop the
1419 * reference here, so the caller doesn't need to know about the special case
1420 * for default and current task policy.
1421 */
396faf03 1422struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
19770b32
MG
1423 gfp_t gfp_flags, struct mempolicy **mpol,
1424 nodemask_t **nodemask)
5da7ca86
CL
1425{
1426 struct mempolicy *pol = get_vma_policy(current, vma, addr);
480eccf9 1427 struct zonelist *zl;
5da7ca86 1428
480eccf9 1429 *mpol = NULL; /* probably no unref needed */
19770b32
MG
1430 *nodemask = NULL; /* assume !MPOL_BIND */
1431 if (pol->policy == MPOL_BIND) {
1432 *nodemask = &pol->v.nodes;
1433 } else if (pol->policy == MPOL_INTERLEAVE) {
5da7ca86
CL
1434 unsigned nid;
1435
1436 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
69682d85
LS
1437 if (unlikely(pol != &default_policy &&
1438 pol != current->mempolicy))
1439 __mpol_free(pol); /* finished with pol */
0e88460d 1440 return node_zonelist(nid, gfp_flags);
5da7ca86 1441 }
480eccf9
LS
1442
1443 zl = zonelist_policy(GFP_HIGHUSER, pol);
1444 if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
1445 if (pol->policy != MPOL_BIND)
1446 __mpol_free(pol); /* finished with pol */
1447 else
1448 *mpol = pol; /* unref needed after allocation */
1449 }
1450 return zl;
5da7ca86 1451}
00ac59ad 1452#endif
5da7ca86 1453
1da177e4
LT
1454/* Allocate a page in interleaved policy.
1455 Own path because it needs to do special accounting. */
662f3a0b
AK
1456static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1457 unsigned nid)
1da177e4
LT
1458{
1459 struct zonelist *zl;
1460 struct page *page;
1461
0e88460d 1462 zl = node_zonelist(nid, gfp);
1da177e4 1463 page = __alloc_pages(gfp, order, zl);
dd1a239f 1464 if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
ca889e6c 1465 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1da177e4
LT
1466 return page;
1467}
1468
1469/**
1470 * alloc_page_vma - Allocate a page for a VMA.
1471 *
1472 * @gfp:
1473 * %GFP_USER user allocation.
1474 * %GFP_KERNEL kernel allocations,
1475 * %GFP_HIGHMEM highmem/user allocations,
1476 * %GFP_FS allocation should not call back into a file system.
1477 * %GFP_ATOMIC don't sleep.
1478 *
1479 * @vma: Pointer to VMA or NULL if not available.
1480 * @addr: Virtual Address of the allocation. Must be inside the VMA.
1481 *
1482 * This function allocates a page from the kernel page pool and applies
1483 * a NUMA policy associated with the VMA or the current process.
1484 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
1485 * mm_struct of the VMA to prevent it from going away. Should be used for
1486 * all allocations for pages that will be mapped into
1487 * user space. Returns NULL when no page can be allocated.
1488 *
1489 * Should be called with the mmap_sem of the vma held.
1490 */
1491struct page *
dd0fc66f 1492alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1da177e4 1493{
6e21c8f1 1494 struct mempolicy *pol = get_vma_policy(current, vma, addr);
480eccf9 1495 struct zonelist *zl;
1da177e4 1496
cf2a473c 1497 cpuset_update_task_memory_state();
1da177e4
LT
1498
1499 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1500 unsigned nid;
5da7ca86
CL
1501
1502 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
69682d85
LS
1503 if (unlikely(pol != &default_policy &&
1504 pol != current->mempolicy))
1505 __mpol_free(pol); /* finished with pol */
1da177e4
LT
1506 return alloc_page_interleave(gfp, 0, nid);
1507 }
480eccf9
LS
1508 zl = zonelist_policy(gfp, pol);
1509 if (pol != &default_policy && pol != current->mempolicy) {
1510 /*
1511 * slow path: ref counted policy -- shared or vma
1512 */
19770b32
MG
1513 struct page *page = __alloc_pages_nodemask(gfp, 0,
1514 zl, nodemask_policy(gfp, pol));
480eccf9
LS
1515 __mpol_free(pol);
1516 return page;
1517 }
1518 /*
1519 * fast path: default or task policy
1520 */
19770b32 1521 return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
1da177e4
LT
1522}
1523
1524/**
1525 * alloc_pages_current - Allocate pages.
1526 *
1527 * @gfp:
1528 * %GFP_USER user allocation,
1529 * %GFP_KERNEL kernel allocation,
1530 * %GFP_HIGHMEM highmem allocation,
1531 * %GFP_FS don't call back into a file system.
1532 * %GFP_ATOMIC don't sleep.
1533 * @order: Power of two of allocation size in pages. 0 is a single page.
1534 *
1535 * Allocate a page from the kernel page pool. When not in
1536 * interrupt context, apply the current process' NUMA policy.
1537 * Returns NULL when no page can be allocated.
1538 *
cf2a473c 1539 * Don't call cpuset_update_task_memory_state() unless
1da177e4
LT
1540 * 1) it's ok to take cpuset_sem (can WAIT), and
1541 * 2) allocating for current task (not interrupt).
1542 */
dd0fc66f 1543struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1da177e4
LT
1544{
1545 struct mempolicy *pol = current->mempolicy;
1546
1547 if ((gfp & __GFP_WAIT) && !in_interrupt())
cf2a473c 1548 cpuset_update_task_memory_state();
9b819d20 1549 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1da177e4
LT
1550 pol = &default_policy;
1551 if (pol->policy == MPOL_INTERLEAVE)
1552 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
19770b32
MG
1553 return __alloc_pages_nodemask(gfp, order,
1554 zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
1da177e4
LT
1555}
1556EXPORT_SYMBOL(alloc_pages_current);
1557
4225399a
PJ
1558/*
1559 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1560 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
1561 * with the mems_allowed returned by cpuset_mems_allowed(). This
1562 * keeps mempolicies cpuset relative after its cpuset moves. See
1563 * further kernel/cpuset.c update_nodemask().
1564 */
4225399a 1565
1da177e4
LT
1566/* Slow path of a mempolicy copy */
1567struct mempolicy *__mpol_copy(struct mempolicy *old)
1568{
1569 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1570
1571 if (!new)
1572 return ERR_PTR(-ENOMEM);
4225399a
PJ
1573 if (current_cpuset_is_being_rebound()) {
1574 nodemask_t mems = cpuset_mems_allowed(current);
1575 mpol_rebind_policy(old, &mems);
1576 }
1da177e4
LT
1577 *new = *old;
1578 atomic_set(&new->refcnt, 1);
1da177e4
LT
1579 return new;
1580}
1581
f5b087b5
DR
1582static int mpol_match_intent(const struct mempolicy *a,
1583 const struct mempolicy *b)
1584{
1585 if (a->flags != b->flags)
1586 return 0;
1587 if (!mpol_store_user_nodemask(a))
1588 return 1;
1589 return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1590}
1591
1da177e4
LT
1592/* Slow path of a mempolicy comparison */
1593int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1594{
1595 if (!a || !b)
1596 return 0;
1597 if (a->policy != b->policy)
1598 return 0;
f5b087b5
DR
1599 if (a->policy != MPOL_DEFAULT && !mpol_match_intent(a, b))
1600 return 0;
1da177e4
LT
1601 switch (a->policy) {
1602 case MPOL_DEFAULT:
1603 return 1;
19770b32
MG
1604 case MPOL_BIND:
1605 /* Fall through */
1da177e4 1606 case MPOL_INTERLEAVE:
dfcd3c0d 1607 return nodes_equal(a->v.nodes, b->v.nodes);
1da177e4
LT
1608 case MPOL_PREFERRED:
1609 return a->v.preferred_node == b->v.preferred_node;
1da177e4
LT
1610 default:
1611 BUG();
1612 return 0;
1613 }
1614}
1615
1616/* Slow path of a mpol destructor. */
1617void __mpol_free(struct mempolicy *p)
1618{
1619 if (!atomic_dec_and_test(&p->refcnt))
1620 return;
1da177e4
LT
1621 p->policy = MPOL_DEFAULT;
1622 kmem_cache_free(policy_cache, p);
1623}
1624
1da177e4
LT
1625/*
1626 * Shared memory backing store policy support.
1627 *
1628 * Remember policies even when nobody has shared memory mapped.
1629 * The policies are kept in Red-Black tree linked from the inode.
1630 * They are protected by the sp->lock spinlock, which should be held
1631 * for any accesses to the tree.
1632 */
1633
1634/* lookup first element intersecting start-end */
1635/* Caller holds sp->lock */
1636static struct sp_node *
1637sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1638{
1639 struct rb_node *n = sp->root.rb_node;
1640
1641 while (n) {
1642 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1643
1644 if (start >= p->end)
1645 n = n->rb_right;
1646 else if (end <= p->start)
1647 n = n->rb_left;
1648 else
1649 break;
1650 }
1651 if (!n)
1652 return NULL;
1653 for (;;) {
1654 struct sp_node *w = NULL;
1655 struct rb_node *prev = rb_prev(n);
1656 if (!prev)
1657 break;
1658 w = rb_entry(prev, struct sp_node, nd);
1659 if (w->end <= start)
1660 break;
1661 n = prev;
1662 }
1663 return rb_entry(n, struct sp_node, nd);
1664}
1665
1666/* Insert a new shared policy into the list. */
1667/* Caller holds sp->lock */
1668static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1669{
1670 struct rb_node **p = &sp->root.rb_node;
1671 struct rb_node *parent = NULL;
1672 struct sp_node *nd;
1673
1674 while (*p) {
1675 parent = *p;
1676 nd = rb_entry(parent, struct sp_node, nd);
1677 if (new->start < nd->start)
1678 p = &(*p)->rb_left;
1679 else if (new->end > nd->end)
1680 p = &(*p)->rb_right;
1681 else
1682 BUG();
1683 }
1684 rb_link_node(&new->nd, parent, p);
1685 rb_insert_color(&new->nd, &sp->root);
140d5a49 1686 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1da177e4
LT
1687 new->policy ? new->policy->policy : 0);
1688}
1689
1690/* Find shared policy intersecting idx */
1691struct mempolicy *
1692mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1693{
1694 struct mempolicy *pol = NULL;
1695 struct sp_node *sn;
1696
1697 if (!sp->root.rb_node)
1698 return NULL;
1699 spin_lock(&sp->lock);
1700 sn = sp_lookup(sp, idx, idx+1);
1701 if (sn) {
1702 mpol_get(sn->policy);
1703 pol = sn->policy;
1704 }
1705 spin_unlock(&sp->lock);
1706 return pol;
1707}
1708
1709static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1710{
1711 pr_debug("deleting %lx-%lx\n", n->start, n->end);
1da177e4
LT
1712 rb_erase(&n->nd, &sp->root);
1713 mpol_free(n->policy);
1714 kmem_cache_free(sn_cache, n);
1715}
1716
dbcb0f19
AB
1717static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1718 struct mempolicy *pol)
1da177e4
LT
1719{
1720 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1721
1722 if (!n)
1723 return NULL;
1724 n->start = start;
1725 n->end = end;
1726 mpol_get(pol);
1727 n->policy = pol;
1728 return n;
1729}
1730
1731/* Replace a policy range. */
1732static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1733 unsigned long end, struct sp_node *new)
1734{
1735 struct sp_node *n, *new2 = NULL;
1736
1737restart:
1738 spin_lock(&sp->lock);
1739 n = sp_lookup(sp, start, end);
1740 /* Take care of old policies in the same range. */
1741 while (n && n->start < end) {
1742 struct rb_node *next = rb_next(&n->nd);
1743 if (n->start >= start) {
1744 if (n->end <= end)
1745 sp_delete(sp, n);
1746 else
1747 n->start = end;
1748 } else {
1749 /* Old policy spanning whole new range. */
1750 if (n->end > end) {
1751 if (!new2) {
1752 spin_unlock(&sp->lock);
1753 new2 = sp_alloc(end, n->end, n->policy);
1754 if (!new2)
1755 return -ENOMEM;
1756 goto restart;
1757 }
1758 n->end = start;
1759 sp_insert(sp, new2);
1760 new2 = NULL;
1761 break;
1762 } else
1763 n->end = start;
1764 }
1765 if (!next)
1766 break;
1767 n = rb_entry(next, struct sp_node, nd);
1768 }
1769 if (new)
1770 sp_insert(sp, new);
1771 spin_unlock(&sp->lock);
1772 if (new2) {
1773 mpol_free(new2->policy);
1774 kmem_cache_free(sn_cache, new2);
1775 }
1776 return 0;
1777}

void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
                        unsigned short flags, nodemask_t *policy_nodes)
{
        info->root = RB_ROOT;
        spin_lock_init(&info->lock);

        if (policy != MPOL_DEFAULT) {
                struct mempolicy *newpol;

                /* Falls back to MPOL_DEFAULT on any error */
                newpol = mpol_new(policy, flags, policy_nodes);
                if (!IS_ERR(newpol)) {
                        /* Create pseudo-vma that contains just the policy */
                        struct vm_area_struct pvma;

                        memset(&pvma, 0, sizeof(struct vm_area_struct));
                        /* Policy covers entire file */
                        pvma.vm_end = TASK_SIZE;
                        mpol_set_shared_policy(info, &pvma, newpol);
                        mpol_free(newpol);
                }
        }
}
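
/*
 * mpol_shared_policy_init() above is what gives a freshly created
 * shared object (e.g. a tmpfs inode) its initial policy: the zeroed
 * pseudo-vma with vm_end == TASK_SIZE makes the whole file one range
 * in the policy tree.
 */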

int mpol_set_shared_policy(struct shared_policy *info,
                        struct vm_area_struct *vma, struct mempolicy *npol)
{
        int err;
        struct sp_node *new = NULL;
        unsigned long sz = vma_pages(vma);

        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
                 vma->vm_pgoff,
                 sz, npol ? npol->policy : -1,
                 npol ? npol->flags : -1,
                 npol ? nodes_addr(npol->v.nodes)[0] : -1);

        if (npol) {
                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
                if (!new)
                        return -ENOMEM;
        }
        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
        if (err && new)
                kmem_cache_free(sn_cache, new);
        return err;
}
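
/*
 * Example for mpol_set_shared_policy() above: a vma mapping pages
 * 16..31 of the object (vm_pgoff == 16, vma_pages() == 16) replaces
 * the policy for offsets [16, 32) in the object's tree, independent of
 * where the vma sits in the caller's address space.
 */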

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
        struct sp_node *n;
        struct rb_node *next;

        if (!p->root.rb_node)
                return;
        spin_lock(&p->lock);
        next = rb_first(&p->root);
        while (next) {
                n = rb_entry(next, struct sp_node, nd);
                next = rb_next(&n->nd);
                rb_erase(&n->nd, &p->root);
                mpol_free(n->policy);
                kmem_cache_free(sn_cache, n);
        }
        spin_unlock(&p->lock);
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
        nodemask_t interleave_nodes;
        unsigned long largest = 0;
        int nid, prefer = 0;

        policy_cache = kmem_cache_create("numa_policy",
                                         sizeof(struct mempolicy),
                                         0, SLAB_PANIC, NULL);

        sn_cache = kmem_cache_create("shared_policy_node",
                                     sizeof(struct sp_node),
                                     0, SLAB_PANIC, NULL);

        /*
         * Set interleaving policy for system init. Interleaving is only
         * enabled across suitably sized nodes (default is >= 16MB), or
         * fall back to the largest node if they're all smaller.
         */
        nodes_clear(interleave_nodes);
        for_each_node_state(nid, N_HIGH_MEMORY) {
                unsigned long total_pages = node_present_pages(nid);

                /* Preserve the largest node */
                if (largest < total_pages) {
                        largest = total_pages;
                        prefer = nid;
                }

                /* Interleave this node? */
                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
                        node_set(nid, interleave_nodes);
        }

        /* All too small, use the largest */
        if (unlikely(nodes_empty(interleave_nodes)))
                node_set(prefer, interleave_nodes);

        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
                printk("numa_policy_init: interleaving failed\n");
}
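
/*
 * The size check above works in bytes: (total_pages << PAGE_SHIFT)
 * must reach 16MB (16 << 20), i.e. with 4KB pages a node needs at
 * least 4096 present pages to join the boot-time interleave set.
 */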

/* Reset policy of current process to default */
void numa_default_policy(void)
{
        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}

/*
 * Display pages allocated per node and memory policy via /proc.
 */
static const char * const policy_types[] =
        { "default", "prefer", "bind", "interleave" };

/*
 * Convert a mempolicy into a string.
 * Returns the number of characters in buffer (if positive)
 * or an error (negative)
 */
static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
        char *p = buffer;
        int l;
        nodemask_t nodes;
        unsigned short mode = pol ? pol->policy : MPOL_DEFAULT;
        unsigned short flags = pol ? pol->flags : 0;

        switch (mode) {
        case MPOL_DEFAULT:
                nodes_clear(nodes);
                break;

        case MPOL_PREFERRED:
                nodes_clear(nodes);
                node_set(pol->v.preferred_node, nodes);
                break;

        case MPOL_BIND:
                /* Fall through */
        case MPOL_INTERLEAVE:
                nodes = pol->v.nodes;
                break;

        default:
                BUG();
                return -EFAULT;
        }

        l = strlen(policy_types[mode]);
        if (buffer + maxlen < p + l + 1)
                return -ENOSPC;

        strcpy(p, policy_types[mode]);
        p += l;

        if (flags) {
                int need_bar = 0;

                if (buffer + maxlen < p + 2)
                        return -ENOSPC;
                *p++ = '=';

                if (flags & MPOL_F_STATIC_NODES)
                        p += sprintf(p, "%sstatic", need_bar++ ? "|" : "");
                if (flags & MPOL_F_RELATIVE_NODES)
                        p += sprintf(p, "%srelative", need_bar++ ? "|" : "");
        }

        if (!nodes_empty(nodes)) {
                if (buffer + maxlen < p + 2)
                        return -ENOSPC;
                *p++ = '=';
                p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
        }
        return p - buffer;
}
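
/*
 * Example outputs of mpol_to_str() above: "default", "prefer=1",
 * "interleave=0-3", or with mode flags something like
 * "bind=static=0,2" (policy name, then "=<flags>", then "=<nodelist>").
 */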

struct numa_maps {
        unsigned long pages;
        unsigned long anon;
        unsigned long active;
        unsigned long writeback;
        unsigned long mapcount_max;
        unsigned long dirty;
        unsigned long swapcache;
        unsigned long node[MAX_NUMNODES];
};
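
/* All counters above are in pages; node[] is indexed by node id. */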

static void gather_stats(struct page *page, void *private, int pte_dirty)
{
        struct numa_maps *md = private;
        int count = page_mapcount(page);

        md->pages++;
        if (pte_dirty || PageDirty(page))
                md->dirty++;

        if (PageSwapCache(page))
                md->swapcache++;

        if (PageActive(page))
                md->active++;

        if (PageWriteback(page))
                md->writeback++;

        if (PageAnon(page))
                md->anon++;

        if (count > md->mapcount_max)
                md->mapcount_max = count;

        md->node[page_to_nid(page)]++;
}

#ifdef CONFIG_HUGETLB_PAGE
static void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
        unsigned long addr;
        struct page *page;

        for (addr = start; addr < end; addr += HPAGE_SIZE) {
                pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
                pte_t pte;

                if (!ptep)
                        continue;

                pte = *ptep;
                if (pte_none(pte))
                        continue;

                page = pte_page(pte);
                if (!page)
                        continue;

                gather_stats(page, md, pte_dirty(*ptep));
        }
}
#else
static inline void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
}
#endif
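
/*
 * check_huge_range() samples one pte per HPAGE_SIZE step, so each
 * mapped huge page contributes a single entry to the statistics
 * rather than one per base page.
 */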

int show_numa_map(struct seq_file *m, void *v)
{
        struct proc_maps_private *priv = m->private;
        struct vm_area_struct *vma = v;
        struct numa_maps *md;
        struct file *file = vma->vm_file;
        struct mm_struct *mm = vma->vm_mm;
        struct mempolicy *pol;
        int n;
        char buffer[50];

        if (!mm)
                return 0;

        md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
        if (!md)
                return 0;

        pol = get_vma_policy(priv->task, vma, vma->vm_start);
        mpol_to_str(buffer, sizeof(buffer), pol);
        /*
         * unref shared or other task's mempolicy
         */
        if (pol != &default_policy && pol != current->mempolicy)
                __mpol_free(pol);

        seq_printf(m, "%08lx %s", vma->vm_start, buffer);

        if (file) {
                seq_printf(m, " file=");
                seq_path(m, &file->f_path, "\n\t= ");
        } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
                seq_printf(m, " heap");
        } else if (vma->vm_start <= mm->start_stack &&
                        vma->vm_end >= mm->start_stack) {
                seq_printf(m, " stack");
        }

        if (is_vm_hugetlb_page(vma)) {
                check_huge_range(vma, vma->vm_start, vma->vm_end, md);
                seq_printf(m, " huge");
        } else {
                check_pgd_range(vma, vma->vm_start, vma->vm_end,
                        &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
        }

        if (!md->pages)
                goto out;

        if (md->anon)
                seq_printf(m, " anon=%lu", md->anon);

        if (md->dirty)
                seq_printf(m, " dirty=%lu", md->dirty);

        if (md->pages != md->anon && md->pages != md->dirty)
                seq_printf(m, " mapped=%lu", md->pages);

        if (md->mapcount_max > 1)
                seq_printf(m, " mapmax=%lu", md->mapcount_max);

        if (md->swapcache)
                seq_printf(m, " swapcache=%lu", md->swapcache);

        if (md->active < md->pages && !is_vm_hugetlb_page(vma))
                seq_printf(m, " active=%lu", md->active);

        if (md->writeback)
                seq_printf(m, " writeback=%lu", md->writeback);

        for_each_node_state(n, N_HIGH_MEMORY)
                if (md->node[n])
                        seq_printf(m, " N%d=%lu", n, md->node[n]);
out:
        seq_putc(m, '\n');
        kfree(md);

        if (m->count < m->size)
                m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
        return 0;
}
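
/*
 * Illustrative /proc/<pid>/numa_maps line produced by the code above
 * (addresses and counts are made up):
 *
 *      2aaaaac000 interleave=0-3 file=/lib/libc-2.7.so mapped=12 mapmax=32 N0=3 N1=3 N2=3 N3=3
 *
 * i.e. start address, policy string, then only the counters that are
 * non-zero for the vma.
 */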