/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints about which node(s) memory
 * should be allocated on.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a process counter
 *		is used.
 *
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 *		FIXME: memory is allocated starting with the first node
 *		to the last. It would be better if bind would truly restrict
 *		the allocation to memory nodes instead.
 *
 * preferred	Try a specific node first before normal fallback.
 *		As a special case node -1 here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non-default
 *		process policy.
 *
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
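
/*
 * Example (not part of the original file): a rough sketch of how user space
 * would select these policies, assuming the numaif.h wrappers shipped with
 * libnuma (link with -lnuma).  The mask values, addr/length placeholders and
 * error handling are purely illustrative; maxnode is passed as one more than
 * the number of mask bits, matching what get_nodes() below expects.
 *
 *	#include <numaif.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *
 *	// Interleave all future allocations of this process over nodes 0-1.
 *	if (set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask) + 1) < 0)
 *		perror("set_mempolicy");
 *
 *	// Bind an existing mapping to node 0 only; MPOL_MF_STRICT makes the
 *	// call fail (EIO) if pages already present are misplaced.
 *	unsigned long node0 = 1UL << 0;
 *	if (mbind(addr, length, MPOL_BIND, &node0, 8 * sizeof(node0) + 1,
 *		  MPOL_MF_STRICT) < 0)
 *		perror("mbind");
 */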

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always graceful about that.
   could replace all the switch()es with a mempolicy_ops structure.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>

static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
static int policy_zone;

struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};

/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
	int empty = nodes_empty(*nodes);

	switch (mode) {
	case MPOL_DEFAULT:
		if (!empty)
			return -EINVAL;
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		/* Preferred will only use the first bit, but allow
		   more for now. */
		if (empty)
			return -EINVAL;
		break;
	}
	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}

/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
	struct zonelist *zl;
	int num, max, nd;

	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
	if (!zl)
		return NULL;
	num = 0;
	for_each_node_mask(nd, *nodes) {
		int k;
		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
			struct zone *z = &NODE_DATA(nd)->node_zones[k];
			if (!z->present_pages)
				continue;
			zl->zones[num++] = z;
			if (k > policy_zone)
				policy_zone = k;
		}
	}
	zl->zones[num] = NULL;
	return zl;
}
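
/*
 * Worked example for the sizing above (an illustration, not from the
 * original source): with MAX_NR_ZONES == 3 and a nodemask holding two
 * nodes, max = 1 + 3 * 2 = 7, i.e. room for every populated zone of both
 * nodes plus the terminating NULL entry filled in at the end of the loop.
 */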

/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
	struct mempolicy *policy;

	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
	if (mode == MPOL_DEFAULT)
		return NULL;
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	switch (mode) {
	case MPOL_INTERLEAVE:
		policy->v.nodes = *nodes;
		break;
	case MPOL_PREFERRED:
		policy->v.preferred_node = first_node(*nodes);
		if (policy->v.preferred_node >= MAX_NUMNODES)
			policy->v.preferred_node = -1;
		break;
	case MPOL_BIND:
		policy->v.zonelist = bind_zonelist(nodes);
		if (policy->v.zonelist == NULL) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-ENOMEM);
		}
		break;
	}
	policy->policy = mode;
	return policy;
}

/* Ensure all existing pages follow the policy. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		unsigned long pfn;
		unsigned int nid;

		if (!pte_present(*pte))
			continue;
		pfn = pte_pfn(*pte);
		if (!pfn_valid(pfn)) {
			print_bad_pte(vma, *pte, addr);
			continue;
		}
		nid = pfn_to_nid(pfn);
		if (!node_isset(nid, *nodes))
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}

static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		if (check_pte_range(vma, pmd, addr, next, nodes))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (check_pmd_range(vma, pud, addr, next, nodes))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int check_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end, nodemask_t *nodes)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (check_pud_range(vma, pgd, addr, next, nodes))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

/* Step 1: check the range */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
	    nodemask_t *nodes, unsigned long flags)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	if (first->vm_flags & VM_RESERVED)
		return ERR_PTR(-EACCES);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!vma->vm_next && vma->vm_end < end)
			return ERR_PTR(-EFAULT);
		if (prev && prev->vm_end < vma->vm_start)
			return ERR_PTR(-EFAULT);
		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
			unsigned long endvma = vma->vm_end;
			if (endvma > end)
				endvma = end;
			if (vma->vm_start > start)
				start = vma->vm_start;
			err = check_pgd_range(vma, start, endvma, nodes);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_free(old);
	}
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}

static int contextualize_policy(int mode, nodemask_t *nodes)
{
	if (!nodes)
		return 0;

	/* Update current mems_allowed */
	cpuset_update_current_mems_allowed();
	/* Ignore nodes not set in current->mems_allowed */
	cpuset_restrict_to_mems_allowed(nodes->bits);
	return mpol_check_policy(mode, nodes);
}

long do_mbind(unsigned long start, unsigned long len,
	      unsigned long mode, nodemask_t *nmask, unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;

	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
		return -EINVAL;
	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;
	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	if (mpol_check_policy(mode, nmask))
		return -EINVAL;
	new = mpol_new(mode, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
		 mode, nodes_addr(*nmask)[0]);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nmask, flags);
	err = PTR_ERR(vma);
	if (!IS_ERR(vma))
		err = mbind_range(vma, start, end, new);
	up_write(&mm->mmap_sem);
	mpol_free(new);
	return err;
}

/* Set the process memory policy */
long do_set_mempolicy(int mode, nodemask_t *nodes)
{
	struct mempolicy *new;

	if (contextualize_policy(mode, nodes))
		return -EINVAL;
	new = mpol_new(mode, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	if (new && new->policy == MPOL_INTERLEAVE)
		current->il_next = first_node(new->v.nodes);
	return 0;
}

/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
	int i;

	nodes_clear(*nodes);
	switch (p->policy) {
	case MPOL_BIND:
		for (i = 0; p->v.zonelist->zones[i]; i++)
			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
				 *nodes);
		break;
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of online map? */
		if (p->v.preferred_node < 0)
			*nodes = node_online_map;
		else
			node_set(p->v.preferred_node, *nodes);
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Retrieve NUMA policy */
long do_get_mempolicy(int *policy, nodemask_t *nmask,
		      unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
		return -EINVAL;
	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->policy == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		*policy = pol->policy;

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask)
		get_zonemask(pol, nmask);

 out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}
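
/*
 * Worked example (illustration only): on a kernel with MAX_NUMNODES == 64
 * and 64-bit longs, a caller passing maxnode == 65 gets 64 usable bits, so
 * nlongs == 1, endmask == ~0UL, and the whole first long is accepted.
 * With maxnode == 61 only bits 0-59 are usable: endmask becomes
 * (1UL << 60) - 1 and any higher bit in the copied long is simply cleared.
 */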

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			  unsigned long mode,
			  unsigned long __user *nmask, unsigned long maxnode,
			  unsigned flags)
{
	nodemask_t nodes;
	int err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_mbind(start, len, mode, &nodes, flags);
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
				  unsigned long maxnode)
{
	int err;
	nodemask_t nodes;

	if (mode < 0 || mode > MPOL_MAX)
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_set_mempolicy(mode, &nodes);
}

/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				  unsigned long __user *nmask,
				  unsigned long maxnode,
				  unsigned long addr, unsigned long flags)
{
	int err, pval;
	nodemask_t nodes;

	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
					 compat_ulong_t __user *nmask,
					 compat_ulong_t maxnode,
					 compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
					 compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
				 compat_ulong_t mode, compat_ulong_t __user *nmask,
				 compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/* Return effective policy for a VMA */
struct mempolicy *
get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else if (vma->vm_policy &&
			 vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/* Lower zones don't get a policy applied */
		/* Careful: current->mems_allowed might have moved */
		if (gfp_zone(gfp) >= policy_zone)
			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
				return policy->v.zonelist;
		/*FALL THROUGH*/
	case MPOL_INTERLEAVE: /* should not happen */
	case MPOL_DEFAULT:
		nd = numa_node_id();
		break;
	default:
		nd = 0;
		BUG();
	}
	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);
	me->il_next = next;
	return nid;
}

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
			       struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target = (unsigned)off % nnodes;
	int c;
	int nid = -1;

	c = 0;
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
	return nid;
}
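
/*
 * Worked example (illustration only): for pol->v.nodes == {0, 2, 5} and
 * off == 7, nnodes == 3 and target == 7 % 3 == 1, so the loop stops on the
 * second set node and the page at that offset lands on node 2.
 */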

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					  unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zl->zones[0]) {
		zone_pcp(zl->zones[0], get_cpu())->interleave_hit++;
		put_cpu();
	}
	return page;
}

/**
 * alloc_page_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *	%GFP_USER     user allocation.
 *	%GFP_KERNEL   kernel allocations,
 *	%GFP_HIGHMEM  highmem/user allocations,
 *	%GFP_FS       allocation should not call back into a file system.
 *	%GFP_ATOMIC   don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL the caller must hold down_read on the mmap_sem of
 * the mm_struct of the VMA to prevent it from going away. Should be used
 * for all allocations for pages that will be mapped into user space.
 * Returns NULL when no page can be allocated.
 *
 * Should be called with the mmap_sem of the vma held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	cpuset_update_current_mems_allowed();

	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
		unsigned nid;
		if (vma) {
			unsigned long off;
			off = vma->vm_pgoff;
			off += (addr - vma->vm_start) >> PAGE_SHIFT;
			nid = offset_il_node(pol, vma, off);
		} else {
			/* fall back to process interleaving */
			nid = interleave_nodes(pol);
		}
		return alloc_page_interleave(gfp, 0, nid);
	}
	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}
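
/*
 * Usage sketch (not from the original file): a fault handler typically calls
 * this with the faulting VMA while holding its mmap_sem for reading, e.g.
 *
 *	page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 *
 * so the new page is placed according to the VMA policy, or the process
 * policy when the VMA has none.
 */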

/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *	%GFP_USER     user allocation,
 *	%GFP_KERNEL   kernel allocation,
 *	%GFP_HIGHMEM  highmem allocation,
 *	%GFP_FS       don't call back into a file system.
 *	%GFP_ATOMIC   don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool. When not in interrupt
 * context, apply the current process NUMA policy.
 * Returns NULL when no page can be allocated.
 *
 * Don't call cpuset_update_current_mems_allowed() unless
 * 1) it's ok to take cpuset_sem (can WAIT), and
 * 2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if ((gfp & __GFP_WAIT) && !in_interrupt())
		cpuset_update_current_mems_allowed();
	if (!pol || in_interrupt())
		pol = &default_policy;
	if (pol->policy == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);

/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	*new = *old;
	atomic_set(&new->refcnt, 1);
	if (new->policy == MPOL_BIND) {
		int sz = ksize(old->v.zonelist);
		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
		if (!new->v.zonelist) {
			kmem_cache_free(policy_cache, new);
			return ERR_PTR(-ENOMEM);
		}
		memcpy(new->v.zonelist, old->v.zonelist, sz);
	}
	return new;
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return 0;
	if (a->policy != b->policy)
		return 0;
	switch (a->policy) {
	case MPOL_DEFAULT:
		return 1;
	case MPOL_INTERLEAVE:
		return nodes_equal(a->v.nodes, b->v.nodes);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node;
	case MPOL_BIND: {
		int i;
		for (i = 0; a->v.zonelist->zones[i]; i++)
			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
				return 0;
		return b->v.zonelist->zones[i] == NULL;
	}
	default:
		BUG();
		return 0;
	}
}

/* Slow path of a mempolicy destructor */
void __mpol_free(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	if (p->policy == MPOL_BIND)
		kfree(p->v.zonelist);
	p->policy = MPOL_DEFAULT;
	kmem_cache_free(policy_cache, p);
}

/*
 * Hugetlb policy. Same as above, just works with node numbers instead of
 * zonelists.
 */

/* Find first node suitable for an allocation */
int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	switch (pol->policy) {
	case MPOL_DEFAULT:
		return numa_node_id();
	case MPOL_BIND:
		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
	case MPOL_INTERLEAVE:
		return interleave_nodes(pol);
	case MPOL_PREFERRED:
		return pol->v.preferred_node >= 0 ?
				pol->v.preferred_node : numa_node_id();
	}
	BUG();
	return 0;
}

/* Find secondary valid nodes for an allocation */
int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	switch (pol->policy) {
	case MPOL_PREFERRED:
	case MPOL_DEFAULT:
	case MPOL_INTERLEAVE:
		return 1;
	case MPOL_BIND: {
		struct zone **z;
		for (z = pol->v.zonelist->zones; *z; z++)
			if ((*z)->zone_pgdat->node_id == nid)
				return 1;
		return 0;
	}
	default:
		BUG();
		return 0;
	}
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */
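
/*
 * Usage sketch (illustration, not part of the original file): an object such
 * as a tmpfs inode embeds a struct shared_policy, installs ranges from its
 * ->set_policy() callback via mpol_set_shared_policy(), and resolves the
 * policy for a page at fault time with something like
 *
 *	pol = mpol_shared_policy_lookup(&info->policy, pgoff);
 *
 * where 'info' stands for the owner's private data and 'pgoff' for the page
 * index within the object; both names are placeholders here.
 */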

/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}

/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->policy : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	spin_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	spin_unlock(&sp->lock);
	return pol;
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	PDprintk("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_free(n->policy);
	kmem_cache_free(sn_cache, n);
}

struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

	if (!n)
		return NULL;
	n->start = start;
	n->end = end;
	mpol_get(pol);
	n->policy = pol;
	return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n, *new2 = NULL;

restart:
	spin_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!new2) {
					spin_unlock(&sp->lock);
					new2 = sp_alloc(end, n->end, n->policy);
					if (!new2)
						return -ENOMEM;
					goto restart;
				}
				n->end = start;
				sp_insert(sp, new2);
				new2 = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	spin_unlock(&sp->lock);
	if (new2) {
		mpol_free(new2->policy);
		kmem_cache_free(sn_cache, new2);
	}
	return 0;
}

int mpol_set_shared_policy(struct shared_policy *info,
			   struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->policy : -1,
		 npol ? nodes_addr(npol->v.nodes)[0] : -1);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		kmem_cache_free(sn_cache, new);
	return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!p->root.rb_node)
		return;
	spin_lock(&p->lock);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		rb_erase(&n->nd, &p->root);
		mpol_free(n->policy);
		kmem_cache_free(sn_cache, n);
	}
	spin_unlock(&p->lock);
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL, NULL);

	/* Set interleaving policy for system init. This way not all
	   the data structures allocated at system boot end up in node zero. */

	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
		printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, NULL);
}