/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints about which node(s) memory
 * should be allocated on.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave   Allocate memory interleaved over a set of nodes,
 *              with normal fallback if it fails.
 *              For VMA based allocations this interleaves based on the
 *              offset into the backing object or offset into the mapping
 *              for anonymous memory. For process policy a process counter
 *              is used.
 * bind         Only allocate memory on a specific set of nodes,
 *              no fallback.
 * preferred    Try a specific node first before normal fallback.
 *              As a special case node -1 here means do the allocation
 *              on the local CPU. This is normally identical to default,
 *              but useful to set in a VMA when you have a non default
 *              process policy.
 * default      Allocate on the local node first, or when on a VMA
 *              use the process policy. This is what Linux always did
 *              in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
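
/*
 * Illustrative userspace usage (a sketch assuming libnuma's <numaif.h>
 * wrappers for the set_mempolicy(2) and mbind(2) system calls; buf and
 * length are placeholders for an existing mapping):
 *
 *      #include <numaif.h>
 *
 *      unsigned long nodes = (1UL << 0) | (1UL << 1);
 *      set_mempolicy(MPOL_INTERLEAVE, &nodes, sizeof(nodes) * 8);
 *
 *      unsigned long node0 = 1UL << 0;
 *      mbind(buf, length, MPOL_BIND, &node0, sizeof(node0) * 8,
 *            MPOL_MF_STRICT);
 *
 * The first call interleaves future process allocations over nodes 0 and 1;
 * the second restricts an existing mapping to node 0, and MPOL_MF_STRICT
 * makes mbind() report -EIO if already-present pages violate that policy.
 */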

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>

static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
static int policy_zone;

struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .policy = MPOL_DEFAULT,
};

/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
        int empty = nodes_empty(*nodes);

        switch (mode) {
        case MPOL_DEFAULT:
                if (!empty)
                        return -EINVAL;
                break;
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
                /* Preferred will only use the first bit, but allow
                   more for now. */
                if (empty)
                        return -EINVAL;
                break;
        }
        return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
                     unsigned long maxnode, int mode)
{
        unsigned long k;
        unsigned long nlongs;
        unsigned long endmask;

        --maxnode;
        nodes_clear(*nodes);
        if (maxnode == 0 || !nmask)
                return 0;

        nlongs = BITS_TO_LONGS(maxnode);
        if ((maxnode % BITS_PER_LONG) == 0)
                endmask = ~0UL;
        else
                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

        /* When the user specified more nodes than supported just check
           if the unsupported part is all zero. */
        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
                if (nlongs > PAGE_SIZE/sizeof(long))
                        return -EINVAL;
                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
                        unsigned long t;
                        if (get_user(t, nmask + k))
                                return -EFAULT;
                        if (k == nlongs - 1) {
                                if (t & endmask)
                                        return -EINVAL;
                        } else if (t)
                                return -EINVAL;
                }
                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
                endmask = ~0UL;
        }

        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
                return -EFAULT;
        nodes_addr(*nodes)[nlongs-1] &= endmask;
        /* Update current mems_allowed */
        cpuset_update_current_mems_allowed();
        /* Ignore nodes not set in current->mems_allowed */
        /* AK: shouldn't this error out instead? */
        cpuset_restrict_to_mems_allowed(nodes_addr(*nodes));
        return mpol_check_policy(mode, nodes);
}

/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
        struct zonelist *zl;
        int num, max, nd;

        max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
        zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
        if (!zl)
                return NULL;
        num = 0;
        for_each_node_mask(nd, *nodes) {
                int k;
                for (k = MAX_NR_ZONES-1; k >= 0; k--) {
                        struct zone *z = &NODE_DATA(nd)->node_zones[k];
                        if (!z->present_pages)
                                continue;
                        zl->zones[num++] = z;
                        if (k > policy_zone)
                                policy_zone = k;
                }
        }
        zl->zones[num] = NULL;
        return zl;
}

/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
        struct mempolicy *policy;

        PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
        if (mode == MPOL_DEFAULT)
                return NULL;
        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!policy)
                return ERR_PTR(-ENOMEM);
        atomic_set(&policy->refcnt, 1);
        switch (mode) {
        case MPOL_INTERLEAVE:
                policy->v.nodes = *nodes;
                break;
        case MPOL_PREFERRED:
                policy->v.preferred_node = first_node(*nodes);
                if (policy->v.preferred_node >= MAX_NUMNODES)
                        policy->v.preferred_node = -1;
                break;
        case MPOL_BIND:
                policy->v.zonelist = bind_zonelist(nodes);
                if (policy->v.zonelist == NULL) {
                        kmem_cache_free(policy_cache, policy);
                        return ERR_PTR(-ENOMEM);
                }
                break;
        }
        policy->policy = mode;
        return policy;
}

/* Ensure all existing pages follow the policy. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
{
        pte_t *orig_pte;
        pte_t *pte;

        spin_lock(&vma->vm_mm->page_table_lock);
        orig_pte = pte = pte_offset_map(pmd, addr);
        do {
                unsigned long pfn;
                unsigned int nid;

                if (!pte_present(*pte))
                        continue;
                pfn = pte_pfn(*pte);
                if (!pfn_valid(pfn)) {
                        print_bad_pte(vma, *pte, addr);
                        continue;
                }
                nid = pfn_to_nid(pfn);
                if (!node_isset(nid, *nodes))
                        break;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        pte_unmap(orig_pte);
        spin_unlock(&vma->vm_mm->page_table_lock);
        return addr != end;
}

static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
{
        pmd_t *pmd;
        unsigned long next;

        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                        continue;
                if (check_pte_range(vma, pmd, addr, next, nodes))
                        return -EIO;
        } while (pmd++, addr = next, addr != end);
        return 0;
}

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
{
        pud_t *pud;
        unsigned long next;

        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
                if (check_pmd_range(vma, pud, addr, next, nodes))
                        return -EIO;
        } while (pud++, addr = next, addr != end);
        return 0;
}

static inline int check_pgd_range(struct vm_area_struct *vma,
                unsigned long addr, unsigned long end, nodemask_t *nodes)
{
        pgd_t *pgd;
        unsigned long next;

        pgd = pgd_offset(vma->vm_mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                if (check_pud_range(vma, pgd, addr, next, nodes))
                        return -EIO;
        } while (pgd++, addr = next, addr != end);
        return 0;
}

/* Step 1: check the range */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
            nodemask_t *nodes, unsigned long flags)
{
        int err;
        struct vm_area_struct *first, *vma, *prev;

        first = find_vma(mm, start);
        if (!first)
                return ERR_PTR(-EFAULT);
        if (first->vm_flags & VM_RESERVED)
                return ERR_PTR(-EACCES);
        prev = NULL;
        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
                if (!vma->vm_next && vma->vm_end < end)
                        return ERR_PTR(-EFAULT);
                if (prev && prev->vm_end < vma->vm_start)
                        return ERR_PTR(-EFAULT);
                if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
                        unsigned long endvma = vma->vm_end;
                        if (endvma > end)
                                endvma = end;
                        if (vma->vm_start > start)
                                start = vma->vm_start;
                        err = check_pgd_range(vma, start, endvma, nodes);
                        if (err) {
                                first = ERR_PTR(err);
                                break;
                        }
                }
                prev = vma;
        }
        return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
        int err = 0;
        struct mempolicy *old = vma->vm_policy;

        PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
                 vma->vm_ops, vma->vm_file,
                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

        if (vma->vm_ops && vma->vm_ops->set_policy)
                err = vma->vm_ops->set_policy(vma, new);
        if (!err) {
                mpol_get(new);
                vma->vm_policy = new;
                mpol_free(old);
        }
        return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
                       unsigned long end, struct mempolicy *new)
{
        struct vm_area_struct *next;
        int err;

        err = 0;
        for (; vma && vma->vm_start < end; vma = next) {
                next = vma->vm_next;
                if (vma->vm_start < start)
                        err = split_vma(vma->vm_mm, vma, start, 1);
                if (!err && vma->vm_end > end)
                        err = split_vma(vma->vm_mm, vma, end, 0);
                if (!err)
                        err = policy_vma(vma, new);
                if (err)
                        break;
        }
        return err;
}

/* Change policy for a memory range */
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
                          unsigned long mode,
                          unsigned long __user *nmask, unsigned long maxnode,
                          unsigned flags)
{
        struct vm_area_struct *vma;
        struct mm_struct *mm = current->mm;
        struct mempolicy *new;
        unsigned long end;
        nodemask_t nodes;
        int err;

        if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
                return -EINVAL;
        if (start & ~PAGE_MASK)
                return -EINVAL;
        if (mode == MPOL_DEFAULT)
                flags &= ~MPOL_MF_STRICT;
        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
        end = start + len;
        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;

        err = get_nodes(&nodes, nmask, maxnode, mode);
        if (err)
                return err;

        new = mpol_new(mode, &nodes);
        if (IS_ERR(new))
                return PTR_ERR(new);

        PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
                 mode, nodes_addr(nodes)[0]);

        down_write(&mm->mmap_sem);
        vma = check_range(mm, start, end, &nodes, flags);
        err = PTR_ERR(vma);
        if (!IS_ERR(vma))
                err = mbind_range(vma, start, end, new);
        up_write(&mm->mmap_sem);
        mpol_free(new);
        return err;
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
                                  unsigned long maxnode)
{
        int err;
        struct mempolicy *new;
        nodemask_t nodes;

        if (mode < 0 || mode > MPOL_MAX)
                return -EINVAL;
        err = get_nodes(&nodes, nmask, maxnode, mode);
        if (err)
                return err;
        new = mpol_new(mode, &nodes);
        if (IS_ERR(new))
                return PTR_ERR(new);
        mpol_free(current->mempolicy);
        current->mempolicy = new;
        if (new && new->policy == MPOL_INTERLEAVE)
                current->il_next = first_node(new->v.nodes);
        return 0;
}

/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
        int i;

        nodes_clear(*nodes);
        switch (p->policy) {
        case MPOL_BIND:
                for (i = 0; p->v.zonelist->zones[i]; i++)
                        node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes);
                break;
        case MPOL_DEFAULT:
                break;
        case MPOL_INTERLEAVE:
                *nodes = p->v.nodes;
                break;
        case MPOL_PREFERRED:
                /* or use current node instead of online map? */
                if (p->v.preferred_node < 0)
                        *nodes = node_online_map;
                else
                        node_set(p->v.preferred_node, *nodes);
                break;
        default:
                BUG();
        }
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
        struct page *p;
        int err;

        err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
        if (err >= 0) {
                err = page_to_nid(p);
                put_page(p);
        }
        return err;
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
                              nodemask_t *nodes)
{
        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

        if (copy > nbytes) {
                if (copy > PAGE_SIZE)
                        return -EINVAL;
                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
                        return -EFAULT;
                copy = nbytes;
        }
        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
                                  unsigned long __user *nmask,
                                  unsigned long maxnode,
                                  unsigned long addr, unsigned long flags)
{
        int err, pval;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        struct mempolicy *pol = current->mempolicy;

        if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
                return -EINVAL;
        if (nmask != NULL && maxnode < MAX_NUMNODES)
                return -EINVAL;
        if (flags & MPOL_F_ADDR) {
                down_read(&mm->mmap_sem);
                vma = find_vma_intersection(mm, addr, addr+1);
                if (!vma) {
                        up_read(&mm->mmap_sem);
                        return -EFAULT;
                }
                if (vma->vm_ops && vma->vm_ops->get_policy)
                        pol = vma->vm_ops->get_policy(vma, addr);
                else
                        pol = vma->vm_policy;
        } else if (addr)
                return -EINVAL;

        if (!pol)
                pol = &default_policy;

        if (flags & MPOL_F_NODE) {
                if (flags & MPOL_F_ADDR) {
                        err = lookup_node(mm, addr);
                        if (err < 0)
                                goto out;
                        pval = err;
                } else if (pol == current->mempolicy &&
                           pol->policy == MPOL_INTERLEAVE) {
                        pval = current->il_next;
                } else {
                        err = -EINVAL;
                        goto out;
                }
        } else
                pval = pol->policy;

        if (vma) {
                up_read(&current->mm->mmap_sem);
                vma = NULL;
        }

        if (policy && put_user(pval, policy))
                return -EFAULT;

        err = 0;
        if (nmask) {
                nodemask_t nodes;
                get_zonemask(pol, &nodes);
                err = copy_nodes_to_user(nmask, maxnode, &nodes);
        }

 out:
        if (vma)
                up_read(&current->mm->mmap_sem);
        return err;
}

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
                                         compat_ulong_t __user *nmask,
                                         compat_ulong_t maxnode,
                                         compat_ulong_t addr, compat_ulong_t flags)
{
        long err;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        DECLARE_BITMAP(bm, MAX_NUMNODES);

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask)
                nm = compat_alloc_user_space(alloc_size);

        err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

        if (!err && nmask) {
                err = copy_from_user(bm, nm, alloc_size);
                /* ensure entire bitmap is zeroed */
                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
                err |= compat_put_bitmap(nmask, bm, nr_bits);
        }

        return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
                                         compat_ulong_t maxnode)
{
        long err = 0;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        DECLARE_BITMAP(bm, MAX_NUMNODES);

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask) {
                err = compat_get_bitmap(bm, nmask, nr_bits);
                nm = compat_alloc_user_space(alloc_size);
                err |= copy_to_user(nm, bm, alloc_size);
        }

        if (err)
                return -EFAULT;

        return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
                                 compat_ulong_t mode, compat_ulong_t __user *nmask,
                                 compat_ulong_t maxnode, compat_ulong_t flags)
{
        long err = 0;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        nodemask_t bm;

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask) {
                err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
                nm = compat_alloc_user_space(alloc_size);
                err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
        }

        if (err)
                return -EFAULT;

        return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/* Return effective policy for a VMA */
struct mempolicy *
get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = task->mempolicy;

        if (vma) {
                if (vma->vm_ops && vma->vm_ops->get_policy)
                        pol = vma->vm_ops->get_policy(vma, addr);
                else if (vma->vm_policy &&
                         vma->vm_policy->policy != MPOL_DEFAULT)
                        pol = vma->vm_policy;
        }
        if (!pol)
                pol = &default_policy;
        return pol;
}

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
        int nd;

        switch (policy->policy) {
        case MPOL_PREFERRED:
                nd = policy->v.preferred_node;
                if (nd < 0)
                        nd = numa_node_id();
                break;
        case MPOL_BIND:
                /* Lower zones don't get a policy applied */
                /* Careful: current->mems_allowed might have moved */
                if (gfp_zone(gfp) >= policy_zone)
                        if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
                                return policy->v.zonelist;
                /*FALL THROUGH*/
        case MPOL_INTERLEAVE: /* should not happen */
        case MPOL_DEFAULT:
                nd = numa_node_id();
                break;
        default:
                nd = 0;
                BUG();
        }
        return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
        unsigned nid, next;
        struct task_struct *me = current;

        nid = me->il_next;
        next = next_node(nid, policy->v.nodes);
        if (next >= MAX_NUMNODES)
                next = first_node(policy->v.nodes);
        me->il_next = next;
        return nid;
}

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
                struct vm_area_struct *vma, unsigned long off)
{
        unsigned nnodes = nodes_weight(pol->v.nodes);
        unsigned target = (unsigned)off % nnodes;
        int c;
        int nid = -1;

        c = 0;
        do {
                nid = next_node(nid, pol->v.nodes);
                c++;
        } while (c <= target);
        return nid;
}

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
                                          unsigned nid)
{
        struct zonelist *zl;
        struct page *page;

        zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
        page = __alloc_pages(gfp, order, zl);
        if (page && page_zone(page) == zl->zones[0]) {
                zone_pcp(zl->zones[0], get_cpu())->interleave_hit++;
                put_cpu();
        }
        return page;
}

/**
 * alloc_page_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *      %GFP_USER    user allocation.
 *      %GFP_KERNEL  kernel allocation.
 *      %GFP_HIGHMEM highmem/user allocation.
 *      %GFP_FS      allocation should not call back into a file system.
 *      %GFP_ATOMIC  don't sleep.
 *
 * @vma: Pointer to VMA or NULL if not available.
 * @addr: Virtual address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * the NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL, the caller must hold down_read on the mmap_sem of
 * the VMA's mm_struct to prevent it from going away. Should be used for
 * all allocations of pages that will be mapped into user space.
 * Returns NULL when no page can be allocated.
 *
 * Should be called with the mmap_sem of the vma held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(current, vma, addr);

        cpuset_update_current_mems_allowed();

        if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
                unsigned nid;
                if (vma) {
                        unsigned long off;
                        off = vma->vm_pgoff;
                        off += (addr - vma->vm_start) >> PAGE_SHIFT;
                        nid = offset_il_node(pol, vma, off);
                } else {
                        /* fall back to process interleaving */
                        nid = interleave_nodes(pol);
                }
                return alloc_page_interleave(gfp, 0, nid);
        }
        return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}

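/*
 * Typical caller pattern (an illustrative sketch, not taken from this file):
 * a fault handler that already holds down_read(&mm->mmap_sem) allocates
 * against the faulting vma/address pair so the VMA or process policy is
 * honoured:
 *
 *      struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *
 *      if (!page)
 *              return VM_FAULT_OOM;
 */
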
/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *      %GFP_USER    user allocation.
 *      %GFP_KERNEL  kernel allocation.
 *      %GFP_HIGHMEM highmem allocation.
 *      %GFP_FS      don't call back into a file system.
 *      %GFP_ATOMIC  don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool. When not in interrupt
 * context, apply the current process' NUMA policy.
 * Returns NULL when no page can be allocated.
 *
 * Don't call cpuset_update_current_mems_allowed() unless
 * 1) it's ok to take cpuset_sem (can WAIT), and
 * 2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
        struct mempolicy *pol = current->mempolicy;

        if ((gfp & __GFP_WAIT) && !in_interrupt())
                cpuset_update_current_mems_allowed();
        if (!pol || in_interrupt())
                pol = &default_policy;
        if (pol->policy == MPOL_INTERLEAVE)
                return alloc_page_interleave(gfp, order, interleave_nodes(pol));
        return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);

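/*
 * Usage note (illustrative): with CONFIG_NUMA the generic alloc_pages()
 * and alloc_page() helpers in <linux/gfp.h> resolve to
 * alloc_pages_current() when the caller does not force a node, so an
 * ordinary allocation such as
 *
 *      struct page *page = alloc_page(GFP_KERNEL);
 *
 * already honours the calling process' policy set via set_mempolicy(2).
 */
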
/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

        if (!new)
                return ERR_PTR(-ENOMEM);
        *new = *old;
        atomic_set(&new->refcnt, 1);
        if (new->policy == MPOL_BIND) {
                int sz = ksize(old->v.zonelist);
                new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
                if (!new->v.zonelist) {
                        kmem_cache_free(policy_cache, new);
                        return ERR_PTR(-ENOMEM);
                }
                memcpy(new->v.zonelist, old->v.zonelist, sz);
        }
        return new;
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        if (!a || !b)
                return 0;
        if (a->policy != b->policy)
                return 0;
        switch (a->policy) {
        case MPOL_DEFAULT:
                return 1;
        case MPOL_INTERLEAVE:
                return nodes_equal(a->v.nodes, b->v.nodes);
        case MPOL_PREFERRED:
                return a->v.preferred_node == b->v.preferred_node;
        case MPOL_BIND: {
                int i;
                for (i = 0; a->v.zonelist->zones[i]; i++)
                        if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
                                return 0;
                return b->v.zonelist->zones[i] == NULL;
        }
        default:
                BUG();
                return 0;
        }
}

/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
        if (!atomic_dec_and_test(&p->refcnt))
                return;
        if (p->policy == MPOL_BIND)
                kfree(p->v.zonelist);
        p->policy = MPOL_DEFAULT;
        kmem_cache_free(policy_cache, p);
}

/*
 * Hugetlb policy. Same as above, just works with node numbers instead of
 * zonelists.
 */

/* Find first node suitable for an allocation */
int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(current, vma, addr);

        switch (pol->policy) {
        case MPOL_DEFAULT:
                return numa_node_id();
        case MPOL_BIND:
                return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
        case MPOL_INTERLEAVE:
                return interleave_nodes(pol);
        case MPOL_PREFERRED:
                return pol->v.preferred_node >= 0 ?
                                pol->v.preferred_node : numa_node_id();
        }
        BUG();
        return 0;
}

/* Find secondary valid nodes for an allocation */
int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(current, vma, addr);

        switch (pol->policy) {
        case MPOL_PREFERRED:
        case MPOL_DEFAULT:
        case MPOL_INTERLEAVE:
                return 1;
        case MPOL_BIND: {
                struct zone **z;
                for (z = pol->v.zonelist->zones; *z; z++)
                        if ((*z)->zone_pgdat->node_id == nid)
                                return 1;
                return 0;
        }
        default:
                BUG();
                return 0;
        }
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */

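/*
 * Illustrative consumer (a sketch modelled on how mm/shmem.c wires its
 * vm_operations_struct into this interface; treat the exact helper and
 * field names as an example rather than a reference):
 *
 *      static int shmem_set_policy(struct vm_area_struct *vma,
 *                                  struct mempolicy *new)
 *      {
 *              struct inode *inode = vma->vm_file->f_dentry->d_inode;
 *
 *              return mpol_set_shared_policy(&SHMEM_I(inode)->policy,
 *                                            vma, new);
 *      }
 *
 * The matching get_policy hook looks the policy up by page index with
 * mpol_shared_policy_lookup().
 */
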
/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
        struct rb_node *n = sp->root.rb_node;

        while (n) {
                struct sp_node *p = rb_entry(n, struct sp_node, nd);

                if (start >= p->end)
                        n = n->rb_right;
                else if (end <= p->start)
                        n = n->rb_left;
                else
                        break;
        }
        if (!n)
                return NULL;
        for (;;) {
                struct sp_node *w = NULL;
                struct rb_node *prev = rb_prev(n);
                if (!prev)
                        break;
                w = rb_entry(prev, struct sp_node, nd);
                if (w->end <= start)
                        break;
                n = prev;
        }
        return rb_entry(n, struct sp_node, nd);
}

/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
        struct rb_node **p = &sp->root.rb_node;
        struct rb_node *parent = NULL;
        struct sp_node *nd;

        while (*p) {
                parent = *p;
                nd = rb_entry(parent, struct sp_node, nd);
                if (new->start < nd->start)
                        p = &(*p)->rb_left;
                else if (new->end > nd->end)
                        p = &(*p)->rb_right;
                else
                        BUG();
        }
        rb_link_node(&new->nd, parent, p);
        rb_insert_color(&new->nd, &sp->root);
        PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
                 new->policy ? new->policy->policy : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
        struct mempolicy *pol = NULL;
        struct sp_node *sn;

        if (!sp->root.rb_node)
                return NULL;
        spin_lock(&sp->lock);
        sn = sp_lookup(sp, idx, idx+1);
        if (sn) {
                mpol_get(sn->policy);
                pol = sn->policy;
        }
        spin_unlock(&sp->lock);
        return pol;
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
        PDprintk("deleting %lx-%lx\n", n->start, n->end);
        rb_erase(&n->nd, &sp->root);
        mpol_free(n->policy);
        kmem_cache_free(sn_cache, n);
}

struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
        struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

        if (!n)
                return NULL;
        n->start = start;
        n->end = end;
        mpol_get(pol);
        n->policy = pol;
        return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                 unsigned long end, struct sp_node *new)
{
        struct sp_node *n, *new2 = NULL;

restart:
        spin_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        if (n->end <= end)
                                sp_delete(sp, n);
                        else
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                if (!new2) {
                                        spin_unlock(&sp->lock);
                                        new2 = sp_alloc(end, n->end, n->policy);
                                        if (!new2)
                                                return -ENOMEM;
                                        goto restart;
                                }
                                n->end = start;
                                sp_insert(sp, new2);
                                new2 = NULL;
                                break;
                        } else
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        spin_unlock(&sp->lock);
        if (new2) {
                mpol_free(new2->policy);
                kmem_cache_free(sn_cache, new2);
        }
        return 0;
}

int mpol_set_shared_policy(struct shared_policy *info,
                           struct vm_area_struct *vma, struct mempolicy *npol)
{
        int err;
        struct sp_node *new = NULL;
        unsigned long sz = vma_pages(vma);

        PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
                 vma->vm_pgoff,
                 sz, npol ? npol->policy : -1,
                 npol ? nodes_addr(npol->v.nodes)[0] : -1);

        if (npol) {
                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
                if (!new)
                        return -ENOMEM;
        }
        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
        if (err && new)
                kmem_cache_free(sn_cache, new);
        return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
        struct sp_node *n;
        struct rb_node *next;

        if (!p->root.rb_node)
                return;
        spin_lock(&p->lock);
        next = rb_first(&p->root);
        while (next) {
                n = rb_entry(next, struct sp_node, nd);
                next = rb_next(&n->nd);
                rb_erase(&n->nd, &p->root);
                mpol_free(n->policy);
                kmem_cache_free(sn_cache, n);
        }
        spin_unlock(&p->lock);
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
        policy_cache = kmem_cache_create("numa_policy",
                                         sizeof(struct mempolicy),
                                         0, SLAB_PANIC, NULL, NULL);

        sn_cache = kmem_cache_create("shared_policy_node",
                                     sizeof(struct sp_node),
                                     0, SLAB_PANIC, NULL, NULL);

        /* Set interleaving policy for system init. This way not all
           the data structures allocated at system boot end up in node zero. */

        if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
                              MAX_NUMNODES) < 0)
                printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default.
 * Assumes fs == KERNEL_DS */
void numa_default_policy(void)
{
        sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
}