mm/mempolicy.c - [PATCH] mempolicies: private pointer in check_range and MPOL_MF_INVERT
1/*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
8bccd85f 5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2.
7 *
 8 * NUMA policy allows the user to give hints about which node(s) memory should
 9 * be allocated on.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
 19 * for anonymous memory. For process policy a process counter
20 * is used.
8bccd85f 21 *
22 * bind Only allocate memory on a specific set of nodes,
23 * no fallback.
24 * FIXME: memory is allocated starting with the first node
25 * to the last. It would be better if bind would truly restrict
26 * the allocation to memory nodes instead
27 *
28 * preferred Try a specific node first before normal fallback.
29 * As a special case node -1 here means do the allocation
30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default
32 * process policy.
8bccd85f 33 *
34 * default Allocate on the local node first, or when on a VMA
35 * use the process policy. This is what Linux always did
36 * in a NUMA aware kernel and still does by, ahem, default.
37 *
38 * The process policy is applied for most non interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
42 *
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that
 49 * on systems with highmem kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
54 */
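/*
 * Illustrative only, not part of this file: a minimal user-space sketch of
 * how these policies are selected, using the set_mempolicy() and mbind()
 * wrappers from libnuma's <numaif.h> (assumed installed; the node numbers
 * and sizes below are made up):
 *
 *	unsigned long both = (1UL << 0) | (1UL << 1);
 *	unsigned long node0 = 1UL << 0;
 *	size_t len = 1 << 20;
 *	void *p;
 *
 *	// Interleave all future allocations of this task across nodes 0 and 1.
 *	set_mempolicy(MPOL_INTERLEAVE, &both, sizeof(both) * 8);
 *
 *	// Restrict one anonymous mapping to node 0; MPOL_MF_STRICT makes the
 *	// call fail if already-present pages sit on other nodes.
 *	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	mbind(p, len, MPOL_BIND, &node0, sizeof(node0) * 8, MPOL_MF_STRICT);
 */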
55
56/* Notebook:
57 fix mmap readahead to honour policy and enable policy for any page cache
58 object
59 statistics for bigpages
60 global policy for page cache? currently it uses process policy. Requires
61 first item above.
62 handle mremap for shared memory (currently ignored for the policy)
63 grows down?
64 make bind policy root only? It can trigger oom much faster and the
65 kernel is not always grateful with that.
66 could replace all the switch()es with a mempolicy_ops structure.
67*/
68
69#include <linux/mempolicy.h>
70#include <linux/mm.h>
71#include <linux/highmem.h>
72#include <linux/hugetlb.h>
73#include <linux/kernel.h>
74#include <linux/sched.h>
76#include <linux/nodemask.h>
77#include <linux/cpuset.h>
78#include <linux/gfp.h>
79#include <linux/slab.h>
80#include <linux/string.h>
81#include <linux/module.h>
82#include <linux/interrupt.h>
83#include <linux/init.h>
84#include <linux/compat.h>
86#include <linux/swap.h>
87
88#include <asm/tlbflush.h>
89#include <asm/uaccess.h>
90
38e35860 91/* Internal flags */
dc9aa5b9 92#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
38e35860 93#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
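/*
 * How the callers below use these internal flags: do_mbind() ORs
 * MPOL_MF_INVERT into the flags it passes to check_range(), so the page
 * table walk collects pages that are *not* on the requested nodes into the
 * private pagelist, while do_migrate_pages() passes MPOL_MF_DISCONTIG_OK
 * because it scans the whole address space and holes between VMAs are
 * expected there.
 */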
dc9aa5b9 94
95static kmem_cache_t *policy_cache;
96static kmem_cache_t *sn_cache;
97
98#define PDprintk(fmt...)
99
 100/* Highest zone. A specific allocation for a zone below that is not
101 policied. */
4be38e35 102int policy_zone = ZONE_DMA;
1da177e4 103
d42c6997 104struct mempolicy default_policy = {
105 .refcnt = ATOMIC_INIT(1), /* never free it */
106 .policy = MPOL_DEFAULT,
107};
108
1da177e4 109/* Do sanity checking on a policy */
dfcd3c0d 110static int mpol_check_policy(int mode, nodemask_t *nodes)
1da177e4 111{
dfcd3c0d 112 int empty = nodes_empty(*nodes);
113
114 switch (mode) {
115 case MPOL_DEFAULT:
116 if (!empty)
117 return -EINVAL;
118 break;
119 case MPOL_BIND:
120 case MPOL_INTERLEAVE:
121 /* Preferred will only use the first bit, but allow
122 more for now. */
123 if (empty)
124 return -EINVAL;
125 break;
126 }
dfcd3c0d 127 return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
1da177e4 128}
1da177e4 129/* Generate a custom zonelist for the BIND policy. */
dfcd3c0d 130static struct zonelist *bind_zonelist(nodemask_t *nodes)
131{
132 struct zonelist *zl;
133 int num, max, nd;
134
dfcd3c0d 135 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
136 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
137 if (!zl)
138 return NULL;
139 num = 0;
140 for_each_node_mask(nd, *nodes)
141 zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
142 zl->zones[num] = NULL;
143 return zl;
144}
145
146/* Create a new policy */
dfcd3c0d 147static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
148{
149 struct mempolicy *policy;
150
dfcd3c0d 151 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
152 if (mode == MPOL_DEFAULT)
153 return NULL;
154 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
155 if (!policy)
156 return ERR_PTR(-ENOMEM);
157 atomic_set(&policy->refcnt, 1);
158 switch (mode) {
159 case MPOL_INTERLEAVE:
dfcd3c0d 160 policy->v.nodes = *nodes;
161 if (nodes_weight(*nodes) == 0) {
162 kmem_cache_free(policy_cache, policy);
163 return ERR_PTR(-EINVAL);
164 }
165 break;
166 case MPOL_PREFERRED:
dfcd3c0d 167 policy->v.preferred_node = first_node(*nodes);
168 if (policy->v.preferred_node >= MAX_NUMNODES)
169 policy->v.preferred_node = -1;
170 break;
171 case MPOL_BIND:
172 policy->v.zonelist = bind_zonelist(nodes);
173 if (policy->v.zonelist == NULL) {
174 kmem_cache_free(policy_cache, policy);
175 return ERR_PTR(-ENOMEM);
176 }
177 break;
178 }
179 policy->policy = mode;
180 return policy;
181}
182
183/* Check if we are the only process mapping the page in question */
184static inline int single_mm_mapping(struct mm_struct *mm,
185 struct address_space *mapping)
186{
187 struct vm_area_struct *vma;
188 struct prio_tree_iter iter;
189 int rc = 1;
190
191 spin_lock(&mapping->i_mmap_lock);
192 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
193 if (mm != vma->vm_mm) {
194 rc = 0;
195 goto out;
196 }
197 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
198 if (mm != vma->vm_mm) {
199 rc = 0;
200 goto out;
201 }
202out:
203 spin_unlock(&mapping->i_mmap_lock);
204 return rc;
205}
206
207/*
208 * Add a page to be migrated to the pagelist
209 */
210static void migrate_page_add(struct vm_area_struct *vma,
211 struct page *page, struct list_head *pagelist, unsigned long flags)
212{
213 /*
214 * Avoid migrating a page that is shared by others and not writable.
215 */
216 if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
217 mapping_writably_mapped(page->mapping) ||
218 single_mm_mapping(vma->vm_mm, page->mapping)) {
219 int rc = isolate_lru_page(page);
220
221 if (rc == 1)
222 list_add(&page->lru, pagelist);
223 /*
224 * If the isolate attempt was not successful then we just
225 * encountered an unswappable page. Something must be wrong.
226 */
227 WARN_ON(rc == 0);
228 }
229}
230
38e35860 231/* Scan through pages checking if pages follow certain conditions. */
b5810039 232static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
233 unsigned long addr, unsigned long end,
234 const nodemask_t *nodes, unsigned long flags,
38e35860 235 void *private)
1da177e4 236{
237 pte_t *orig_pte;
238 pte_t *pte;
705e87c0 239 spinlock_t *ptl;
941150a3 240
705e87c0 241 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
91612e0d 242 do {
6aab341e 243 struct page *page;
244 unsigned int nid;
245
246 if (!pte_present(*pte))
1da177e4 247 continue;
248 page = vm_normal_page(vma, addr, *pte);
249 if (!page)
1da177e4 250 continue;
6aab341e 251 nid = page_to_nid(page);
252 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
253 continue;
254
255 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
256 migrate_page_add(vma, page, private, flags);
257 else
258 break;
91612e0d 259 } while (pte++, addr += PAGE_SIZE, addr != end);
705e87c0 260 pte_unmap_unlock(orig_pte, ptl);
261 return addr != end;
262}
263
b5810039 264static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
265 unsigned long addr, unsigned long end,
266 const nodemask_t *nodes, unsigned long flags,
38e35860 267 void *private)
268{
269 pmd_t *pmd;
270 unsigned long next;
271
272 pmd = pmd_offset(pud, addr);
273 do {
274 next = pmd_addr_end(addr, end);
275 if (pmd_none_or_clear_bad(pmd))
276 continue;
dc9aa5b9 277 if (check_pte_range(vma, pmd, addr, next, nodes,
38e35860 278 flags, private))
279 return -EIO;
280 } while (pmd++, addr = next, addr != end);
281 return 0;
282}
283
b5810039 284static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
285 unsigned long addr, unsigned long end,
286 const nodemask_t *nodes, unsigned long flags,
38e35860 287 void *private)
288{
289 pud_t *pud;
290 unsigned long next;
291
292 pud = pud_offset(pgd, addr);
293 do {
294 next = pud_addr_end(addr, end);
295 if (pud_none_or_clear_bad(pud))
296 continue;
dc9aa5b9 297 if (check_pmd_range(vma, pud, addr, next, nodes,
38e35860 298 flags, private))
299 return -EIO;
300 } while (pud++, addr = next, addr != end);
301 return 0;
302}
303
b5810039 304static inline int check_pgd_range(struct vm_area_struct *vma,
305 unsigned long addr, unsigned long end,
306 const nodemask_t *nodes, unsigned long flags,
38e35860 307 void *private)
308{
309 pgd_t *pgd;
310 unsigned long next;
311
b5810039 312 pgd = pgd_offset(vma->vm_mm, addr);
313 do {
314 next = pgd_addr_end(addr, end);
315 if (pgd_none_or_clear_bad(pgd))
316 continue;
dc9aa5b9 317 if (check_pud_range(vma, pgd, addr, next, nodes,
38e35860 318 flags, private))
319 return -EIO;
320 } while (pgd++, addr = next, addr != end);
321 return 0;
322}
323
324/* Check if a vma is migratable */
325static inline int vma_migratable(struct vm_area_struct *vma)
326{
327 if (vma->vm_flags & (
328 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
329 return 0;
330 return 1;
331}
332
333/*
334 * Check if all pages in a range are on a set of nodes.
335 * If pagelist != NULL then isolate pages from the LRU and
336 * put them on the pagelist.
337 */
338static struct vm_area_struct *
339check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
38e35860 340 const nodemask_t *nodes, unsigned long flags, void *private)
341{
342 int err;
343 struct vm_area_struct *first, *vma, *prev;
344
345 first = find_vma(mm, start);
346 if (!first)
347 return ERR_PTR(-EFAULT);
348 prev = NULL;
349 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
350 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
351 if (!vma->vm_next && vma->vm_end < end)
352 return ERR_PTR(-EFAULT);
353 if (prev && prev->vm_end < vma->vm_start)
354 return ERR_PTR(-EFAULT);
355 }
356 if (!is_vm_hugetlb_page(vma) &&
357 ((flags & MPOL_MF_STRICT) ||
358 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
359 vma_migratable(vma)))) {
5b952b3c 360 unsigned long endvma = vma->vm_end;
dc9aa5b9 361
362 if (endvma > end)
363 endvma = end;
364 if (vma->vm_start > start)
365 start = vma->vm_start;
dc9aa5b9 366 err = check_pgd_range(vma, start, endvma, nodes,
38e35860 367 flags, private);
368 if (err) {
369 first = ERR_PTR(err);
370 break;
371 }
372 }
373 prev = vma;
374 }
375 return first;
376}
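/*
 * Note on the private argument: check_range() and the page table walkers
 * above only pass it through, so its meaning is up to the caller. In this
 * file it is the list_head that migrate_page_add() fills with pages to be
 * migrated, but nothing in the walk itself depends on that.
 */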
377
378/* Apply policy to a single VMA */
379static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
380{
381 int err = 0;
382 struct mempolicy *old = vma->vm_policy;
383
384 PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
385 vma->vm_start, vma->vm_end, vma->vm_pgoff,
386 vma->vm_ops, vma->vm_file,
387 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
388
389 if (vma->vm_ops && vma->vm_ops->set_policy)
390 err = vma->vm_ops->set_policy(vma, new);
391 if (!err) {
392 mpol_get(new);
393 vma->vm_policy = new;
394 mpol_free(old);
395 }
396 return err;
397}
398
399/* Step 2: apply policy to a range and do splits. */
400static int mbind_range(struct vm_area_struct *vma, unsigned long start,
401 unsigned long end, struct mempolicy *new)
402{
403 struct vm_area_struct *next;
404 int err;
405
406 err = 0;
407 for (; vma && vma->vm_start < end; vma = next) {
408 next = vma->vm_next;
409 if (vma->vm_start < start)
410 err = split_vma(vma->vm_mm, vma, start, 1);
411 if (!err && vma->vm_end > end)
412 err = split_vma(vma->vm_mm, vma, end, 0);
413 if (!err)
414 err = policy_vma(vma, new);
415 if (err)
416 break;
417 }
418 return err;
419}
420
421static int contextualize_policy(int mode, nodemask_t *nodes)
422{
423 if (!nodes)
424 return 0;
425
426 /* Update current mems_allowed */
427 cpuset_update_current_mems_allowed();
428 /* Ignore nodes not set in current->mems_allowed */
429 cpuset_restrict_to_mems_allowed(nodes->bits);
430 return mpol_check_policy(mode, nodes);
431}
432
433static int swap_pages(struct list_head *pagelist)
434{
435 LIST_HEAD(moved);
436 LIST_HEAD(failed);
437 int n;
438
439 n = migrate_pages(pagelist, NULL, &moved, &failed);
440 putback_lru_pages(&failed);
441 putback_lru_pages(&moved);
442
443 return n;
444}
445
446long do_mbind(unsigned long start, unsigned long len,
447 unsigned long mode, nodemask_t *nmask, unsigned long flags)
448{
449 struct vm_area_struct *vma;
450 struct mm_struct *mm = current->mm;
451 struct mempolicy *new;
452 unsigned long end;
1da177e4 453 int err;
dc9aa5b9 454 LIST_HEAD(pagelist);
1da177e4 455
456 if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
457 MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
dc9aa5b9 458 || mode > MPOL_MAX)
1da177e4 459 return -EINVAL;
460 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
461 return -EPERM;
462
463 if (start & ~PAGE_MASK)
464 return -EINVAL;
dc9aa5b9 465
466 if (mode == MPOL_DEFAULT)
467 flags &= ~MPOL_MF_STRICT;
dc9aa5b9 468
469 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
470 end = start + len;
dc9aa5b9 471
472 if (end < start)
473 return -EINVAL;
474 if (end == start)
475 return 0;
dc9aa5b9 476
5fcbb230 477 if (mpol_check_policy(mode, nmask))
8bccd85f 478 return -EINVAL;
dc9aa5b9 479
8bccd85f 480 new = mpol_new(mode, nmask);
481 if (IS_ERR(new))
482 return PTR_ERR(new);
483
484 /*
485 * If we are using the default policy then operation
486 * on discontinuous address spaces is okay after all
487 */
488 if (!new)
489 flags |= MPOL_MF_DISCONTIG_OK;
490
1da177e4 491 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
dfcd3c0d 492 mode,nodes_addr(nodes)[0]);
493
494 down_write(&mm->mmap_sem);
495 vma = check_range(mm, start, end, nmask,
496 flags | MPOL_MF_INVERT, &pagelist);
497
1da177e4 498 err = PTR_ERR(vma);
dc9aa5b9 499 if (!IS_ERR(vma)) {
500 int nr_failed = 0;
501
1da177e4 502 err = mbind_range(vma, start, end, new);
dc9aa5b9 503 if (!list_empty(&pagelist))
504 nr_failed = swap_pages(&pagelist);
505
506 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
507 err = -EIO;
508 }
509 if (!list_empty(&pagelist))
510 putback_lru_pages(&pagelist);
511
512 up_write(&mm->mmap_sem);
513 mpol_free(new);
514 return err;
515}
516
517/* Set the process memory policy */
8bccd85f 518long do_set_mempolicy(int mode, nodemask_t *nodes)
1da177e4 519{
1da177e4 520 struct mempolicy *new;
1da177e4 521
8bccd85f 522 if (contextualize_policy(mode, nodes))
1da177e4 523 return -EINVAL;
8bccd85f 524 new = mpol_new(mode, nodes);
525 if (IS_ERR(new))
526 return PTR_ERR(new);
527 mpol_free(current->mempolicy);
528 current->mempolicy = new;
529 if (new && new->policy == MPOL_INTERLEAVE)
dfcd3c0d 530 current->il_next = first_node(new->v.nodes);
531 return 0;
532}
533
534/* Fill a zone bitmap for a policy */
dfcd3c0d 535static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
536{
537 int i;
538
dfcd3c0d 539 nodes_clear(*nodes);
540 switch (p->policy) {
541 case MPOL_BIND:
542 for (i = 0; p->v.zonelist->zones[i]; i++)
543 node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
544 *nodes);
545 break;
546 case MPOL_DEFAULT:
547 break;
548 case MPOL_INTERLEAVE:
dfcd3c0d 549 *nodes = p->v.nodes;
550 break;
551 case MPOL_PREFERRED:
552 /* or use current node instead of online map? */
553 if (p->v.preferred_node < 0)
dfcd3c0d 554 *nodes = node_online_map;
1da177e4 555 else
dfcd3c0d 556 node_set(p->v.preferred_node, *nodes);
557 break;
558 default:
559 BUG();
560 }
561}
562
563static int lookup_node(struct mm_struct *mm, unsigned long addr)
564{
565 struct page *p;
566 int err;
567
568 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
569 if (err >= 0) {
570 err = page_to_nid(p);
571 put_page(p);
572 }
573 return err;
574}
575
1da177e4 576/* Retrieve NUMA policy */
577long do_get_mempolicy(int *policy, nodemask_t *nmask,
578 unsigned long addr, unsigned long flags)
1da177e4 579{
8bccd85f 580 int err;
581 struct mm_struct *mm = current->mm;
582 struct vm_area_struct *vma = NULL;
583 struct mempolicy *pol = current->mempolicy;
584
68860ec1 585 cpuset_update_current_mems_allowed();
586 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
587 return -EINVAL;
588 if (flags & MPOL_F_ADDR) {
589 down_read(&mm->mmap_sem);
590 vma = find_vma_intersection(mm, addr, addr+1);
591 if (!vma) {
592 up_read(&mm->mmap_sem);
593 return -EFAULT;
594 }
595 if (vma->vm_ops && vma->vm_ops->get_policy)
596 pol = vma->vm_ops->get_policy(vma, addr);
597 else
598 pol = vma->vm_policy;
599 } else if (addr)
600 return -EINVAL;
601
602 if (!pol)
603 pol = &default_policy;
604
605 if (flags & MPOL_F_NODE) {
606 if (flags & MPOL_F_ADDR) {
607 err = lookup_node(mm, addr);
608 if (err < 0)
609 goto out;
8bccd85f 610 *policy = err;
611 } else if (pol == current->mempolicy &&
612 pol->policy == MPOL_INTERLEAVE) {
8bccd85f 613 *policy = current->il_next;
614 } else {
615 err = -EINVAL;
616 goto out;
617 }
618 } else
8bccd85f 619 *policy = pol->policy;
620
621 if (vma) {
622 up_read(&current->mm->mmap_sem);
623 vma = NULL;
624 }
625
1da177e4 626 err = 0;
627 if (nmask)
628 get_zonemask(pol, nmask);
629
630 out:
631 if (vma)
632 up_read(&current->mm->mmap_sem);
633 return err;
634}
635
636/*
637 * For now migrate_pages simply swaps out the pages from nodes that are in
638 * the source set but not in the target set. In the future, we would
639 * want a function that moves pages between the two nodesets in such
640 * a way as to preserve the physical layout as much as possible.
641 *
 642 * Returns the number of pages that could not be moved.
643 */
644int do_migrate_pages(struct mm_struct *mm,
645 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
646{
647 LIST_HEAD(pagelist);
648 int count = 0;
649 nodemask_t nodes;
650
651 nodes_andnot(nodes, *from_nodes, *to_nodes);
652
653 down_read(&mm->mmap_sem);
654 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
655 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
d4984711 656
39743889 657 if (!list_empty(&pagelist)) {
658 count = swap_pages(&pagelist);
659 putback_lru_pages(&pagelist);
39743889 660 }
d4984711 661
662 up_read(&mm->mmap_sem);
663 return count;
664}
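/*
 * Illustrative only, not part of this file: user space reaches this through
 * the migrate_pages() system call defined further below, e.g. via the
 * libnuma <numaif.h> wrapper (assumed available):
 *
 *	unsigned long from = 1UL << 0, to = 1UL << 1;
 *
 *	// Ask the kernel to move the target pid's pages off node 0. With the
 *	// interim implementation above they are swapped out, not copied to
 *	// node 1.
 *	migrate_pages(pid, sizeof(from) * 8, &from, &to);
 */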
665
666/*
667 * User space interface with variable sized bitmaps for nodelists.
668 */
669
670/* Copy a node mask from user space. */
39743889 671static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
672 unsigned long maxnode)
673{
674 unsigned long k;
675 unsigned long nlongs;
676 unsigned long endmask;
677
678 --maxnode;
679 nodes_clear(*nodes);
680 if (maxnode == 0 || !nmask)
681 return 0;
682
683 nlongs = BITS_TO_LONGS(maxnode);
684 if ((maxnode % BITS_PER_LONG) == 0)
685 endmask = ~0UL;
686 else
687 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
688
689 /* When the user specified more nodes than supported just check
690 if the non supported part is all zero. */
691 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
692 if (nlongs > PAGE_SIZE/sizeof(long))
693 return -EINVAL;
694 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
695 unsigned long t;
696 if (get_user(t, nmask + k))
697 return -EFAULT;
698 if (k == nlongs - 1) {
699 if (t & endmask)
700 return -EINVAL;
701 } else if (t)
702 return -EINVAL;
703 }
704 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
705 endmask = ~0UL;
706 }
707
708 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
709 return -EFAULT;
710 nodes_addr(*nodes)[nlongs-1] &= endmask;
711 return 0;
712}
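/*
 * Worked example of the arithmetic above (illustrative): on a 64-bit kernel
 * a call with maxnode == 17 becomes maxnode = 16 after the decrement, so
 * nlongs = BITS_TO_LONGS(16) = 1 and endmask = (1UL << 16) - 1; only bits
 * 0-15 of the single long copied from user space survive the final masking.
 */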
713
714/* Copy a kernel node mask to user space */
715static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
716 nodemask_t *nodes)
717{
718 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
719 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
720
721 if (copy > nbytes) {
722 if (copy > PAGE_SIZE)
723 return -EINVAL;
724 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
725 return -EFAULT;
726 copy = nbytes;
727 }
728 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
729}
730
731asmlinkage long sys_mbind(unsigned long start, unsigned long len,
732 unsigned long mode,
733 unsigned long __user *nmask, unsigned long maxnode,
734 unsigned flags)
735{
736 nodemask_t nodes;
737 int err;
738
739 err = get_nodes(&nodes, nmask, maxnode);
740 if (err)
741 return err;
742 return do_mbind(start, len, mode, &nodes, flags);
743}
744
745/* Set the process memory policy */
746asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
747 unsigned long maxnode)
748{
749 int err;
750 nodemask_t nodes;
751
752 if (mode < 0 || mode > MPOL_MAX)
753 return -EINVAL;
754 err = get_nodes(&nodes, nmask, maxnode);
755 if (err)
756 return err;
757 return do_set_mempolicy(mode, &nodes);
758}
759
760/* Macro needed until Paul implements this function in kernel/cpusets.c */
761#define cpuset_mems_allowed(task) node_online_map
762
763asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
764 const unsigned long __user *old_nodes,
765 const unsigned long __user *new_nodes)
766{
767 struct mm_struct *mm;
768 struct task_struct *task;
769 nodemask_t old;
770 nodemask_t new;
771 nodemask_t task_nodes;
772 int err;
773
774 err = get_nodes(&old, old_nodes, maxnode);
775 if (err)
776 return err;
777
778 err = get_nodes(&new, new_nodes, maxnode);
779 if (err)
780 return err;
781
782 /* Find the mm_struct */
783 read_lock(&tasklist_lock);
784 task = pid ? find_task_by_pid(pid) : current;
785 if (!task) {
786 read_unlock(&tasklist_lock);
787 return -ESRCH;
788 }
789 mm = get_task_mm(task);
790 read_unlock(&tasklist_lock);
791
792 if (!mm)
793 return -EINVAL;
794
795 /*
796 * Check if this process has the right to modify the specified
797 * process. The right exists if the process has administrative
 798 * capabilities, superuser privileges or the same
799 * userid as the target process.
800 */
801 if ((current->euid != task->suid) && (current->euid != task->uid) &&
802 (current->uid != task->suid) && (current->uid != task->uid) &&
803 !capable(CAP_SYS_ADMIN)) {
804 err = -EPERM;
805 goto out;
806 }
807
808 task_nodes = cpuset_mems_allowed(task);
809 /* Is the user allowed to access the target nodes? */
810 if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
811 err = -EPERM;
812 goto out;
813 }
814
815 err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
816out:
817 mmput(mm);
818 return err;
819}
820
821
822/* Retrieve NUMA policy */
823asmlinkage long sys_get_mempolicy(int __user *policy,
824 unsigned long __user *nmask,
825 unsigned long maxnode,
826 unsigned long addr, unsigned long flags)
827{
828 int err, pval;
829 nodemask_t nodes;
830
831 if (nmask != NULL && maxnode < MAX_NUMNODES)
832 return -EINVAL;
833
834 err = do_get_mempolicy(&pval, &nodes, addr, flags);
835
836 if (err)
837 return err;
838
839 if (policy && put_user(pval, policy))
840 return -EFAULT;
841
842 if (nmask)
843 err = copy_nodes_to_user(nmask, maxnode, &nodes);
844
845 return err;
846}
847
848#ifdef CONFIG_COMPAT
849
850asmlinkage long compat_sys_get_mempolicy(int __user *policy,
851 compat_ulong_t __user *nmask,
852 compat_ulong_t maxnode,
853 compat_ulong_t addr, compat_ulong_t flags)
854{
855 long err;
856 unsigned long __user *nm = NULL;
857 unsigned long nr_bits, alloc_size;
858 DECLARE_BITMAP(bm, MAX_NUMNODES);
859
860 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
861 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
862
863 if (nmask)
864 nm = compat_alloc_user_space(alloc_size);
865
866 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
867
868 if (!err && nmask) {
869 err = copy_from_user(bm, nm, alloc_size);
870 /* ensure entire bitmap is zeroed */
871 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
872 err |= compat_put_bitmap(nmask, bm, nr_bits);
873 }
874
875 return err;
876}
877
878asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
879 compat_ulong_t maxnode)
880{
881 long err = 0;
882 unsigned long __user *nm = NULL;
883 unsigned long nr_bits, alloc_size;
884 DECLARE_BITMAP(bm, MAX_NUMNODES);
885
886 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
887 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
888
889 if (nmask) {
890 err = compat_get_bitmap(bm, nmask, nr_bits);
891 nm = compat_alloc_user_space(alloc_size);
892 err |= copy_to_user(nm, bm, alloc_size);
893 }
894
895 if (err)
896 return -EFAULT;
897
898 return sys_set_mempolicy(mode, nm, nr_bits+1);
899}
900
901asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
902 compat_ulong_t mode, compat_ulong_t __user *nmask,
903 compat_ulong_t maxnode, compat_ulong_t flags)
904{
905 long err = 0;
906 unsigned long __user *nm = NULL;
907 unsigned long nr_bits, alloc_size;
dfcd3c0d 908 nodemask_t bm;
909
910 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
911 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
912
913 if (nmask) {
dfcd3c0d 914 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1da177e4 915 nm = compat_alloc_user_space(alloc_size);
dfcd3c0d 916 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
917 }
918
919 if (err)
920 return -EFAULT;
921
922 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
923}
924
925#endif
926
927/* Return effective policy for a VMA */
928struct mempolicy *
929get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
1da177e4 930{
6e21c8f1 931 struct mempolicy *pol = task->mempolicy;
932
933 if (vma) {
934 if (vma->vm_ops && vma->vm_ops->get_policy)
8bccd85f 935 pol = vma->vm_ops->get_policy(vma, addr);
936 else if (vma->vm_policy &&
937 vma->vm_policy->policy != MPOL_DEFAULT)
938 pol = vma->vm_policy;
939 }
940 if (!pol)
941 pol = &default_policy;
942 return pol;
943}
944
945/* Return a zonelist representing a mempolicy */
dd0fc66f 946static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
947{
948 int nd;
949
950 switch (policy->policy) {
951 case MPOL_PREFERRED:
952 nd = policy->v.preferred_node;
953 if (nd < 0)
954 nd = numa_node_id();
955 break;
956 case MPOL_BIND:
957 /* Lower zones don't get a policy applied */
958 /* Careful: current->mems_allowed might have moved */
af4ca457 959 if (gfp_zone(gfp) >= policy_zone)
960 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
961 return policy->v.zonelist;
962 /*FALL THROUGH*/
963 case MPOL_INTERLEAVE: /* should not happen */
964 case MPOL_DEFAULT:
965 nd = numa_node_id();
966 break;
967 default:
968 nd = 0;
969 BUG();
970 }
af4ca457 971 return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
972}
973
974/* Do dynamic interleaving for a process */
975static unsigned interleave_nodes(struct mempolicy *policy)
976{
977 unsigned nid, next;
978 struct task_struct *me = current;
979
980 nid = me->il_next;
dfcd3c0d 981 next = next_node(nid, policy->v.nodes);
1da177e4 982 if (next >= MAX_NUMNODES)
dfcd3c0d 983 next = first_node(policy->v.nodes);
984 me->il_next = next;
985 return nid;
986}
987
988/* Do static interleaving for a VMA with known offset. */
989static unsigned offset_il_node(struct mempolicy *pol,
990 struct vm_area_struct *vma, unsigned long off)
991{
dfcd3c0d 992 unsigned nnodes = nodes_weight(pol->v.nodes);
993 unsigned target = (unsigned)off % nnodes;
994 int c;
995 int nid = -1;
996
997 c = 0;
998 do {
dfcd3c0d 999 nid = next_node(nid, pol->v.nodes);
1000 c++;
1001 } while (c <= target);
1002 return nid;
1003}
1004
1005/* Determine a node number for interleave */
1006static inline unsigned interleave_nid(struct mempolicy *pol,
1007 struct vm_area_struct *vma, unsigned long addr, int shift)
1008{
1009 if (vma) {
1010 unsigned long off;
1011
1012 off = vma->vm_pgoff;
1013 off += (addr - vma->vm_start) >> shift;
1014 return offset_il_node(pol, vma, off);
1015 } else
1016 return interleave_nodes(pol);
1017}
1018
1019/* Return a zonelist suitable for a huge page allocation. */
1020struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1021{
1022 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1023
1024 if (pol->policy == MPOL_INTERLEAVE) {
1025 unsigned nid;
1026
1027 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1028 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1029 }
1030 return zonelist_policy(GFP_HIGHUSER, pol);
1031}
1032
1033/* Allocate a page in interleaved policy.
1034 Own path because it needs to do special accounting. */
1035static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1036 unsigned nid)
1037{
1038 struct zonelist *zl;
1039 struct page *page;
1040
af4ca457 1041 zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1042 page = __alloc_pages(gfp, order, zl);
1043 if (page && page_zone(page) == zl->zones[0]) {
e7c8d5c9 1044 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1045 put_cpu();
1046 }
1047 return page;
1048}
1049
1050/**
1051 * alloc_page_vma - Allocate a page for a VMA.
1052 *
1053 * @gfp:
1054 * %GFP_USER user allocation.
1055 * %GFP_KERNEL kernel allocations,
1056 * %GFP_HIGHMEM highmem/user allocations,
1057 * %GFP_FS allocation should not call back into a file system.
1058 * %GFP_ATOMIC don't sleep.
1059 *
1060 * @vma: Pointer to VMA or NULL if not available.
1061 * @addr: Virtual Address of the allocation. Must be inside the VMA.
1062 *
1063 * This function allocates a page from the kernel page pool and applies
1064 * a NUMA policy associated with the VMA or the current process.
1065 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
1066 * mm_struct of the VMA to prevent it from going away. Should be used for
1067 * all allocations for pages that will be mapped into
1068 * user space. Returns NULL when no page can be allocated.
1069 *
 1070 * Should be called with the mmap_sem of the vma held.
1071 */
1072struct page *
dd0fc66f 1073alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1da177e4 1074{
6e21c8f1 1075 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1076
1077 cpuset_update_current_mems_allowed();
1078
1079 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1080 unsigned nid;
1081
1082 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1083 return alloc_page_interleave(gfp, 0, nid);
1084 }
1085 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1086}
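/*
 * Illustrative only: a typical caller is an anonymous fault path, which,
 * with the VMA's mmap_sem held for read, would do something along the
 * lines of
 *
 *	page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *
 * so that the newly faulted page is placed according to the VMA policy.
 */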
1087
1088/**
1089 * alloc_pages_current - Allocate pages.
1090 *
1091 * @gfp:
1092 * %GFP_USER user allocation,
1093 * %GFP_KERNEL kernel allocation,
1094 * %GFP_HIGHMEM highmem allocation,
1095 * %GFP_FS don't call back into a file system.
1096 * %GFP_ATOMIC don't sleep.
1097 * @order: Power of two of allocation size in pages. 0 is a single page.
1098 *
1099 * Allocate a page from the kernel page pool. When not in
 1100 * interrupt context, the current process' NUMA policy is applied.
1101 * Returns NULL when no page can be allocated.
1102 *
1103 * Don't call cpuset_update_current_mems_allowed() unless
1104 * 1) it's ok to take cpuset_sem (can WAIT), and
1105 * 2) allocating for current task (not interrupt).
1106 */
dd0fc66f 1107struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1108{
1109 struct mempolicy *pol = current->mempolicy;
1110
1111 if ((gfp & __GFP_WAIT) && !in_interrupt())
1112 cpuset_update_current_mems_allowed();
1113 if (!pol || in_interrupt())
1114 pol = &default_policy;
1115 if (pol->policy == MPOL_INTERLEAVE)
1116 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1117 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1118}
1119EXPORT_SYMBOL(alloc_pages_current);
1120
1121/* Slow path of a mempolicy copy */
1122struct mempolicy *__mpol_copy(struct mempolicy *old)
1123{
1124 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1125
1126 if (!new)
1127 return ERR_PTR(-ENOMEM);
1128 *new = *old;
1129 atomic_set(&new->refcnt, 1);
1130 if (new->policy == MPOL_BIND) {
1131 int sz = ksize(old->v.zonelist);
1132 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1133 if (!new->v.zonelist) {
1134 kmem_cache_free(policy_cache, new);
1135 return ERR_PTR(-ENOMEM);
1136 }
1137 memcpy(new->v.zonelist, old->v.zonelist, sz);
1138 }
1139 return new;
1140}
1141
1142/* Slow path of a mempolicy comparison */
1143int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1144{
1145 if (!a || !b)
1146 return 0;
1147 if (a->policy != b->policy)
1148 return 0;
1149 switch (a->policy) {
1150 case MPOL_DEFAULT:
1151 return 1;
1152 case MPOL_INTERLEAVE:
dfcd3c0d 1153 return nodes_equal(a->v.nodes, b->v.nodes);
1154 case MPOL_PREFERRED:
1155 return a->v.preferred_node == b->v.preferred_node;
1156 case MPOL_BIND: {
1157 int i;
1158 for (i = 0; a->v.zonelist->zones[i]; i++)
1159 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1160 return 0;
1161 return b->v.zonelist->zones[i] == NULL;
1162 }
1163 default:
1164 BUG();
1165 return 0;
1166 }
1167}
1168
1169/* Slow path of a mpol destructor. */
1170void __mpol_free(struct mempolicy *p)
1171{
1172 if (!atomic_dec_and_test(&p->refcnt))
1173 return;
1174 if (p->policy == MPOL_BIND)
1175 kfree(p->v.zonelist);
1176 p->policy = MPOL_DEFAULT;
1177 kmem_cache_free(policy_cache, p);
1178}
1179
1180/*
1181 * Shared memory backing store policy support.
1182 *
1183 * Remember policies even when nobody has shared memory mapped.
1184 * The policies are kept in Red-Black tree linked from the inode.
1185 * They are protected by the sp->lock spinlock, which should be held
1186 * for any accesses to the tree.
1187 */
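/*
 * Illustrative only: tmpfs/shmem is the expected user of this interface; its
 * vm_ops get_policy/set_policy hooks are assumed to call
 * mpol_shared_policy_lookup() and mpol_set_shared_policy() below with the
 * file offset (pgoff) as the index, which is how a policy can outlive the
 * mappings of the object.
 */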
1188
1189/* lookup first element intersecting start-end */
1190/* Caller holds sp->lock */
1191static struct sp_node *
1192sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1193{
1194 struct rb_node *n = sp->root.rb_node;
1195
1196 while (n) {
1197 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1198
1199 if (start >= p->end)
1200 n = n->rb_right;
1201 else if (end <= p->start)
1202 n = n->rb_left;
1203 else
1204 break;
1205 }
1206 if (!n)
1207 return NULL;
1208 for (;;) {
1209 struct sp_node *w = NULL;
1210 struct rb_node *prev = rb_prev(n);
1211 if (!prev)
1212 break;
1213 w = rb_entry(prev, struct sp_node, nd);
1214 if (w->end <= start)
1215 break;
1216 n = prev;
1217 }
1218 return rb_entry(n, struct sp_node, nd);
1219}
1220
1221/* Insert a new shared policy into the list. */
1222/* Caller holds sp->lock */
1223static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1224{
1225 struct rb_node **p = &sp->root.rb_node;
1226 struct rb_node *parent = NULL;
1227 struct sp_node *nd;
1228
1229 while (*p) {
1230 parent = *p;
1231 nd = rb_entry(parent, struct sp_node, nd);
1232 if (new->start < nd->start)
1233 p = &(*p)->rb_left;
1234 else if (new->end > nd->end)
1235 p = &(*p)->rb_right;
1236 else
1237 BUG();
1238 }
1239 rb_link_node(&new->nd, parent, p);
1240 rb_insert_color(&new->nd, &sp->root);
1241 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1242 new->policy ? new->policy->policy : 0);
1243}
1244
1245/* Find shared policy intersecting idx */
1246struct mempolicy *
1247mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1248{
1249 struct mempolicy *pol = NULL;
1250 struct sp_node *sn;
1251
1252 if (!sp->root.rb_node)
1253 return NULL;
1254 spin_lock(&sp->lock);
1255 sn = sp_lookup(sp, idx, idx+1);
1256 if (sn) {
1257 mpol_get(sn->policy);
1258 pol = sn->policy;
1259 }
1260 spin_unlock(&sp->lock);
1261 return pol;
1262}
1263
1264static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1265{
1266 PDprintk("deleting %lx-l%x\n", n->start, n->end);
1267 rb_erase(&n->nd, &sp->root);
1268 mpol_free(n->policy);
1269 kmem_cache_free(sn_cache, n);
1270}
1271
1272struct sp_node *
1273sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1274{
1275 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1276
1277 if (!n)
1278 return NULL;
1279 n->start = start;
1280 n->end = end;
1281 mpol_get(pol);
1282 n->policy = pol;
1283 return n;
1284}
1285
1286/* Replace a policy range. */
1287static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1288 unsigned long end, struct sp_node *new)
1289{
1290 struct sp_node *n, *new2 = NULL;
1291
1292restart:
1293 spin_lock(&sp->lock);
1294 n = sp_lookup(sp, start, end);
1295 /* Take care of old policies in the same range. */
1296 while (n && n->start < end) {
1297 struct rb_node *next = rb_next(&n->nd);
1298 if (n->start >= start) {
1299 if (n->end <= end)
1300 sp_delete(sp, n);
1301 else
1302 n->start = end;
1303 } else {
1304 /* Old policy spanning whole new range. */
1305 if (n->end > end) {
1306 if (!new2) {
1307 spin_unlock(&sp->lock);
1308 new2 = sp_alloc(end, n->end, n->policy);
1309 if (!new2)
1310 return -ENOMEM;
1311 goto restart;
1312 }
1313 n->end = start;
1314 sp_insert(sp, new2);
1315 new2 = NULL;
1316 break;
1317 } else
1318 n->end = start;
1319 }
1320 if (!next)
1321 break;
1322 n = rb_entry(next, struct sp_node, nd);
1323 }
1324 if (new)
1325 sp_insert(sp, new);
1326 spin_unlock(&sp->lock);
1327 if (new2) {
1328 mpol_free(new2->policy);
1329 kmem_cache_free(sn_cache, new2);
1330 }
1331 return 0;
1332}
1333
1334int mpol_set_shared_policy(struct shared_policy *info,
1335 struct vm_area_struct *vma, struct mempolicy *npol)
1336{
1337 int err;
1338 struct sp_node *new = NULL;
1339 unsigned long sz = vma_pages(vma);
1340
1341 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1342 vma->vm_pgoff,
1343 sz, npol? npol->policy : -1,
dfcd3c0d 1344 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1345
1346 if (npol) {
1347 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1348 if (!new)
1349 return -ENOMEM;
1350 }
1351 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1352 if (err && new)
1353 kmem_cache_free(sn_cache, new);
1354 return err;
1355}
1356
1357/* Free a backing policy store on inode delete. */
1358void mpol_free_shared_policy(struct shared_policy *p)
1359{
1360 struct sp_node *n;
1361 struct rb_node *next;
1362
1363 if (!p->root.rb_node)
1364 return;
1365 spin_lock(&p->lock);
1366 next = rb_first(&p->root);
1367 while (next) {
1368 n = rb_entry(next, struct sp_node, nd);
1369 next = rb_next(&n->nd);
90c5029e 1370 rb_erase(&n->nd, &p->root);
1371 mpol_free(n->policy);
1372 kmem_cache_free(sn_cache, n);
1373 }
1374 spin_unlock(&p->lock);
1375}
1376
1377/* assumes fs == KERNEL_DS */
1378void __init numa_policy_init(void)
1379{
1380 policy_cache = kmem_cache_create("numa_policy",
1381 sizeof(struct mempolicy),
1382 0, SLAB_PANIC, NULL, NULL);
1383
1384 sn_cache = kmem_cache_create("shared_policy_node",
1385 sizeof(struct sp_node),
1386 0, SLAB_PANIC, NULL, NULL);
1387
1388 /* Set interleaving policy for system init. This way not all
1389 the data structures allocated at system boot end up in node zero. */
1390
8bccd85f 1391 if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1392 printk("numa_policy_init: interleaving failed\n");
1393}
1394
8bccd85f 1395/* Reset policy of current process to default */
1396void numa_default_policy(void)
1397{
8bccd85f 1398 do_set_mempolicy(MPOL_DEFAULT, NULL);
1da177e4 1399}
1400
1401/* Migrate a policy to a different set of nodes */
1402static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1403 const nodemask_t *new)
1404{
1405 nodemask_t tmp;
1406
1407 if (!pol)
1408 return;
1409
1410 switch (pol->policy) {
1411 case MPOL_DEFAULT:
1412 break;
1413 case MPOL_INTERLEAVE:
1414 nodes_remap(tmp, pol->v.nodes, *old, *new);
1415 pol->v.nodes = tmp;
1416 current->il_next = node_remap(current->il_next, *old, *new);
1417 break;
1418 case MPOL_PREFERRED:
1419 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1420 *old, *new);
1421 break;
1422 case MPOL_BIND: {
1423 nodemask_t nodes;
1424 struct zone **z;
1425 struct zonelist *zonelist;
1426
1427 nodes_clear(nodes);
1428 for (z = pol->v.zonelist->zones; *z; z++)
1429 node_set((*z)->zone_pgdat->node_id, nodes);
1430 nodes_remap(tmp, nodes, *old, *new);
1431 nodes = tmp;
1432
1433 zonelist = bind_zonelist(&nodes);
1434
1435 /* If no mem, then zonelist is NULL and we keep old zonelist.
1436 * If that old zonelist has no remaining mems_allowed nodes,
1437 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1438 */
1439
1440 if (zonelist) {
1441 /* Good - got mem - substitute new zonelist */
1442 kfree(pol->v.zonelist);
1443 pol->v.zonelist = zonelist;
1444 }
1445 break;
1446 }
1447 default:
1448 BUG();
1449 break;
1450 }
1451}
1452
1453/*
1454 * Someone moved this task to different nodes. Fixup mempolicies.
1455 *
1456 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
1457 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
1458 */
1459void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
1460{
1461 rebind_policy(current->mempolicy, old, new);
1462}