/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave   Allocate memory interleaved over a set of nodes,
 *              with normal fallback if it fails.
 *              For VMA based allocations this interleaves based on the
 *              offset into the backing object or offset into the mapping
 *              for anonymous memory. For process policy a process counter
 *              is used.
 *
 * bind         Only allocate memory on a specific set of nodes,
 *              no fallback.
 *              FIXME: memory is allocated starting with the first node
 *              to the last. It would be better if bind would truly restrict
 *              the allocation to memory nodes instead.
 *
 * preferred    Try a specific node first before normal fallback.
 *              As a special case node -1 here means do the allocation
 *              on the local CPU. This is normally identical to default,
 *              but useful to set in a VMA when you have a non default
 *              process policy.
 *
 * default      Allocate on the local node first, or when on a VMA
 *              use the process policy. This is what Linux always did
 *              in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/
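
/*
 * Illustrative user-space sketch (an assumption, not part of this file):
 * how the four policies described above are requested for the calling
 * process with the set_mempolicy() wrapper from libnuma's <numaif.h>
 * (link with -lnuma).
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *
 *	set_mempolicy(MPOL_DEFAULT, NULL, 0);
 *	set_mempolicy(MPOL_PREFERRED, &mask, sizeof(mask) * 8 + 1);
 *	set_mempolicy(MPOL_BIND, &mask, sizeof(mask) * 8 + 1);
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8 + 1);
 */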

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
#define MPOL_MF_STATS        (MPOL_MF_INTERNAL << 2)	/* Gather statistics */

static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
int policy_zone = ZONE_DMA;

struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .policy = MPOL_DEFAULT,
};

/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
        int empty = nodes_empty(*nodes);

        switch (mode) {
        case MPOL_DEFAULT:
                if (!empty)
                        return -EINVAL;
                break;
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
                /* Preferred will only use the first bit, but allow
                   more for now. */
                if (empty)
                        return -EINVAL;
                break;
        }
        return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}

/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
        struct zonelist *zl;
        int num, max, nd;

        max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
        zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
        if (!zl)
                return NULL;
        num = 0;
        for_each_node_mask(nd, *nodes)
                zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
        zl->zones[num] = NULL;
        return zl;
}

/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
        struct mempolicy *policy;

        PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
        if (mode == MPOL_DEFAULT)
                return NULL;
        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!policy)
                return ERR_PTR(-ENOMEM);
        atomic_set(&policy->refcnt, 1);
        switch (mode) {
        case MPOL_INTERLEAVE:
                policy->v.nodes = *nodes;
                if (nodes_weight(*nodes) == 0) {
                        kmem_cache_free(policy_cache, policy);
                        return ERR_PTR(-EINVAL);
                }
                break;
        case MPOL_PREFERRED:
                policy->v.preferred_node = first_node(*nodes);
                if (policy->v.preferred_node >= MAX_NUMNODES)
                        policy->v.preferred_node = -1;
                break;
        case MPOL_BIND:
                policy->v.zonelist = bind_zonelist(nodes);
                if (policy->v.zonelist == NULL) {
                        kmem_cache_free(policy_cache, policy);
                        return ERR_PTR(-ENOMEM);
                }
                break;
        }
        policy->policy = mode;
        return policy;
}

/* Check if we are the only process mapping the page in question */
static inline int single_mm_mapping(struct mm_struct *mm,
                                struct address_space *mapping)
{
        struct vm_area_struct *vma;
        struct prio_tree_iter iter;
        int rc = 1;

        spin_lock(&mapping->i_mmap_lock);
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
                if (mm != vma->vm_mm) {
                        rc = 0;
                        goto out;
                }
        list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
                if (mm != vma->vm_mm) {
                        rc = 0;
                        goto out;
                }
out:
        spin_unlock(&mapping->i_mmap_lock);
        return rc;
}

/*
 * Add a page to be migrated to the pagelist
 */
static void migrate_page_add(struct vm_area_struct *vma,
        struct page *page, struct list_head *pagelist, unsigned long flags)
{
        /*
         * Avoid migrating a page that is shared by others and not writable.
         */
        if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
            mapping_writably_mapped(page->mapping) ||
            single_mm_mapping(vma->vm_mm, page->mapping)) {
                int rc = isolate_lru_page(page);

                if (rc == 1)
                        list_add(&page->lru, pagelist);
                /*
                 * If the isolate attempt was not successful then we just
                 * encountered an unswappable page. Something must be wrong.
                 */
                WARN_ON(rc == 0);
        }
}

static void gather_stats(struct page *, void *);

/* Scan through pages checking if pages follow certain conditions. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, unsigned long end,
                const nodemask_t *nodes, unsigned long flags,
                void *private)
{
        pte_t *orig_pte;
        pte_t *pte;
        spinlock_t *ptl;

        orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        do {
                struct page *page;
                unsigned int nid;

                if (!pte_present(*pte))
                        continue;
                page = vm_normal_page(vma, addr, *pte);
                if (!page)
                        continue;
                nid = page_to_nid(page);
                if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
                        continue;

                if (flags & MPOL_MF_STATS)
                        gather_stats(page, private);
                else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
                        spin_unlock(ptl);
                        migrate_page_add(vma, page, private, flags);
                        spin_lock(ptl);
                } else
                        break;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        pte_unmap_unlock(orig_pte, ptl);
        return addr != end;
}

static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                unsigned long addr, unsigned long end,
                const nodemask_t *nodes, unsigned long flags,
                void *private)
{
        pmd_t *pmd;
        unsigned long next;

        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                        continue;
                if (check_pte_range(vma, pmd, addr, next, nodes,
                                    flags, private))
                        return -EIO;
        } while (pmd++, addr = next, addr != end);
        return 0;
}

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
                unsigned long addr, unsigned long end,
                const nodemask_t *nodes, unsigned long flags,
                void *private)
{
        pud_t *pud;
        unsigned long next;

        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
                if (check_pmd_range(vma, pud, addr, next, nodes,
                                    flags, private))
                        return -EIO;
        } while (pud++, addr = next, addr != end);
        return 0;
}

static inline int check_pgd_range(struct vm_area_struct *vma,
                unsigned long addr, unsigned long end,
                const nodemask_t *nodes, unsigned long flags,
                void *private)
{
        pgd_t *pgd;
        unsigned long next;

        pgd = pgd_offset(vma->vm_mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                if (check_pud_range(vma, pgd, addr, next, nodes,
                                    flags, private))
                        return -EIO;
        } while (pgd++, addr = next, addr != end);
        return 0;
}

/* Check if a vma is migratable */
static inline int vma_migratable(struct vm_area_struct *vma)
{
        if (vma->vm_flags & (VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
                return 0;
        return 1;
}

/*
 * Check if all pages in a range are on a set of nodes.
 * If pagelist != NULL then isolate pages from the LRU and
 * put them on the pagelist.
 */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
                const nodemask_t *nodes, unsigned long flags, void *private)
{
        int err;
        struct vm_area_struct *first, *vma, *prev;

        first = find_vma(mm, start);
        if (!first)
                return ERR_PTR(-EFAULT);
        prev = NULL;
        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
                if (!(flags & MPOL_MF_DISCONTIG_OK)) {
                        if (!vma->vm_next && vma->vm_end < end)
                                return ERR_PTR(-EFAULT);
                        if (prev && prev->vm_end < vma->vm_start)
                                return ERR_PTR(-EFAULT);
                }
                if (!is_vm_hugetlb_page(vma) &&
                    ((flags & MPOL_MF_STRICT) ||
                     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
                                vma_migratable(vma)))) {
                        unsigned long endvma = vma->vm_end;

                        if (endvma > end)
                                endvma = end;
                        if (vma->vm_start > start)
                                start = vma->vm_start;
                        err = check_pgd_range(vma, start, endvma, nodes,
                                                flags, private);
                        if (err) {
                                first = ERR_PTR(err);
                                break;
                        }
                }
                prev = vma;
        }
        return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
        int err = 0;
        struct mempolicy *old = vma->vm_policy;

        PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
                 vma->vm_ops, vma->vm_file,
                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

        if (vma->vm_ops && vma->vm_ops->set_policy)
                err = vma->vm_ops->set_policy(vma, new);
        if (!err) {
                mpol_get(new);
                vma->vm_policy = new;
                mpol_free(old);
        }
        return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
                       unsigned long end, struct mempolicy *new)
{
        struct vm_area_struct *next;
        int err;

        err = 0;
        for (; vma && vma->vm_start < end; vma = next) {
                next = vma->vm_next;
                if (vma->vm_start < start)
                        err = split_vma(vma->vm_mm, vma, start, 1);
                if (!err && vma->vm_end > end)
                        err = split_vma(vma->vm_mm, vma, end, 0);
                if (!err)
                        err = policy_vma(vma, new);
                if (err)
                        break;
        }
        return err;
}

static int contextualize_policy(int mode, nodemask_t *nodes)
{
        if (!nodes)
                return 0;

        /* Update current mems_allowed */
        cpuset_update_current_mems_allowed();
        /* Ignore nodes not set in current->mems_allowed */
        cpuset_restrict_to_mems_allowed(nodes->bits);
        return mpol_check_policy(mode, nodes);
}

static int swap_pages(struct list_head *pagelist)
{
        LIST_HEAD(moved);
        LIST_HEAD(failed);
        int n;

        n = migrate_pages(pagelist, NULL, &moved, &failed);
        putback_lru_pages(&failed);
        putback_lru_pages(&moved);

        return n;
}

long do_mbind(unsigned long start, unsigned long len,
                unsigned long mode, nodemask_t *nmask, unsigned long flags)
{
        struct vm_area_struct *vma;
        struct mm_struct *mm = current->mm;
        struct mempolicy *new;
        unsigned long end;
        int err;
        LIST_HEAD(pagelist);

        if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
                                      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
            || mode > MPOL_MAX)
                return -EINVAL;
        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
                return -EPERM;

        if (start & ~PAGE_MASK)
                return -EINVAL;

        if (mode == MPOL_DEFAULT)
                flags &= ~MPOL_MF_STRICT;

        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
        end = start + len;

        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;

        if (mpol_check_policy(mode, nmask))
                return -EINVAL;

        new = mpol_new(mode, nmask);
        if (IS_ERR(new))
                return PTR_ERR(new);

        /*
         * If we are using the default policy then operation
         * on discontinuous address spaces is okay after all
         */
        if (!new)
                flags |= MPOL_MF_DISCONTIG_OK;

        PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
                 mode, nodes_addr(*nmask)[0]);

        down_write(&mm->mmap_sem);
        vma = check_range(mm, start, end, nmask,
                          flags | MPOL_MF_INVERT, &pagelist);

        err = PTR_ERR(vma);
        if (!IS_ERR(vma)) {
                int nr_failed = 0;

                err = mbind_range(vma, start, end, new);
                if (!list_empty(&pagelist))
                        nr_failed = swap_pages(&pagelist);

                if (!err && nr_failed && (flags & MPOL_MF_STRICT))
                        err = -EIO;
        }
        if (!list_empty(&pagelist))
                putback_lru_pages(&pagelist);

        up_write(&mm->mmap_sem);
        mpol_free(new);
        return err;
}

/* Set the process memory policy */
long do_set_mempolicy(int mode, nodemask_t *nodes)
{
        struct mempolicy *new;

        if (contextualize_policy(mode, nodes))
                return -EINVAL;
        new = mpol_new(mode, nodes);
        if (IS_ERR(new))
                return PTR_ERR(new);
        mpol_free(current->mempolicy);
        current->mempolicy = new;
        if (new && new->policy == MPOL_INTERLEAVE)
                current->il_next = first_node(new->v.nodes);
        return 0;
}

/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
        int i;

        nodes_clear(*nodes);
        switch (p->policy) {
        case MPOL_BIND:
                for (i = 0; p->v.zonelist->zones[i]; i++)
                        node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
                                 *nodes);
                break;
        case MPOL_DEFAULT:
                break;
        case MPOL_INTERLEAVE:
                *nodes = p->v.nodes;
                break;
        case MPOL_PREFERRED:
                /* or use current node instead of online map? */
                if (p->v.preferred_node < 0)
                        *nodes = node_online_map;
                else
                        node_set(p->v.preferred_node, *nodes);
                break;
        default:
                BUG();
        }
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
        struct page *p;
        int err;

        err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
        if (err >= 0) {
                err = page_to_nid(p);
                put_page(p);
        }
        return err;
}

/* Retrieve NUMA policy */
long do_get_mempolicy(int *policy, nodemask_t *nmask,
                      unsigned long addr, unsigned long flags)
{
        int err;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        struct mempolicy *pol = current->mempolicy;

        cpuset_update_current_mems_allowed();
        if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
                return -EINVAL;
        if (flags & MPOL_F_ADDR) {
                down_read(&mm->mmap_sem);
                vma = find_vma_intersection(mm, addr, addr+1);
                if (!vma) {
                        up_read(&mm->mmap_sem);
                        return -EFAULT;
                }
                if (vma->vm_ops && vma->vm_ops->get_policy)
                        pol = vma->vm_ops->get_policy(vma, addr);
                else
                        pol = vma->vm_policy;
        } else if (addr)
                return -EINVAL;

        if (!pol)
                pol = &default_policy;

        if (flags & MPOL_F_NODE) {
                if (flags & MPOL_F_ADDR) {
                        err = lookup_node(mm, addr);
                        if (err < 0)
                                goto out;
                        *policy = err;
                } else if (pol == current->mempolicy &&
                                pol->policy == MPOL_INTERLEAVE) {
                        *policy = current->il_next;
                } else {
                        err = -EINVAL;
                        goto out;
                }
        } else
                *policy = pol->policy;

        if (vma) {
                up_read(&current->mm->mmap_sem);
                vma = NULL;
        }

        err = 0;
        if (nmask)
                get_zonemask(pol, nmask);

 out:
        if (vma)
                up_read(&current->mm->mmap_sem);
        return err;
}

/*
 * For now migrate_pages simply swaps out the pages from nodes that are in
 * the source set but not in the target set. In the future, we would
 * want a function that moves pages between the two nodesets in such
 * a way as to preserve the physical layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm,
        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
        LIST_HEAD(pagelist);
        int count = 0;
        nodemask_t nodes;

        nodes_andnot(nodes, *from_nodes, *to_nodes);

        down_read(&mm->mmap_sem);
        check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);

        if (!list_empty(&pagelist)) {
                count = swap_pages(&pagelist);
                putback_lru_pages(&pagelist);
        }

        up_read(&mm->mmap_sem);
        return count;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
                     unsigned long maxnode)
{
        unsigned long k;
        unsigned long nlongs;
        unsigned long endmask;

        --maxnode;
        nodes_clear(*nodes);
        if (maxnode == 0 || !nmask)
                return 0;

        nlongs = BITS_TO_LONGS(maxnode);
        if ((maxnode % BITS_PER_LONG) == 0)
                endmask = ~0UL;
        else
                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

        /* When the user specified more nodes than supported just check
           if the non supported part is all zero. */
        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
                if (nlongs > PAGE_SIZE/sizeof(long))
                        return -EINVAL;
                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
                        unsigned long t;
                        if (get_user(t, nmask + k))
                                return -EFAULT;
                        if (k == nlongs - 1) {
                                if (t & endmask)
                                        return -EINVAL;
                        } else if (t)
                                return -EINVAL;
                }
                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
                endmask = ~0UL;
        }

        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
                return -EFAULT;
        nodes_addr(*nodes)[nlongs-1] &= endmask;
        return 0;
}
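
/*
 * Worked example (illustrative, not from the original source): with
 * maxnode = 65 from user space, get_nodes() considers bits 0..63, so on a
 * 64-bit kernel nlongs = BITS_TO_LONGS(64) = 1 and endmask = ~0UL; with
 * maxnode = 17 it considers bits 0..15, so endmask = (1UL << 16) - 1 =
 * 0xffff and any higher bits in the copied word are cleared by the final
 * "&= endmask".
 */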

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
                              nodemask_t *nodes)
{
        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

        if (copy > nbytes) {
                if (copy > PAGE_SIZE)
                        return -EINVAL;
                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
                        return -EFAULT;
                copy = nbytes;
        }
        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

asmlinkage long sys_mbind(unsigned long start, unsigned long len,
                        unsigned long mode,
                        unsigned long __user *nmask, unsigned long maxnode,
                        unsigned flags)
{
        nodemask_t nodes;
        int err;

        err = get_nodes(&nodes, nmask, maxnode);
        if (err)
                return err;
        return do_mbind(start, len, mode, &nodes, flags);
}
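
/*
 * Illustrative user-space sketch (an assumption, not part of this file):
 * binding an anonymous mapping to nodes 0-1 via the mbind() wrapper that
 * libnuma's <numaif.h> provides for this syscall.
 *
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *	if (mbind(buf, len, MPOL_BIND, &mask, sizeof(mask) * 8 + 1,
 *		  MPOL_MF_STRICT) < 0)
 *		perror("mbind");
 */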

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
                unsigned long maxnode)
{
        int err;
        nodemask_t nodes;

        if (mode < 0 || mode > MPOL_MAX)
                return -EINVAL;
        err = get_nodes(&nodes, nmask, maxnode);
        if (err)
                return err;
        return do_set_mempolicy(mode, &nodes);
}

/* Macro needed until Paul implements this function in kernel/cpusets.c */
#define cpuset_mems_allowed(task) node_online_map

asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
                const unsigned long __user *old_nodes,
                const unsigned long __user *new_nodes)
{
        struct mm_struct *mm;
        struct task_struct *task;
        nodemask_t old;
        nodemask_t new;
        nodemask_t task_nodes;
        int err;

        err = get_nodes(&old, old_nodes, maxnode);
        if (err)
                return err;

        err = get_nodes(&new, new_nodes, maxnode);
        if (err)
                return err;

        /* Find the mm_struct */
        read_lock(&tasklist_lock);
        task = pid ? find_task_by_pid(pid) : current;
        if (!task) {
                read_unlock(&tasklist_lock);
                return -ESRCH;
        }
        mm = get_task_mm(task);
        read_unlock(&tasklist_lock);

        if (!mm)
                return -EINVAL;

        /*
         * Check if this process has the right to modify the specified
         * process. The right exists if the process has administrative
         * capabilities, superuser privileges or the same
         * userid as the target process.
         */
        if ((current->euid != task->suid) && (current->euid != task->uid) &&
            (current->uid != task->suid) && (current->uid != task->uid) &&
            !capable(CAP_SYS_ADMIN)) {
                err = -EPERM;
                goto out;
        }

        task_nodes = cpuset_mems_allowed(task);
        /* Is the user allowed to access the target nodes? */
        if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
                err = -EPERM;
                goto out;
        }

        err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
out:
        mmput(mm);
        return err;
}
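
/*
 * Illustrative user-space sketch (an assumption, not part of this file):
 * moving all of process <pid>'s pages from node 0 to node 1 with the
 * migrate_pages() wrapper from libnuma's <numaif.h>.
 *
 *	unsigned long from = 1UL << 0, to = 1UL << 1;
 *	long left = migrate_pages(pid, sizeof(from) * 8 + 1, &from, &to);
 *	if (left < 0)
 *		perror("migrate_pages");
 *	else if (left > 0)
 *		fprintf(stderr, "%ld pages could not be moved\n", left);
 */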

/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
                                unsigned long __user *nmask,
                                unsigned long maxnode,
                                unsigned long addr, unsigned long flags)
{
        int err, pval;
        nodemask_t nodes;

        if (nmask != NULL && maxnode < MAX_NUMNODES)
                return -EINVAL;

        err = do_get_mempolicy(&pval, &nodes, addr, flags);

        if (err)
                return err;

        if (policy && put_user(pval, policy))
                return -EFAULT;

        if (nmask)
                err = copy_nodes_to_user(nmask, maxnode, &nodes);

        return err;
}
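
/*
 * Illustrative user-space sketch (an assumption, not part of this file):
 * asking which node currently backs the page at "addr", using the
 * get_mempolicy() wrapper from libnuma's <numaif.h> with
 * MPOL_F_NODE | MPOL_F_ADDR.
 *
 *	int node;
 *	if (get_mempolicy(&node, NULL, 0, addr,
 *			  MPOL_F_NODE | MPOL_F_ADDR) == 0)
 *		printf("page at %p is on node %d\n", addr, node);
 */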

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
                                     compat_ulong_t __user *nmask,
                                     compat_ulong_t maxnode,
                                     compat_ulong_t addr, compat_ulong_t flags)
{
        long err;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        DECLARE_BITMAP(bm, MAX_NUMNODES);

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask)
                nm = compat_alloc_user_space(alloc_size);

        err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

        if (!err && nmask) {
                err = copy_from_user(bm, nm, alloc_size);
                /* ensure entire bitmap is zeroed */
                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
                err |= compat_put_bitmap(nmask, bm, nr_bits);
        }

        return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
                                     compat_ulong_t maxnode)
{
        long err = 0;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        DECLARE_BITMAP(bm, MAX_NUMNODES);

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask) {
                err = compat_get_bitmap(bm, nmask, nr_bits);
                nm = compat_alloc_user_space(alloc_size);
                err |= copy_to_user(nm, bm, alloc_size);
        }

        if (err)
                return -EFAULT;

        return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
                             compat_ulong_t mode, compat_ulong_t __user *nmask,
                             compat_ulong_t maxnode, compat_ulong_t flags)
{
        long err = 0;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        nodemask_t bm;

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask) {
                err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
                nm = compat_alloc_user_space(alloc_size);
                err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
        }

        if (err)
                return -EFAULT;

        return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/* Return effective policy for a VMA */
static struct mempolicy *get_vma_policy(struct task_struct *task,
                struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = task->mempolicy;

        if (vma) {
                if (vma->vm_ops && vma->vm_ops->get_policy)
                        pol = vma->vm_ops->get_policy(vma, addr);
                else if (vma->vm_policy &&
                                vma->vm_policy->policy != MPOL_DEFAULT)
                        pol = vma->vm_policy;
        }
        if (!pol)
                pol = &default_policy;
        return pol;
}

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
        int nd;

        switch (policy->policy) {
        case MPOL_PREFERRED:
                nd = policy->v.preferred_node;
                if (nd < 0)
                        nd = numa_node_id();
                break;
        case MPOL_BIND:
                /* Lower zones don't get a policy applied */
                /* Careful: current->mems_allowed might have moved */
                if (gfp_zone(gfp) >= policy_zone)
                        if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
                                return policy->v.zonelist;
                /*FALL THROUGH*/
        case MPOL_INTERLEAVE: /* should not happen */
        case MPOL_DEFAULT:
                nd = numa_node_id();
                break;
        default:
                nd = 0;
                BUG();
        }
        return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
        unsigned nid, next;
        struct task_struct *me = current;

        nid = me->il_next;
        next = next_node(nid, policy->v.nodes);
        if (next >= MAX_NUMNODES)
                next = first_node(policy->v.nodes);
        me->il_next = next;
        return nid;
}

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
                struct vm_area_struct *vma, unsigned long off)
{
        unsigned nnodes = nodes_weight(pol->v.nodes);
        unsigned target = (unsigned)off % nnodes;
        int c;
        int nid = -1;

        c = 0;
        do {
                nid = next_node(nid, pol->v.nodes);
                c++;
        } while (c <= target);
        return nid;
}
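
/*
 * Worked example (illustrative): with pol->v.nodes = {1,3,5} the weight is
 * 3, so off = 10 gives target = 10 % 3 = 1; the do/while above then calls
 * next_node() twice (while c is 0 and then 1), landing on the second set
 * node, i.e. node 3.
 */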

/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
                 struct vm_area_struct *vma, unsigned long addr, int shift)
{
        if (vma) {
                unsigned long off;

                off = vma->vm_pgoff;
                off += (addr - vma->vm_start) >> shift;
                return offset_il_node(pol, vma, off);
        } else
                return interleave_nodes(pol);
}

/* Return a zonelist suitable for a huge page allocation. */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(current, vma, addr);

        if (pol->policy == MPOL_INTERLEAVE) {
                unsigned nid;

                nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
                return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
        }
        return zonelist_policy(GFP_HIGHUSER, pol);
}

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
                                          unsigned nid)
{
        struct zonelist *zl;
        struct page *page;

        zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
        page = __alloc_pages(gfp, order, zl);
        if (page && page_zone(page) == zl->zones[0]) {
                zone_pcp(zl->zones[0], get_cpu())->interleave_hit++;
                put_cpu();
        }
        return page;
}

/**
 * alloc_page_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *	%GFP_USER	user allocation.
 *	%GFP_KERNEL	kernel allocations,
 *	%GFP_HIGHMEM	highmem/user allocations,
 *	%GFP_FS		allocation should not call back into a file system.
 *	%GFP_ATOMIC	don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL the caller must hold down_read on the mmap_sem of the
 * mm_struct of the VMA to prevent it from going away. Should be used for
 * all allocations for pages that will be mapped into
 * user space. Returns NULL when no page can be allocated.
 *
 * Should be called with the mmap_sem of the vma held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(current, vma, addr);

        cpuset_update_current_mems_allowed();

        if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
                unsigned nid;

                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
                return alloc_page_interleave(gfp, 0, nid);
        }
        return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}
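
/*
 * Illustrative kernel-side sketch (an assumption, not taken from this file):
 * a fault handler that already holds mmap_sem for read and has the vma and
 * faulting address in hand would typically do
 *
 *	struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */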

/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *	%GFP_USER	user allocation,
 *	%GFP_KERNEL	kernel allocation,
 *	%GFP_HIGHMEM	highmem allocation,
 *	%GFP_FS		don't call back into a file system.
 *	%GFP_ATOMIC	don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool. When not in interrupt
 * context, apply the current process' NUMA policy.
 * Returns NULL when no page can be allocated.
 *
 * Don't call cpuset_update_current_mems_allowed() unless
 * 1) it's ok to take cpuset_sem (can WAIT), and
 * 2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
        struct mempolicy *pol = current->mempolicy;

        if ((gfp & __GFP_WAIT) && !in_interrupt())
                cpuset_update_current_mems_allowed();
        if (!pol || in_interrupt())
                pol = &default_policy;
        if (pol->policy == MPOL_INTERLEAVE)
                return alloc_page_interleave(gfp, order, interleave_nodes(pol));
        return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);

/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

        if (!new)
                return ERR_PTR(-ENOMEM);
        *new = *old;
        atomic_set(&new->refcnt, 1);
        if (new->policy == MPOL_BIND) {
                int sz = ksize(old->v.zonelist);
                new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
                if (!new->v.zonelist) {
                        kmem_cache_free(policy_cache, new);
                        return ERR_PTR(-ENOMEM);
                }
                memcpy(new->v.zonelist, old->v.zonelist, sz);
        }
        return new;
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        if (!a || !b)
                return 0;
        if (a->policy != b->policy)
                return 0;
        switch (a->policy) {
        case MPOL_DEFAULT:
                return 1;
        case MPOL_INTERLEAVE:
                return nodes_equal(a->v.nodes, b->v.nodes);
        case MPOL_PREFERRED:
                return a->v.preferred_node == b->v.preferred_node;
        case MPOL_BIND: {
                int i;
                for (i = 0; a->v.zonelist->zones[i]; i++)
                        if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
                                return 0;
                return b->v.zonelist->zones[i] == NULL;
        }
        default:
                BUG();
                return 0;
        }
}

/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
        if (!atomic_dec_and_test(&p->refcnt))
                return;
        if (p->policy == MPOL_BIND)
                kfree(p->v.zonelist);
        p->policy = MPOL_DEFAULT;
        kmem_cache_free(policy_cache, p);
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */

/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
        struct rb_node *n = sp->root.rb_node;

        while (n) {
                struct sp_node *p = rb_entry(n, struct sp_node, nd);

                if (start >= p->end)
                        n = n->rb_right;
                else if (end <= p->start)
                        n = n->rb_left;
                else
                        break;
        }
        if (!n)
                return NULL;
        for (;;) {
                struct sp_node *w = NULL;
                struct rb_node *prev = rb_prev(n);
                if (!prev)
                        break;
                w = rb_entry(prev, struct sp_node, nd);
                if (w->end <= start)
                        break;
                n = prev;
        }
        return rb_entry(n, struct sp_node, nd);
}

/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
        struct rb_node **p = &sp->root.rb_node;
        struct rb_node *parent = NULL;
        struct sp_node *nd;

        while (*p) {
                parent = *p;
                nd = rb_entry(parent, struct sp_node, nd);
                if (new->start < nd->start)
                        p = &(*p)->rb_left;
                else if (new->end > nd->end)
                        p = &(*p)->rb_right;
                else
                        BUG();
        }
        rb_link_node(&new->nd, parent, p);
        rb_insert_color(&new->nd, &sp->root);
        PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
                 new->policy ? new->policy->policy : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
        struct mempolicy *pol = NULL;
        struct sp_node *sn;

        if (!sp->root.rb_node)
                return NULL;
        spin_lock(&sp->lock);
        sn = sp_lookup(sp, idx, idx+1);
        if (sn) {
                mpol_get(sn->policy);
                pol = sn->policy;
        }
        spin_unlock(&sp->lock);
        return pol;
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
        PDprintk("deleting %lx-%lx\n", n->start, n->end);
        rb_erase(&n->nd, &sp->root);
        mpol_free(n->policy);
        kmem_cache_free(sn_cache, n);
}

struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
        struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

        if (!n)
                return NULL;
        n->start = start;
        n->end = end;
        mpol_get(pol);
        n->policy = pol;
        return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                 unsigned long end, struct sp_node *new)
{
        struct sp_node *n, *new2 = NULL;

restart:
        spin_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        if (n->end <= end)
                                sp_delete(sp, n);
                        else
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                if (!new2) {
                                        spin_unlock(&sp->lock);
                                        new2 = sp_alloc(end, n->end, n->policy);
                                        if (!new2)
                                                return -ENOMEM;
                                        goto restart;
                                }
                                n->end = start;
                                sp_insert(sp, new2);
                                new2 = NULL;
                                break;
                        } else
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        spin_unlock(&sp->lock);
        if (new2) {
                mpol_free(new2->policy);
                kmem_cache_free(sn_cache, new2);
        }
        return 0;
}

int mpol_set_shared_policy(struct shared_policy *info,
                        struct vm_area_struct *vma, struct mempolicy *npol)
{
        int err;
        struct sp_node *new = NULL;
        unsigned long sz = vma_pages(vma);

        PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
                 vma->vm_pgoff,
                 sz, npol ? npol->policy : -1,
                 npol ? nodes_addr(npol->v.nodes)[0] : -1);

        if (npol) {
                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
                if (!new)
                        return -ENOMEM;
        }
        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
        if (err && new)
                kmem_cache_free(sn_cache, new);
        return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
        struct sp_node *n;
        struct rb_node *next;

        if (!p->root.rb_node)
                return;
        spin_lock(&p->lock);
        next = rb_first(&p->root);
        while (next) {
                n = rb_entry(next, struct sp_node, nd);
                next = rb_next(&n->nd);
                rb_erase(&n->nd, &p->root);
                mpol_free(n->policy);
                kmem_cache_free(sn_cache, n);
        }
        spin_unlock(&p->lock);
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
        policy_cache = kmem_cache_create("numa_policy",
                                         sizeof(struct mempolicy),
                                         0, SLAB_PANIC, NULL, NULL);

        sn_cache = kmem_cache_create("shared_policy_node",
                                     sizeof(struct sp_node),
                                     0, SLAB_PANIC, NULL, NULL);

        /* Set interleaving policy for system init. This way not all
           the data structures allocated at system boot end up in node zero. */

        if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
                printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default */
void numa_default_policy(void)
{
        do_set_mempolicy(MPOL_DEFAULT, NULL);
}

/* Migrate a policy to a different set of nodes */
static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
                          const nodemask_t *new)
{
        nodemask_t tmp;

        if (!pol)
                return;

        switch (pol->policy) {
        case MPOL_DEFAULT:
                break;
        case MPOL_INTERLEAVE:
                nodes_remap(tmp, pol->v.nodes, *old, *new);
                pol->v.nodes = tmp;
                current->il_next = node_remap(current->il_next, *old, *new);
                break;
        case MPOL_PREFERRED:
                pol->v.preferred_node = node_remap(pol->v.preferred_node,
                                                   *old, *new);
                break;
        case MPOL_BIND: {
                nodemask_t nodes;
                struct zone **z;
                struct zonelist *zonelist;

                nodes_clear(nodes);
                for (z = pol->v.zonelist->zones; *z; z++)
                        node_set((*z)->zone_pgdat->node_id, nodes);
                nodes_remap(tmp, nodes, *old, *new);
                nodes = tmp;

                zonelist = bind_zonelist(&nodes);

                /* If no mem, then zonelist is NULL and we keep old zonelist.
                 * If that old zonelist has no remaining mems_allowed nodes,
                 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
                 */

                if (zonelist) {
                        /* Good - got mem - substitute new zonelist */
                        kfree(pol->v.zonelist);
                        pol->v.zonelist = zonelist;
                }
                break;
        }
        default:
                BUG();
                break;
        }
}

/*
 * Someone moved this task to different nodes. Fixup mempolicies.
 *
 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
 */
void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
{
        rebind_policy(current->mempolicy, old, new);
}

/*
 * Display pages allocated per node and memory policy via /proc.
 */

static const char *policy_types[] = { "default", "prefer", "bind",
                                      "interleave" };

/*
 * Convert a mempolicy into a string.
 * Returns the number of characters in buffer (if positive)
 * or an error (negative)
 */
static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
        char *p = buffer;
        int l;
        nodemask_t nodes;
        int mode = pol ? pol->policy : MPOL_DEFAULT;

        switch (mode) {
        case MPOL_DEFAULT:
                nodes_clear(nodes);
                break;

        case MPOL_PREFERRED:
                nodes_clear(nodes);
                node_set(pol->v.preferred_node, nodes);
                break;

        case MPOL_BIND:
                get_zonemask(pol, &nodes);
                break;

        case MPOL_INTERLEAVE:
                nodes = pol->v.nodes;
                break;

        default:
                BUG();
                return -EFAULT;
        }

        l = strlen(policy_types[mode]);
        if (buffer + maxlen < p + l + 1)
                return -ENOSPC;

        strcpy(p, policy_types[mode]);
        p += l;

        if (!nodes_empty(nodes)) {
                if (buffer + maxlen < p + 2)
                        return -ENOSPC;
                *p++ = '=';
                p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
        }
        return p - buffer;
}

struct numa_maps {
        unsigned long pages;
        unsigned long anon;
        unsigned long mapped;
        unsigned long mapcount_max;
        unsigned long node[MAX_NUMNODES];
};

static void gather_stats(struct page *page, void *private)
{
        struct numa_maps *md = private;
        int count = page_mapcount(page);

        if (count)
                md->mapped++;

        if (count > md->mapcount_max)
                md->mapcount_max = count;

        md->pages++;

        if (PageAnon(page))
                md->anon++;

        md->node[page_to_nid(page)]++;
        cond_resched();
}

int show_numa_map(struct seq_file *m, void *v)
{
        struct task_struct *task = m->private;
        struct vm_area_struct *vma = v;
        struct numa_maps *md;
        int n;
        char buffer[50];

        if (!vma->vm_mm)
                return 0;

        md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
        if (!md)
                return 0;

        check_pgd_range(vma, vma->vm_start, vma->vm_end,
                        &node_online_map, MPOL_MF_STATS, md);

        if (md->pages) {
                mpol_to_str(buffer, sizeof(buffer),
                            get_vma_policy(task, vma, vma->vm_start));

                seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
                           vma->vm_start, buffer, md->pages,
                           md->mapped, md->mapcount_max);

                if (md->anon)
                        seq_printf(m, " anon=%lu", md->anon);

                for_each_online_node(n)
                        if (md->node[n])
                                seq_printf(m, " N%d=%lu", n, md->node[n]);

                seq_putc(m, '\n');
        }
        kfree(md);

        if (m->count < m->size)
                m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
        return 0;
}
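
/*
 * Example of the /proc/<pid>/numa_maps line show_numa_map() emits for one
 * vma (the values are made up for illustration; the format follows the
 * seq_printf() calls above):
 *
 *	2000000000000000 interleave=0-3 pages=512 mapped=512 maxref=1 anon=512 N0=128 N1=128 N2=128 N3=128
 */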