1/*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2.
7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
19 * for anonymous memory. For process policy a process counter
20 * is used.
21 *
22 * bind Only allocate memory on a specific set of nodes,
23 * no fallback.
24 * FIXME: memory is allocated starting with the first node
25 * to the last. It would be better if bind would truly restrict
26 * the allocation to memory nodes instead
27 *
28 * preferred Try a specific node first before normal fallback.
29 * As a special case node -1 here means do the allocation
30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default
32 * process policy.
33 *
34 * default Allocate on the local node first, or when on a VMA
35 * use the process policy. This is what Linux always did
36 * in a NUMA aware kernel and still does by, ahem, default.
37 *
38 * The process policy is applied for most non-interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
42 *
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that
49 * on systems with highmem, kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
54 */
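/*
 * A minimal userspace sketch (not built as part of this file) of how the
 * policies above are typically requested.  It assumes the libnuma
 * <numaif.h> wrappers for the mbind() and set_mempolicy() system calls;
 * the buffer size and node numbers are arbitrary examples.
 */
#if 0
#include <numaif.h>		/* MPOL_*, mbind(), set_mempolicy() */
#include <sys/mman.h>

static void *node0_buffer(size_t len)
{
	unsigned long nodes;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* Interleave this process' future allocations over nodes 0 and 1. */
	nodes = (1UL << 0) | (1UL << 1);
	set_mempolicy(MPOL_INTERLEAVE, &nodes, sizeof(nodes) * 8);

	/* Bind the new mapping to node 0 only; MPOL_MF_STRICT reports
	 * pages that already ended up elsewhere. */
	nodes = 1UL << 0;
	mbind(p, len, MPOL_BIND, &nodes, sizeof(nodes) * 8, MPOL_MF_STRICT);
	return p;
}
#endif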
55
56/* Notebook:
57 fix mmap readahead to honour policy and enable policy for any page cache
58 object
59 statistics for bigpages
60 global policy for page cache? currently it uses process policy. Requires
61 first item above.
62 handle mremap for shared memory (currently ignored for the policy)
63 grows down?
64 make bind policy root only? It can trigger oom much faster and the
65 kernel is not always graceful about that.
66 could replace all the switch()es with a mempolicy_ops structure.
67*/
68
69#include <linux/mempolicy.h>
70#include <linux/mm.h>
71#include <linux/highmem.h>
72#include <linux/hugetlb.h>
73#include <linux/kernel.h>
74#include <linux/sched.h>
76#include <linux/nodemask.h>
77#include <linux/cpuset.h>
78#include <linux/gfp.h>
79#include <linux/slab.h>
80#include <linux/string.h>
81#include <linux/module.h>
82#include <linux/interrupt.h>
83#include <linux/init.h>
84#include <linux/compat.h>
86#include <linux/swap.h>
87#include <linux/seq_file.h>
88#include <linux/proc_fs.h>
89
90#include <asm/tlbflush.h>
91#include <asm/uaccess.h>
92
93/* Internal flags */
94#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for contiguous vmas */
95#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
96#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
97
98static kmem_cache_t *policy_cache;
99static kmem_cache_t *sn_cache;
100
101#define PDprintk(fmt...)
102
103/* Highest zone. A specific allocation for a zone below that is not
104 policied. */
105int policy_zone = ZONE_DMA;
106
107struct mempolicy default_policy = {
108 .refcnt = ATOMIC_INIT(1), /* never free it */
109 .policy = MPOL_DEFAULT,
110};
111
112/* Do sanity checking on a policy */
113static int mpol_check_policy(int mode, nodemask_t *nodes)
114{
115 int empty = nodes_empty(*nodes);
116
117 switch (mode) {
118 case MPOL_DEFAULT:
119 if (!empty)
120 return -EINVAL;
121 break;
122 case MPOL_BIND:
123 case MPOL_INTERLEAVE:
124 /* Preferred will only use the first bit, but allow
125 more for now. */
126 if (empty)
127 return -EINVAL;
128 break;
129 }
130 return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
131}
132/* Generate a custom zonelist for the BIND policy. */
133static struct zonelist *bind_zonelist(nodemask_t *nodes)
134{
135 struct zonelist *zl;
136 int num, max, nd;
137
138 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
139 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
140 if (!zl)
141 return NULL;
142 num = 0;
143 for_each_node_mask(nd, *nodes)
144 zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
145 zl->zones[num] = NULL;
146 return zl;
147}
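/*
 * Illustrative result: for *nodes = {0,2} and, say, policy_zone == ZONE_NORMAL,
 * the list built above is
 *
 *	zl->zones[0] = &NODE_DATA(0)->node_zones[ZONE_NORMAL]
 *	zl->zones[1] = &NODE_DATA(2)->node_zones[ZONE_NORMAL]
 *	zl->zones[2] = NULL			(terminator)
 *
 * so an MPOL_BIND allocation tries node 0, falls back to node 2 and then fails.
 */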
148
149/* Create a new policy */
150static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
151{
152 struct mempolicy *policy;
153
154 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
155 if (mode == MPOL_DEFAULT)
156 return NULL;
157 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
158 if (!policy)
159 return ERR_PTR(-ENOMEM);
160 atomic_set(&policy->refcnt, 1);
161 switch (mode) {
162 case MPOL_INTERLEAVE:
163 policy->v.nodes = *nodes;
164 if (nodes_weight(*nodes) == 0) {
165 kmem_cache_free(policy_cache, policy);
166 return ERR_PTR(-EINVAL);
167 }
168 break;
169 case MPOL_PREFERRED:
170 policy->v.preferred_node = first_node(*nodes);
171 if (policy->v.preferred_node >= MAX_NUMNODES)
172 policy->v.preferred_node = -1;
173 break;
174 case MPOL_BIND:
175 policy->v.zonelist = bind_zonelist(nodes);
176 if (policy->v.zonelist == NULL) {
177 kmem_cache_free(policy_cache, policy);
178 return ERR_PTR(-ENOMEM);
179 }
180 break;
181 }
182 policy->policy = mode;
183 return policy;
184}
185
186/* Check if we are the only process mapping the page in question */
187static inline int single_mm_mapping(struct mm_struct *mm,
188 struct address_space *mapping)
189{
190 struct vm_area_struct *vma;
191 struct prio_tree_iter iter;
192 int rc = 1;
193
194 spin_lock(&mapping->i_mmap_lock);
195 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
196 if (mm != vma->vm_mm) {
197 rc = 0;
198 goto out;
199 }
200 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
201 if (mm != vma->vm_mm) {
202 rc = 0;
203 goto out;
204 }
205out:
206 spin_unlock(&mapping->i_mmap_lock);
207 return rc;
208}
209
210/*
211 * Add a page to be migrated to the pagelist
212 */
213static void migrate_page_add(struct vm_area_struct *vma,
214 struct page *page, struct list_head *pagelist, unsigned long flags)
215{
216 /*
217 * Avoid migrating a page that is shared by others and not writable.
218 */
219 if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
220 mapping_writably_mapped(page->mapping) ||
221 single_mm_mapping(vma->vm_mm, page->mapping)) {
222 int rc = isolate_lru_page(page);
223
224 if (rc == 1)
225 list_add(&page->lru, pagelist);
226 /*
227 * If the isolate attempt was not successful then we just
228 * encountered an unswappable page. Something must be wrong.
229 */
230 WARN_ON(rc == 0);
231 }
232}
233
234static void gather_stats(struct page *, void *);
235
236/* Scan through pages, checking whether they satisfy the given conditions. */
237static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
238 unsigned long addr, unsigned long end,
239 const nodemask_t *nodes, unsigned long flags,
240 void *private)
241{
242 pte_t *orig_pte;
243 pte_t *pte;
244 spinlock_t *ptl;
245
246 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
247 do {
248 struct page *page;
249 unsigned int nid;
250
251 if (!pte_present(*pte))
252 continue;
253 page = vm_normal_page(vma, addr, *pte);
254 if (!page)
255 continue;
256 nid = page_to_nid(page);
257 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
258 continue;
259
260 if (flags & MPOL_MF_STATS)
261 gather_stats(page, private);
262 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
263 spin_unlock(ptl);
264 migrate_page_add(vma, page, private, flags);
265 spin_lock(ptl);
266 }
267 else
268 break;
269 } while (pte++, addr += PAGE_SIZE, addr != end);
270 pte_unmap_unlock(orig_pte, ptl);
271 return addr != end;
272}
273
274static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
275 unsigned long addr, unsigned long end,
276 const nodemask_t *nodes, unsigned long flags,
277 void *private)
278{
279 pmd_t *pmd;
280 unsigned long next;
281
282 pmd = pmd_offset(pud, addr);
283 do {
284 next = pmd_addr_end(addr, end);
285 if (pmd_none_or_clear_bad(pmd))
286 continue;
287 if (check_pte_range(vma, pmd, addr, next, nodes,
288 flags, private))
289 return -EIO;
290 } while (pmd++, addr = next, addr != end);
291 return 0;
292}
293
294static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
295 unsigned long addr, unsigned long end,
296 const nodemask_t *nodes, unsigned long flags,
297 void *private)
298{
299 pud_t *pud;
300 unsigned long next;
301
302 pud = pud_offset(pgd, addr);
303 do {
304 next = pud_addr_end(addr, end);
305 if (pud_none_or_clear_bad(pud))
306 continue;
307 if (check_pmd_range(vma, pud, addr, next, nodes,
308 flags, private))
309 return -EIO;
310 } while (pud++, addr = next, addr != end);
311 return 0;
312}
313
314static inline int check_pgd_range(struct vm_area_struct *vma,
315 unsigned long addr, unsigned long end,
316 const nodemask_t *nodes, unsigned long flags,
317 void *private)
318{
319 pgd_t *pgd;
320 unsigned long next;
321
322 pgd = pgd_offset(vma->vm_mm, addr);
323 do {
324 next = pgd_addr_end(addr, end);
325 if (pgd_none_or_clear_bad(pgd))
326 continue;
327 if (check_pud_range(vma, pgd, addr, next, nodes,
328 flags, private))
329 return -EIO;
330 } while (pgd++, addr = next, addr != end);
331 return 0;
332}
333
334/* Check if a vma is migratable */
335static inline int vma_migratable(struct vm_area_struct *vma)
336{
337 if (vma->vm_flags & (
338 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
339 return 0;
340 return 1;
341}
342
343/*
344 * Check if all pages in a range are on a set of nodes.
345 * If pagelist != NULL then isolate pages from the LRU and
346 * put them on the pagelist.
347 */
348static struct vm_area_struct *
349check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
350 const nodemask_t *nodes, unsigned long flags, void *private)
351{
352 int err;
353 struct vm_area_struct *first, *vma, *prev;
354
355 first = find_vma(mm, start);
356 if (!first)
357 return ERR_PTR(-EFAULT);
358 prev = NULL;
359 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
360 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
361 if (!vma->vm_next && vma->vm_end < end)
362 return ERR_PTR(-EFAULT);
363 if (prev && prev->vm_end < vma->vm_start)
364 return ERR_PTR(-EFAULT);
365 }
366 if (!is_vm_hugetlb_page(vma) &&
367 ((flags & MPOL_MF_STRICT) ||
368 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
369 vma_migratable(vma)))) {
370 unsigned long endvma = vma->vm_end;
371
372 if (endvma > end)
373 endvma = end;
374 if (vma->vm_start > start)
375 start = vma->vm_start;
376 err = check_pgd_range(vma, start, endvma, nodes,
377 flags, private);
378 if (err) {
379 first = ERR_PTR(err);
380 break;
381 }
382 }
383 prev = vma;
384 }
385 return first;
386}
387
388/* Apply policy to a single VMA */
389static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
390{
391 int err = 0;
392 struct mempolicy *old = vma->vm_policy;
393
394 PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
395 vma->vm_start, vma->vm_end, vma->vm_pgoff,
396 vma->vm_ops, vma->vm_file,
397 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
398
399 if (vma->vm_ops && vma->vm_ops->set_policy)
400 err = vma->vm_ops->set_policy(vma, new);
401 if (!err) {
402 mpol_get(new);
403 vma->vm_policy = new;
404 mpol_free(old);
405 }
406 return err;
407}
408
409/* Step 2: apply policy to a range and do splits. */
410static int mbind_range(struct vm_area_struct *vma, unsigned long start,
411 unsigned long end, struct mempolicy *new)
412{
413 struct vm_area_struct *next;
414 int err;
415
416 err = 0;
417 for (; vma && vma->vm_start < end; vma = next) {
418 next = vma->vm_next;
419 if (vma->vm_start < start)
420 err = split_vma(vma->vm_mm, vma, start, 1);
421 if (!err && vma->vm_end > end)
422 err = split_vma(vma->vm_mm, vma, end, 0);
423 if (!err)
424 err = policy_vma(vma, new);
425 if (err)
426 break;
427 }
428 return err;
429}
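/*
 * Illustrative walk-through of the splitting above: for one VMA covering
 * [0x1000, 0x9000) and an mbind() range of [0x3000, 0x6000), the first
 * split_vma() cuts at 0x3000 (vma keeps the upper part), the second cuts
 * at 0x6000 (the tail becomes a new VMA), and policy_vma() then installs
 * the new policy only on the middle piece [0x3000, 0x6000).
 */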
430
431static int contextualize_policy(int mode, nodemask_t *nodes)
432{
433 if (!nodes)
434 return 0;
435
436 /* Update current mems_allowed */
437 cpuset_update_current_mems_allowed();
438 /* Ignore nodes not set in current->mems_allowed */
439 cpuset_restrict_to_mems_allowed(nodes->bits);
440 return mpol_check_policy(mode, nodes);
441}
442
443static int swap_pages(struct list_head *pagelist)
444{
445 LIST_HEAD(moved);
446 LIST_HEAD(failed);
447 int n;
448
449 n = migrate_pages(pagelist, NULL, &moved, &failed);
450 putback_lru_pages(&failed);
451 putback_lru_pages(&moved);
452
453 return n;
454}
455
456long do_mbind(unsigned long start, unsigned long len,
457 unsigned long mode, nodemask_t *nmask, unsigned long flags)
458{
459 struct vm_area_struct *vma;
460 struct mm_struct *mm = current->mm;
461 struct mempolicy *new;
462 unsigned long end;
463 int err;
464 LIST_HEAD(pagelist);
465
466 if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
467 MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
468 || mode > MPOL_MAX)
469 return -EINVAL;
470 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
471 return -EPERM;
472
473 if (start & ~PAGE_MASK)
474 return -EINVAL;
475
476 if (mode == MPOL_DEFAULT)
477 flags &= ~MPOL_MF_STRICT;
478
479 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
480 end = start + len;
481
482 if (end < start)
483 return -EINVAL;
484 if (end == start)
485 return 0;
486
487 if (mpol_check_policy(mode, nmask))
488 return -EINVAL;
489
490 new = mpol_new(mode, nmask);
491 if (IS_ERR(new))
492 return PTR_ERR(new);
493
494 /*
495 * If we are using the default policy then operation
496 * on discontinuous address spaces is okay after all
497 */
498 if (!new)
499 flags |= MPOL_MF_DISCONTIG_OK;
500
501 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
502 mode, nodes_addr(*nmask)[0]);
503
504 down_write(&mm->mmap_sem);
505 vma = check_range(mm, start, end, nmask,
506 flags | MPOL_MF_INVERT, &pagelist);
507
508 err = PTR_ERR(vma);
509 if (!IS_ERR(vma)) {
510 int nr_failed = 0;
511
512 err = mbind_range(vma, start, end, new);
513 if (!list_empty(&pagelist))
514 nr_failed = swap_pages(&pagelist);
515
516 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
517 err = -EIO;
518 }
519 if (!list_empty(&pagelist))
520 putback_lru_pages(&pagelist);
521
522 up_write(&mm->mmap_sem);
523 mpol_free(new);
524 return err;
525}
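/*
 * Illustrative flow for mbind(addr, len, MPOL_BIND, {1}, ...,
 * MPOL_MF_MOVE | MPOL_MF_STRICT): check_range() runs with MPOL_MF_INVERT,
 * so every mapped page that is *not* on node 1 is isolated onto
 * 'pagelist'; mbind_range() installs the new policy on the VMAs;
 * swap_pages() then pushes the misplaced pages out so that they are
 * faulted back in under the new policy.  With MPOL_MF_STRICT set, any
 * page that could not be moved turns the call into -EIO.
 */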
526
527/* Set the process memory policy */
528long do_set_mempolicy(int mode, nodemask_t *nodes)
529{
530 struct mempolicy *new;
531
532 if (contextualize_policy(mode, nodes))
533 return -EINVAL;
534 new = mpol_new(mode, nodes);
535 if (IS_ERR(new))
536 return PTR_ERR(new);
537 mpol_free(current->mempolicy);
538 current->mempolicy = new;
539 if (new && new->policy == MPOL_INTERLEAVE)
540 current->il_next = first_node(new->v.nodes);
541 return 0;
542}
543
544/* Fill a node mask with the nodes used by a policy */
545static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
546{
547 int i;
548
549 nodes_clear(*nodes);
550 switch (p->policy) {
551 case MPOL_BIND:
552 for (i = 0; p->v.zonelist->zones[i]; i++)
553 node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
554 *nodes);
555 break;
556 case MPOL_DEFAULT:
557 break;
558 case MPOL_INTERLEAVE:
559 *nodes = p->v.nodes;
560 break;
561 case MPOL_PREFERRED:
562 /* or use current node instead of online map? */
563 if (p->v.preferred_node < 0)
564 *nodes = node_online_map;
565 else
566 node_set(p->v.preferred_node, *nodes);
567 break;
568 default:
569 BUG();
570 }
571}
572
573static int lookup_node(struct mm_struct *mm, unsigned long addr)
574{
575 struct page *p;
576 int err;
577
578 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
579 if (err >= 0) {
580 err = page_to_nid(p);
581 put_page(p);
582 }
583 return err;
584}
585
586/* Retrieve NUMA policy */
587long do_get_mempolicy(int *policy, nodemask_t *nmask,
588 unsigned long addr, unsigned long flags)
589{
590 int err;
591 struct mm_struct *mm = current->mm;
592 struct vm_area_struct *vma = NULL;
593 struct mempolicy *pol = current->mempolicy;
594
595 cpuset_update_current_mems_allowed();
596 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
597 return -EINVAL;
598 if (flags & MPOL_F_ADDR) {
599 down_read(&mm->mmap_sem);
600 vma = find_vma_intersection(mm, addr, addr+1);
601 if (!vma) {
602 up_read(&mm->mmap_sem);
603 return -EFAULT;
604 }
605 if (vma->vm_ops && vma->vm_ops->get_policy)
606 pol = vma->vm_ops->get_policy(vma, addr);
607 else
608 pol = vma->vm_policy;
609 } else if (addr)
610 return -EINVAL;
611
612 if (!pol)
613 pol = &default_policy;
614
615 if (flags & MPOL_F_NODE) {
616 if (flags & MPOL_F_ADDR) {
617 err = lookup_node(mm, addr);
618 if (err < 0)
619 goto out;
620 *policy = err;
621 } else if (pol == current->mempolicy &&
622 pol->policy == MPOL_INTERLEAVE) {
623 *policy = current->il_next;
624 } else {
625 err = -EINVAL;
626 goto out;
627 }
628 } else
629 *policy = pol->policy;
630
631 if (vma) {
632 up_read(&current->mm->mmap_sem);
633 vma = NULL;
634 }
635
636 err = 0;
637 if (nmask)
638 get_zonemask(pol, nmask);
639
640 out:
641 if (vma)
642 up_read(&current->mm->mmap_sem);
643 return err;
644}
645
646/*
647 * For now migrate_pages simply swaps out the pages from nodes that are in
648 * the source set but not in the target set. In the future, we would
649 * want a function that moves pages between the two nodesets in such
650 * a way as to preserve the physical layout as much as possible.
651 *
652 * Returns the number of pages that could not be moved.
653 */
654int do_migrate_pages(struct mm_struct *mm,
655 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
656{
657 LIST_HEAD(pagelist);
658 int count = 0;
659 nodemask_t nodes;
660
661 nodes_andnot(nodes, *from_nodes, *to_nodes);
662
663 down_read(&mm->mmap_sem);
664 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
665 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
666
667 if (!list_empty(&pagelist)) {
668 count = swap_pages(&pagelist);
669 putback_lru_pages(&pagelist);
670 }
671
672 up_read(&mm->mmap_sem);
673 return count;
674}
675
676/*
677 * User space interface with variable sized bitmaps for nodelists.
678 */
679
680/* Copy a node mask from user space. */
681static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
682 unsigned long maxnode)
683{
684 unsigned long k;
685 unsigned long nlongs;
686 unsigned long endmask;
687
688 --maxnode;
689 nodes_clear(*nodes);
690 if (maxnode == 0 || !nmask)
691 return 0;
692
693 nlongs = BITS_TO_LONGS(maxnode);
694 if ((maxnode % BITS_PER_LONG) == 0)
695 endmask = ~0UL;
696 else
697 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
698
699 /* When the user specified more nodes than supported, just check
700 that the unsupported part is all zero. */
701 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
702 if (nlongs > PAGE_SIZE/sizeof(long))
703 return -EINVAL;
704 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
705 unsigned long t;
706 if (get_user(t, nmask + k))
707 return -EFAULT;
708 if (k == nlongs - 1) {
709 if (t & endmask)
710 return -EINVAL;
711 } else if (t)
712 return -EINVAL;
713 }
714 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
715 endmask = ~0UL;
716 }
717
718 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
719 return -EFAULT;
720 nodes_addr(*nodes)[nlongs-1] &= endmask;
721 return 0;
722}
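/*
 * Worked example: a caller passing maxnode = 70 on a 64-bit kernel with
 * MAX_NUMNODES = 64 yields (after --maxnode) 69 significant bits, so
 * nlongs = 2 and endmask = (1UL << 5) - 1.  Because nlongs exceeds
 * BITS_TO_LONGS(MAX_NUMNODES) = 1, the second user long is only checked
 * to be zero under that 5-bit endmask, and nlongs/endmask are then
 * clamped back to the single supported long before the copy.
 */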
723
724/* Copy a kernel node mask to user space */
725static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
726 nodemask_t *nodes)
727{
728 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
729 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
730
731 if (copy > nbytes) {
732 if (copy > PAGE_SIZE)
733 return -EINVAL;
734 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
735 return -EFAULT;
736 copy = nbytes;
737 }
738 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
739}
740
741asmlinkage long sys_mbind(unsigned long start, unsigned long len,
742 unsigned long mode,
743 unsigned long __user *nmask, unsigned long maxnode,
744 unsigned flags)
745{
746 nodemask_t nodes;
747 int err;
748
749 err = get_nodes(&nodes, nmask, maxnode);
750 if (err)
751 return err;
752 return do_mbind(start, len, mode, &nodes, flags);
753}
754
755/* Set the process memory policy */
756asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
757 unsigned long maxnode)
758{
759 int err;
760 nodemask_t nodes;
761
762 if (mode < 0 || mode > MPOL_MAX)
763 return -EINVAL;
764 err = get_nodes(&nodes, nmask, maxnode);
765 if (err)
766 return err;
767 return do_set_mempolicy(mode, &nodes);
768}
769
770/* Macro needed until Paul implements this function in kernel/cpusets.c */
771#define cpuset_mems_allowed(task) node_online_map
772
773asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
774 const unsigned long __user *old_nodes,
775 const unsigned long __user *new_nodes)
776{
777 struct mm_struct *mm;
778 struct task_struct *task;
779 nodemask_t old;
780 nodemask_t new;
781 nodemask_t task_nodes;
782 int err;
783
784 err = get_nodes(&old, old_nodes, maxnode);
785 if (err)
786 return err;
787
788 err = get_nodes(&new, new_nodes, maxnode);
789 if (err)
790 return err;
791
792 /* Find the mm_struct */
793 read_lock(&tasklist_lock);
794 task = pid ? find_task_by_pid(pid) : current;
795 if (!task) {
796 read_unlock(&tasklist_lock);
797 return -ESRCH;
798 }
799 mm = get_task_mm(task);
800 read_unlock(&tasklist_lock);
801
802 if (!mm)
803 return -EINVAL;
804
805 /*
806 * Check if this process has the right to modify the specified
807 * process. The right exists if the process has administrative
808 * capabilities, superuser privileges or the same
809 * userid as the target process.
810 */
811 if ((current->euid != task->suid) && (current->euid != task->uid) &&
812 (current->uid != task->suid) && (current->uid != task->uid) &&
813 !capable(CAP_SYS_ADMIN)) {
814 err = -EPERM;
815 goto out;
816 }
817
818 task_nodes = cpuset_mems_allowed(task);
819 /* Is the user allowed to access the target nodes? */
820 if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
821 err = -EPERM;
822 goto out;
823 }
824
825 err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
826out:
827 mmput(mm);
828 return err;
829}
830
831
832/* Retrieve NUMA policy */
833asmlinkage long sys_get_mempolicy(int __user *policy,
834 unsigned long __user *nmask,
835 unsigned long maxnode,
836 unsigned long addr, unsigned long flags)
837{
838 int err, pval;
839 nodemask_t nodes;
840
841 if (nmask != NULL && maxnode < MAX_NUMNODES)
842 return -EINVAL;
843
844 err = do_get_mempolicy(&pval, &nodes, addr, flags);
845
846 if (err)
847 return err;
848
849 if (policy && put_user(pval, policy))
850 return -EFAULT;
851
852 if (nmask)
853 err = copy_nodes_to_user(nmask, maxnode, &nodes);
854
855 return err;
856}
857
858#ifdef CONFIG_COMPAT
859
860asmlinkage long compat_sys_get_mempolicy(int __user *policy,
861 compat_ulong_t __user *nmask,
862 compat_ulong_t maxnode,
863 compat_ulong_t addr, compat_ulong_t flags)
864{
865 long err;
866 unsigned long __user *nm = NULL;
867 unsigned long nr_bits, alloc_size;
868 DECLARE_BITMAP(bm, MAX_NUMNODES);
869
870 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
871 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
872
873 if (nmask)
874 nm = compat_alloc_user_space(alloc_size);
875
876 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
877
878 if (!err && nmask) {
879 err = copy_from_user(bm, nm, alloc_size);
880 /* ensure entire bitmap is zeroed */
881 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
882 err |= compat_put_bitmap(nmask, bm, nr_bits);
883 }
884
885 return err;
886}
887
888asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
889 compat_ulong_t maxnode)
890{
891 long err = 0;
892 unsigned long __user *nm = NULL;
893 unsigned long nr_bits, alloc_size;
894 DECLARE_BITMAP(bm, MAX_NUMNODES);
895
896 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
897 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
898
899 if (nmask) {
900 err = compat_get_bitmap(bm, nmask, nr_bits);
901 nm = compat_alloc_user_space(alloc_size);
902 err |= copy_to_user(nm, bm, alloc_size);
903 }
904
905 if (err)
906 return -EFAULT;
907
908 return sys_set_mempolicy(mode, nm, nr_bits+1);
909}
910
911asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
912 compat_ulong_t mode, compat_ulong_t __user *nmask,
913 compat_ulong_t maxnode, compat_ulong_t flags)
914{
915 long err = 0;
916 unsigned long __user *nm = NULL;
917 unsigned long nr_bits, alloc_size;
918 nodemask_t bm;
919
920 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
921 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
922
923 if (nmask) {
924 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
925 nm = compat_alloc_user_space(alloc_size);
926 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
927 }
928
929 if (err)
930 return -EFAULT;
931
932 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
933}
934
935#endif
936
937/* Return effective policy for a VMA */
938struct mempolicy *
939get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
940{
941 struct mempolicy *pol = task->mempolicy;
942
943 if (vma) {
944 if (vma->vm_ops && vma->vm_ops->get_policy)
945 pol = vma->vm_ops->get_policy(vma, addr);
946 else if (vma->vm_policy &&
947 vma->vm_policy->policy != MPOL_DEFAULT)
948 pol = vma->vm_policy;
949 }
950 if (!pol)
951 pol = &default_policy;
952 return pol;
953}
954
955/* Return a zonelist representing a mempolicy */
956static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
957{
958 int nd;
959
960 switch (policy->policy) {
961 case MPOL_PREFERRED:
962 nd = policy->v.preferred_node;
963 if (nd < 0)
964 nd = numa_node_id();
965 break;
966 case MPOL_BIND:
967 /* Lower zones don't get a policy applied */
968 /* Careful: current->mems_allowed might have moved */
969 if (gfp_zone(gfp) >= policy_zone)
970 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
971 return policy->v.zonelist;
972 /*FALL THROUGH*/
973 case MPOL_INTERLEAVE: /* should not happen */
974 case MPOL_DEFAULT:
975 nd = numa_node_id();
976 break;
977 default:
978 nd = 0;
979 BUG();
980 }
981 return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
982}
983
984/* Do dynamic interleaving for a process */
985static unsigned interleave_nodes(struct mempolicy *policy)
986{
987 unsigned nid, next;
988 struct task_struct *me = current;
989
990 nid = me->il_next;
991 next = next_node(nid, policy->v.nodes);
992 if (next >= MAX_NUMNODES)
993 next = first_node(policy->v.nodes);
994 me->il_next = next;
995 return nid;
996}
997
998/* Do static interleaving for a VMA with known offset. */
999static unsigned offset_il_node(struct mempolicy *pol,
1000 struct vm_area_struct *vma, unsigned long off)
1001{
1002 unsigned nnodes = nodes_weight(pol->v.nodes);
1003 unsigned target = (unsigned)off % nnodes;
1004 int c;
1005 int nid = -1;
1006
1007 c = 0;
1008 do {
1009 nid = next_node(nid, pol->v.nodes);
1010 c++;
1011 } while (c <= target);
1012 return nid;
1013}
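/*
 * Worked example: with pol->v.nodes = {0,2,5} (nnodes = 3) and off = 7,
 * target = 7 % 3 = 1, so the loop above steps past node 0 and stops at
 * node 2: page offset 7 of the mapping is interleaved onto node 2.
 */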
1014
1015/* Determine a node number for interleave */
1016static inline unsigned interleave_nid(struct mempolicy *pol,
1017 struct vm_area_struct *vma, unsigned long addr, int shift)
1018{
1019 if (vma) {
1020 unsigned long off;
1021
1022 off = vma->vm_pgoff;
1023 off += (addr - vma->vm_start) >> shift;
1024 return offset_il_node(pol, vma, off);
1025 } else
1026 return interleave_nodes(pol);
1027}
1028
1029/* Return a zonelist suitable for a huge page allocation. */
1030struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1031{
1032 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1033
1034 if (pol->policy == MPOL_INTERLEAVE) {
1035 unsigned nid;
1036
1037 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1038 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1039 }
1040 return zonelist_policy(GFP_HIGHUSER, pol);
1041}
1042
1043/* Allocate a page in interleaved policy.
1044 Own path because it needs to do special accounting. */
1045static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1046 unsigned nid)
1047{
1048 struct zonelist *zl;
1049 struct page *page;
1050
1051 zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1052 page = __alloc_pages(gfp, order, zl);
1053 if (page && page_zone(page) == zl->zones[0]) {
1054 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1055 put_cpu();
1056 }
1057 return page;
1058}
1059
1060/**
1061 * alloc_page_vma - Allocate a page for a VMA.
1062 *
1063 * @gfp:
1064 * %GFP_USER user allocation.
1065 * %GFP_KERNEL kernel allocations,
1066 * %GFP_HIGHMEM highmem/user allocations,
1067 * %GFP_FS allocation should not call back into a file system.
1068 * %GFP_ATOMIC don't sleep.
1069 *
1070 * @vma: Pointer to VMA or NULL if not available.
1071 * @addr: Virtual Address of the allocation. Must be inside the VMA.
1072 *
1073 * This function allocates a page from the kernel page pool and applies
1074 * a NUMA policy associated with the VMA or the current process.
1075 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
1076 * mm_struct of the VMA to prevent it from going away. Should be used for
1077 * all allocations for pages that will be mapped into
1078 * user space. Returns NULL when no page can be allocated.
1079 *
1080 * Should be called with the mmap_sem of the vma held.
1081 */
1082struct page *
1083alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1084{
1085 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1086
1087 cpuset_update_current_mems_allowed();
1088
1089 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1090 unsigned nid;
1091
1092 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1093 return alloc_page_interleave(gfp, 0, nid);
1094 }
1095 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1096}
1097
1098/**
1099 * alloc_pages_current - Allocate pages.
1100 *
1101 * @gfp:
1102 * %GFP_USER user allocation,
1103 * %GFP_KERNEL kernel allocation,
1104 * %GFP_HIGHMEM highmem allocation,
1105 * %GFP_FS don't call back into a file system.
1106 * %GFP_ATOMIC don't sleep.
1107 * @order: Power of two of allocation size in pages. 0 is a single page.
1108 *
1109 * Allocate a page from the kernel page pool. When not in
1110 * interrupt context, apply the current process' NUMA policy.
1111 * Returns NULL when no page can be allocated.
1112 *
1113 * Don't call cpuset_update_current_mems_allowed() unless
1114 * 1) it's ok to take cpuset_sem (can WAIT), and
1115 * 2) allocating for current task (not interrupt).
1116 */
1117struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1118{
1119 struct mempolicy *pol = current->mempolicy;
1120
1121 if ((gfp & __GFP_WAIT) && !in_interrupt())
1122 cpuset_update_current_mems_allowed();
1123 if (!pol || in_interrupt())
1124 pol = &default_policy;
1125 if (pol->policy == MPOL_INTERLEAVE)
1126 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1127 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1128}
1129EXPORT_SYMBOL(alloc_pages_current);
1130
1131/* Slow path of a mempolicy copy */
1132struct mempolicy *__mpol_copy(struct mempolicy *old)
1133{
1134 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1135
1136 if (!new)
1137 return ERR_PTR(-ENOMEM);
1138 *new = *old;
1139 atomic_set(&new->refcnt, 1);
1140 if (new->policy == MPOL_BIND) {
1141 int sz = ksize(old->v.zonelist);
1142 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1143 if (!new->v.zonelist) {
1144 kmem_cache_free(policy_cache, new);
1145 return ERR_PTR(-ENOMEM);
1146 }
1147 memcpy(new->v.zonelist, old->v.zonelist, sz);
1148 }
1149 return new;
1150}
1151
1152/* Slow path of a mempolicy comparison */
1153int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1154{
1155 if (!a || !b)
1156 return 0;
1157 if (a->policy != b->policy)
1158 return 0;
1159 switch (a->policy) {
1160 case MPOL_DEFAULT:
1161 return 1;
1162 case MPOL_INTERLEAVE:
1163 return nodes_equal(a->v.nodes, b->v.nodes);
1164 case MPOL_PREFERRED:
1165 return a->v.preferred_node == b->v.preferred_node;
1166 case MPOL_BIND: {
1167 int i;
1168 for (i = 0; a->v.zonelist->zones[i]; i++)
1169 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1170 return 0;
1171 return b->v.zonelist->zones[i] == NULL;
1172 }
1173 default:
1174 BUG();
1175 return 0;
1176 }
1177}
1178
1179/* Slow path of a mpol destructor. */
1180void __mpol_free(struct mempolicy *p)
1181{
1182 if (!atomic_dec_and_test(&p->refcnt))
1183 return;
1184 if (p->policy == MPOL_BIND)
1185 kfree(p->v.zonelist);
1186 p->policy = MPOL_DEFAULT;
1187 kmem_cache_free(policy_cache, p);
1188}
1189
1190/*
1191 * Shared memory backing store policy support.
1192 *
1193 * Remember policies even when nobody has shared memory mapped.
1194 * The policies are kept in a Red-Black tree linked from the inode.
1195 * They are protected by the sp->lock spinlock, which should be held
1196 * for any accesses to the tree.
1197 */
1198
1199/* lookup first element intersecting start-end */
1200/* Caller holds sp->lock */
1201static struct sp_node *
1202sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1203{
1204 struct rb_node *n = sp->root.rb_node;
1205
1206 while (n) {
1207 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1208
1209 if (start >= p->end)
1210 n = n->rb_right;
1211 else if (end <= p->start)
1212 n = n->rb_left;
1213 else
1214 break;
1215 }
1216 if (!n)
1217 return NULL;
1218 for (;;) {
1219 struct sp_node *w = NULL;
1220 struct rb_node *prev = rb_prev(n);
1221 if (!prev)
1222 break;
1223 w = rb_entry(prev, struct sp_node, nd);
1224 if (w->end <= start)
1225 break;
1226 n = prev;
1227 }
1228 return rb_entry(n, struct sp_node, nd);
1229}
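/*
 * Illustrative lookup: with stored ranges [0,4), [4,8), [8,12) and a
 * query of [3,9), the descent above may stop on [4,8); the rb_prev()
 * walk then rewinds to [0,4) because its end (4) is still greater than
 * the query start (3), so the first intersecting node is returned.
 */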
1230
1231/* Insert a new shared policy into the list. */
1232/* Caller holds sp->lock */
1233static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1234{
1235 struct rb_node **p = &sp->root.rb_node;
1236 struct rb_node *parent = NULL;
1237 struct sp_node *nd;
1238
1239 while (*p) {
1240 parent = *p;
1241 nd = rb_entry(parent, struct sp_node, nd);
1242 if (new->start < nd->start)
1243 p = &(*p)->rb_left;
1244 else if (new->end > nd->end)
1245 p = &(*p)->rb_right;
1246 else
1247 BUG();
1248 }
1249 rb_link_node(&new->nd, parent, p);
1250 rb_insert_color(&new->nd, &sp->root);
1251 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1252 new->policy ? new->policy->policy : 0);
1253}
1254
1255/* Find shared policy intersecting idx */
1256struct mempolicy *
1257mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1258{
1259 struct mempolicy *pol = NULL;
1260 struct sp_node *sn;
1261
1262 if (!sp->root.rb_node)
1263 return NULL;
1264 spin_lock(&sp->lock);
1265 sn = sp_lookup(sp, idx, idx+1);
1266 if (sn) {
1267 mpol_get(sn->policy);
1268 pol = sn->policy;
1269 }
1270 spin_unlock(&sp->lock);
1271 return pol;
1272}
1273
1274static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1275{
1276 PDprintk("deleting %lx-%lx\n", n->start, n->end);
1277 rb_erase(&n->nd, &sp->root);
1278 mpol_free(n->policy);
1279 kmem_cache_free(sn_cache, n);
1280}
1281
1282struct sp_node *
1283sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1284{
1285 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1286
1287 if (!n)
1288 return NULL;
1289 n->start = start;
1290 n->end = end;
1291 mpol_get(pol);
1292 n->policy = pol;
1293 return n;
1294}
1295
1296/* Replace a policy range. */
1297static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1298 unsigned long end, struct sp_node *new)
1299{
1300 struct sp_node *n, *new2 = NULL;
1301
1302restart:
1303 spin_lock(&sp->lock);
1304 n = sp_lookup(sp, start, end);
1305 /* Take care of old policies in the same range. */
1306 while (n && n->start < end) {
1307 struct rb_node *next = rb_next(&n->nd);
1308 if (n->start >= start) {
1309 if (n->end <= end)
1310 sp_delete(sp, n);
1311 else
1312 n->start = end;
1313 } else {
1314 /* Old policy spanning whole new range. */
1315 if (n->end > end) {
1316 if (!new2) {
1317 spin_unlock(&sp->lock);
1318 new2 = sp_alloc(end, n->end, n->policy);
1319 if (!new2)
1320 return -ENOMEM;
1321 goto restart;
1322 }
1323 n->end = start;
1324 sp_insert(sp, new2);
1325 new2 = NULL;
1326 break;
1327 } else
1328 n->end = start;
1329 }
1330 if (!next)
1331 break;
1332 n = rb_entry(next, struct sp_node, nd);
1333 }
1334 if (new)
1335 sp_insert(sp, new);
1336 spin_unlock(&sp->lock);
1337 if (new2) {
1338 mpol_free(new2->policy);
1339 kmem_cache_free(sn_cache, new2);
1340 }
1341 return 0;
1342}
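/*
 * Illustrative replacement: if the tree holds a single node spanning
 * [0,10) and a new policy is installed for [3,6), the old node hits the
 * "spanning" case above: new2 = [6,10) carrying the old policy is
 * allocated (dropping and retaking sp->lock), the old node is trimmed
 * to [0,3), new2 is inserted, and finally the new [3,6) node is linked in.
 */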
1343
1344int mpol_set_shared_policy(struct shared_policy *info,
1345 struct vm_area_struct *vma, struct mempolicy *npol)
1346{
1347 int err;
1348 struct sp_node *new = NULL;
1349 unsigned long sz = vma_pages(vma);
1350
1351 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1352 vma->vm_pgoff,
1353 sz, npol? npol->policy : -1,
1354 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1355
1356 if (npol) {
1357 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1358 if (!new)
1359 return -ENOMEM;
1360 }
1361 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1362 if (err && new)
1363 kmem_cache_free(sn_cache, new);
1364 return err;
1365}
1366
1367/* Free a backing policy store on inode delete. */
1368void mpol_free_shared_policy(struct shared_policy *p)
1369{
1370 struct sp_node *n;
1371 struct rb_node *next;
1372
1373 if (!p->root.rb_node)
1374 return;
1375 spin_lock(&p->lock);
1376 next = rb_first(&p->root);
1377 while (next) {
1378 n = rb_entry(next, struct sp_node, nd);
1379 next = rb_next(&n->nd);
1380 rb_erase(&n->nd, &p->root);
1381 mpol_free(n->policy);
1382 kmem_cache_free(sn_cache, n);
1383 }
1384 spin_unlock(&p->lock);
1385}
1386
1387/* assumes fs == KERNEL_DS */
1388void __init numa_policy_init(void)
1389{
1390 policy_cache = kmem_cache_create("numa_policy",
1391 sizeof(struct mempolicy),
1392 0, SLAB_PANIC, NULL, NULL);
1393
1394 sn_cache = kmem_cache_create("shared_policy_node",
1395 sizeof(struct sp_node),
1396 0, SLAB_PANIC, NULL, NULL);
1397
1398 /* Set interleaving policy for system init. This way not all
1399 the data structures allocated at system boot end up in node zero. */
1400
1401 if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1402 printk("numa_policy_init: interleaving failed\n");
1403}
1404
1405/* Reset policy of current process to default */
1406void numa_default_policy(void)
1407{
1408 do_set_mempolicy(MPOL_DEFAULT, NULL);
1409}
1410
1411/* Migrate a policy to a different set of nodes */
1412static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1413 const nodemask_t *new)
1414{
1415 nodemask_t tmp;
1416
1417 if (!pol)
1418 return;
1419
1420 switch (pol->policy) {
1421 case MPOL_DEFAULT:
1422 break;
1423 case MPOL_INTERLEAVE:
1424 nodes_remap(tmp, pol->v.nodes, *old, *new);
1425 pol->v.nodes = tmp;
1426 current->il_next = node_remap(current->il_next, *old, *new);
1427 break;
1428 case MPOL_PREFERRED:
1429 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1430 *old, *new);
1431 break;
1432 case MPOL_BIND: {
1433 nodemask_t nodes;
1434 struct zone **z;
1435 struct zonelist *zonelist;
1436
1437 nodes_clear(nodes);
1438 for (z = pol->v.zonelist->zones; *z; z++)
1439 node_set((*z)->zone_pgdat->node_id, nodes);
1440 nodes_remap(tmp, nodes, *old, *new);
1441 nodes = tmp;
1442
1443 zonelist = bind_zonelist(&nodes);
1444
1445 /* If no mem, then zonelist is NULL and we keep old zonelist.
1446 * If that old zonelist has no remaining mems_allowed nodes,
1447 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1448 */
1449
1450 if (zonelist) {
1451 /* Good - got mem - substitute new zonelist */
1452 kfree(pol->v.zonelist);
1453 pol->v.zonelist = zonelist;
1454 }
1455 break;
1456 }
1457 default:
1458 BUG();
1459 break;
1460 }
1461}
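/*
 * Illustrative rebind: a task with an MPOL_INTERLEAVE policy over nodes
 * {0,1} whose allowed set moves from {0,1} to {2,3} gets its mask
 * remapped to {2,3}, and il_next is remapped the same way so that the
 * interleave pointer stays inside the new mask.
 */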
1462
1463/*
1464 * Someone moved this task to different nodes. Fixup mempolicies.
1465 *
1466 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
1467 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
1468 */
1469void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
1470{
1471 rebind_policy(current->mempolicy, old, new);
1472}
1473
1474/*
1475 * Display pages allocated per node and memory policy via /proc.
1476 */
1477
1478static const char *policy_types[] = { "default", "prefer", "bind",
1479 "interleave" };
1480
1481/*
1482 * Convert a mempolicy into a string.
1483 * Returns the number of characters in buffer (if positive)
1484 * or an error (negative)
1485 */
1486static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1487{
1488 char *p = buffer;
1489 int l;
1490 nodemask_t nodes;
1491 int mode = pol ? pol->policy : MPOL_DEFAULT;
1492
1493 switch (mode) {
1494 case MPOL_DEFAULT:
1495 nodes_clear(nodes);
1496 break;
1497
1498 case MPOL_PREFERRED:
1499 nodes_clear(nodes);
1500 node_set(pol->v.preferred_node, nodes);
1501 break;
1502
1503 case MPOL_BIND:
1504 get_zonemask(pol, &nodes);
1505 break;
1506
1507 case MPOL_INTERLEAVE:
1508 nodes = pol->v.nodes;
1509 break;
1510
1511 default:
1512 BUG();
1513 return -EFAULT;
1514 }
1515
1516 l = strlen(policy_types[mode]);
1517 if (buffer + maxlen < p + l + 1)
1518 return -ENOSPC;
1519
1520 strcpy(p, policy_types[mode]);
1521 p += l;
1522
1523 if (!nodes_empty(nodes)) {
1524 if (buffer + maxlen < p + 2)
1525 return -ENOSPC;
1526 *p++ = '=';
1527 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1528 }
1529 return p - buffer;
1530}
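/*
 * Example output (illustrative): "default" for no policy, "prefer=1" for
 * MPOL_PREFERRED on node 1, "bind=0,2" for an MPOL_BIND zonelist covering
 * nodes 0 and 2, and "interleave=0-3" for MPOL_INTERLEAVE over nodes 0-3;
 * the nodelist part uses nodelist_scnprintf()'s range/comma format.
 */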
1531
1532struct numa_maps {
1533 unsigned long pages;
1534 unsigned long anon;
1535 unsigned long mapped;
1536 unsigned long mapcount_max;
1537 unsigned long node[MAX_NUMNODES];
1538};
1539
1540static void gather_stats(struct page *page, void *private)
1541{
1542 struct numa_maps *md = private;
1543 int count = page_mapcount(page);
1544
1545 if (count)
1546 md->mapped++;
1547
1548 if (count > md->mapcount_max)
1549 md->mapcount_max = count;
1550
1551 md->pages++;
1552
1553 if (PageAnon(page))
1554 md->anon++;
1555
1556 md->node[page_to_nid(page)]++;
1557 cond_resched();
1558}
1559
1560int show_numa_map(struct seq_file *m, void *v)
1561{
1562 struct task_struct *task = m->private;
1563 struct vm_area_struct *vma = v;
1564 struct numa_maps *md;
1565 int n;
1566 char buffer[50];
1567
1568 if (!vma->vm_mm)
1569 return 0;
1570
1571 md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1572 if (!md)
1573 return 0;
1574
1575 check_pgd_range(vma, vma->vm_start, vma->vm_end,
1576 &node_online_map, MPOL_MF_STATS, md);
1577
1578 if (md->pages) {
1579 mpol_to_str(buffer, sizeof(buffer),
1580 get_vma_policy(task, vma, vma->vm_start));
1581
1582 seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
1583 vma->vm_start, buffer, md->pages,
1584 md->mapped, md->mapcount_max);
1585
1586 if (md->anon)
1587 seq_printf(m," anon=%lu",md->anon);
1588
1589 for_each_online_node(n)
1590 if (md->node[n])
1591 seq_printf(m, " N%d=%lu", n, md->node[n]);
1592
1593 seq_putc(m, '\n');
1594 }
1595 kfree(md);
1596
1597 if (m->count < m->size)
1598 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1599 return 0;
1600}
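/*
 * Example /proc/<pid>/numa_maps line (illustrative) as emitted above:
 *
 *	2aaaaac00000 interleave=0-3 pages=512 mapped=512 maxref=1 N0=128 N1=128 N2=128 N3=128
 *
 * i.e. start address, effective policy, page/mapping counters and a
 * per-node breakdown of where the pages currently live.
 */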
1601