/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a process counter
 *		is used.
 *
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 *		FIXME: memory is allocated starting with the first node
 *		to the last. It would be better if bind would truly restrict
 *		the allocation to memory nodes instead.
 *
 * preferred	Try a specific node first before normal fallback.
 *		As a special case node -1 here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non default
 *		process policy.
 *
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
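
/*
 * Illustrative sketch (not part of the original file): how userspace would
 * typically request the policies described above, assuming the libnuma
 * wrappers from <numaif.h>; addr and len stand for some existing mapping,
 * and maxnode is passed as the number of bits in the mask plus one.
 *
 *	#include <numaif.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *
 *	// interleave all future allocations of this task over nodes 0-1
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask) + 1);
 *
 *	// bind one existing mapping to the same nodes, moving misplaced pages
 *	mbind(addr, len, MPOL_BIND, &mask, 8 * sizeof(mask) + 1, MPOL_MF_MOVE);
 */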

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1),	/* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

/* Check that the nodemask contains at least one populated zone */
static int is_valid_nodemask(const nodemask_t *nodemask)
{
	int nd, k;

	/* Check that there is something useful in this mask */
	k = policy_zone;

	for_each_node_mask(nd, *nodemask) {
		struct zone *z;

		for (k = 0; k <= policy_zone; k++) {
			z = &NODE_DATA(nd)->node_zones[k];
			if (z->present_pages > 0)
				return 1;
		}
	}

	return 0;
}

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}
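
/*
 * Worked example (added for illustration, not in the original source; my
 * reading of nodes_fold()/nodes_onto()): with a user mask of {0,2} and a
 * cpuset mems_allowed of {4,5,6} (weight 3), the mask is first folded onto
 * 3 bits, still {0,2}, and then mapped onto the allowed set, giving {4,6},
 * i.e. the 0th and 2nd allowed nodes.
 */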

static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!nodes)
		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
	else if (nodes_empty(*nodes))
		return -EINVAL;			/* no allowed nodes */
	else
		pol->v.preferred_node = first_node(*nodes);
	return 0;
}
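
/*
 * Note added for illustration (not in the original source): the nodes ==
 * NULL case above is how an explicit "local allocation" request is
 * represented; as far as I can tell, userspace reaches it by passing an
 * empty nodemask with MPOL_PREFERRED, which mpol_set_nodemask() below
 * turns into a NULL nodemask before calling ->create().
 */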

static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!is_valid_nodemask(nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy.  mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.  But, we need to
 * handle an empty nodemask with MPOL_PREFERRED here.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_semaphore for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
	if (pol == NULL)
		return 0;
	/* Check N_HIGH_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);

	VM_BUG_ON(!nodes);
	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
		nodes = NULL;	/* explicit local allocation */
	else {
		if (pol->flags & MPOL_F_RELATIVE_NODES)
			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
		else
			nodes_and(nsc->mask2, *nodes, nsc->mask1);

		if (mpol_store_user_nodemask(pol))
			pol->w.user_nodemask = *nodes;
		else
			pol->w.cpuset_mems_allowed =
						cpuset_current_mems_allowed;
	}

	if (nodes)
		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	else
		ret = mpol_ops[pol->mode].create(pol, NULL);
	return ret;
}

/*
 * This function just creates a new policy, does some checks and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;	/* simply delete any existing policy */
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);
		}
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;

	return policy;
}

/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	kmem_cache_free(policy_cache, p);
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

static void mpol_rebind_nodemask(struct mempolicy *pol,
				 const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
			    *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	pol->v.nodes = tmp;
	if (!node_isset(current->il_next, tmp)) {
		current->il_next = next_node(current->il_next, tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = first_node(tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = numa_node_id();
	}
}
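
/*
 * Example added for illustration (not in the original source; based on my
 * reading of the three cases above): take an MPOL_INTERLEAVE policy created
 * over nodes {0,1} whose cpuset is moved from mems {0,1} to {1,2}.  The
 * default (no flags) case remaps by position, giving {1,2};
 * MPOL_F_STATIC_NODES keeps the user mask and just intersects it with the
 * new mems, giving {1}; MPOL_F_RELATIVE_NODES re-applies the user mask
 * relative to the new mems, giving {1,2} again.
 */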

static void mpol_rebind_preferred(struct mempolicy *pol,
				  const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES) {
		int node = first_node(pol->w.user_nodemask);

		if (node_isset(node, *nodes)) {
			pol->v.preferred_node = node;
			pol->flags &= ~MPOL_F_LOCAL;
		} else
			pol->flags |= MPOL_F_LOCAL;
	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
		pol->v.preferred_node = first_node(tmp);
	} else if (!(pol->flags & MPOL_F_LOCAL)) {
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						   pol->w.cpuset_mems_allowed,
						   *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}
}

/* Migrate a policy to a different set of nodes */
static void mpol_rebind_policy(struct mempolicy *pol,
			       const nodemask_t *newmask)
{
	if (!pol)
		return;
	if (!mpol_store_user_nodemask(pol) &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;
	mpol_ops[pol->mode].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 */

void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;

	down_write(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		mpol_rebind_policy(vma->vm_policy, new);
	up_write(&mm->mmap_sem);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
};

static void gather_stats(struct page *, void *, int pte_dirty);
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

/* Scan through pages checking if pages follow certain conditions. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		struct page *page;
		int nid;

		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		/*
		 * vm_normal_page() filters out zero pages, but there might
		 * still be PageReserved pages to skip, perhaps in a VDSO.
		 * And we cannot move PageKsm pages sensibly or safely yet.
		 */
		if (PageReserved(page) || PageKsm(page))
			continue;
		nid = page_to_nid(page);
		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
			continue;

		if (flags & MPOL_MF_STATS)
			gather_stats(page, private, pte_dirty(*pte));
		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
			migrate_page_add(page, private, flags);
		else
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}

static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		if (check_pte_range(vma, pmd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (check_pmd_range(vma, pud, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int check_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (check_pud_range(vma, pgd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

/*
 * Check if all pages in a range are on a set of nodes.
 * If pagelist != NULL then isolate pages from the LRU and
 * put them on the pagelist.
 */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		const nodemask_t *nodes, unsigned long flags, void *private)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
			if (!vma->vm_next && vma->vm_end < end)
				return ERR_PTR(-EFAULT);
			if (prev && prev->vm_end < vma->vm_start)
				return ERR_PTR(-EFAULT);
		}
		if (!is_vm_hugetlb_page(vma) &&
		    ((flags & MPOL_MF_STRICT) ||
		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
				vma_migratable(vma)))) {
			unsigned long endvma = vma->vm_end;

			if (endvma > end)
				endvma = end;
			if (vma->vm_start > start)
				start = vma->vm_start;
			err = check_pgd_range(vma, start, endvma, nodes,
						flags, private);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_put(old);
	}
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
		       unsigned long end, struct mempolicy *new_pol)
{
	struct vm_area_struct *next;
	struct vm_area_struct *prev;
	struct vm_area_struct *vma;
	int err = 0;
	pgoff_t pgoff;
	unsigned long vmstart;
	unsigned long vmend;

	vma = find_vma_prev(mm, start, &prev);
	if (!vma || vma->vm_start > start)
		return -EFAULT;

	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
		next = vma->vm_next;
		vmstart = max(start, vma->vm_start);
		vmend   = min(end, vma->vm_end);

		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
				  vma->anon_vma, vma->vm_file, pgoff, new_pol);
		if (prev) {
			vma = prev;
			next = vma->vm_next;
			continue;
		}
		if (vma->vm_start != vmstart) {
			err = split_vma(vma->vm_mm, vma, vmstart, 1);
			if (err)
				goto out;
		}
		if (vma->vm_end != vmend) {
			err = split_vma(vma->vm_mm, vma, vmend, 0);
			if (err)
				goto out;
		}
		err = policy_vma(vma, new_pol);
		if (err)
			goto out;
	}

 out:
	return err;
}

/*
 * Update task->flags PF_MEMPOLICY bit: set iff non-default
 * mempolicy.  Allows more rapid checking of this (combined perhaps
 * with other PF_* flag bits) on memory allocation hot code paths.
 *
 * If called from outside this file, the task 'p' should -only- be
 * a newly forked child not yet visible on the task list, because
 * manipulating the task flags of a visible task is not safe.
 *
 * The above limitation is why this routine has the funny name
 * mpol_fix_fork_child_flag().
 *
 * It is also safe to call this with a task pointer of current,
 * which the static wrapper mpol_set_task_struct_flag() does,
 * for use within this file.
 */

void mpol_fix_fork_child_flag(struct task_struct *p)
{
	if (p->mempolicy)
		p->flags |= PF_MEMPOLICY;
	else
		p->flags &= ~PF_MEMPOLICY;
}

static void mpol_set_task_struct_flag(void)
{
	mpol_fix_fork_child_flag(current);
}

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	struct mm_struct *mm = current->mm;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}
	/*
	 * prevent changing our mempolicy while show_numa_maps()
	 * is using it.
	 * Note:  do_set_mempolicy() can be called at init time
	 * with no 'mm'.
	 */
	if (mm)
		down_write(&mm->mmap_sem);
	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		if (mm)
			up_write(&mm->mmap_sem);
		mpol_put(new);
		goto out;
	}
	old = current->mempolicy;
	current->mempolicy = new;
	mpol_set_task_struct_flag();
	if (new && new->mode == MPOL_INTERLEAVE &&
	    nodes_weight(new->v.nodes))
		current->il_next = first_node(new->v.nodes);
	task_unlock(current);
	if (mm)
		up_write(&mm->mmap_sem);

	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	if (p == &default_policy)
		return;

	switch (p->mode) {
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		if (!(p->flags & MPOL_F_LOCAL))
			node_set(p->v.preferred_node, *nodes);
		/* else return empty node mask for local allocation */
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask  = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	mpol_cond_put(pol);
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}

#ifdef CONFIG_MIGRATION
/*
 * page migration
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	/*
	 * Avoid migrating a page that is shared with others.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
		if (!isolate_lru_page(page)) {
			list_add_tail(&page->lru, pagelist);
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));
		}
	}
}

static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
			   int flags)
{
	nodemask_t nmask;
	LIST_HEAD(pagelist);
	int err = 0;

	nodes_clear(nmask);
	node_set(source, nmask);

	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist))
		err = migrate_pages(&pagelist, new_node_page, dest, 0);

	return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	int busy = 0;
	int err;
	nodemask_t tmp;

	err = migrate_prep();
	if (err)
		return err;

	down_read(&mm->mmap_sem);

	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
	if (err)
		goto out;

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory from that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning from_tmp, we at least have the
	 * most recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */
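	/*
	 * Worked example (added for illustration, not in the original
	 * source; my reading of node_remap()): migrating from = {0,1} to
	 * to = {1,2} yields the pairs 0->1 and 1->2.  The scan first finds
	 * 1->2, whose destination is outside the remaining source set, so
	 * node 1 is emptied onto node 2 before node 0's pages are moved
	 * onto node 1 - exactly the "don't overload a node that still has
	 * outgoing pages" behaviour described above.
	 */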

	tmp = *from_nodes;
	while (!nodes_empty(tmp)) {
		int s,d;
		int source = -1;
		int dest = 0;

		for_each_node_mask(s, tmp) {
			d = node_remap(s, *from_nodes, *to_nodes);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == -1)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			busy += err;
		if (err < 0)
			break;
	}
out:
	up_read(&mm->mmap_sem);
	if (err < 0)
		return err;
	return busy;

}

/*
 * Allocate a new page for page migration based on vma policy.
 * Start assuming that page is mapped by vma pointed to by @private.
 * Search forward from there, if not.  N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	struct vm_area_struct *vma = (struct vm_area_struct *)private;
	unsigned long uninitialized_var(address);

	while (vma) {
		address = page_address_in_vma(page, vma);
		if (address != -EFAULT)
			break;
		vma = vma->vm_next;
	}

	/*
	 * if !vma, alloc_page_vma() will use task or system default policy
	 */
	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
}
#else

static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
}

int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	return -ENOSYS;
}

static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	return NULL;
}
#endif

static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
		 start, start + len, mode, mode_flags,
		 nmask ? nodes_addr(*nmask)[0] : -1);

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {

		err = migrate_prep();
		if (err)
			goto mpol_out;
	}
	{
		NODEMASK_SCRATCH(scratch);
		if (scratch) {
			down_write(&mm->mmap_sem);
			task_lock(current);
			err = mpol_set_nodemask(new, nmask, scratch);
			task_unlock(current);
			if (err)
				up_write(&mm->mmap_sem);
		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (err)
		goto mpol_out;

	vma = check_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);

	err = PTR_ERR(vma);
	if (!IS_ERR(vma)) {
		int nr_failed = 0;

		err = mbind_range(mm, start, end, new);

		if (!list_empty(&pagelist))
			nr_failed = migrate_pages(&pagelist, new_vma_page,
						(unsigned long)vma, 0);

		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
			err = -EIO;
	} else
		putback_lru_pages(&pagelist);

	up_write(&mm->mmap_sem);
 mpol_out:
	mpol_put(new);
	return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}
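
/*
 * Example added for illustration (not in the original source): for the
 * common case of a single-word mask selecting nodes {0,1}, userspace can
 * pass a long with bits 0-1 set and maxnode = 3 (or any larger value) -
 * only maxnode - 1 bits of the mask are examined, and a maxnode larger
 * than the kernel supports is accepted as long as the unsupported part of
 * the mask is all zero.
 */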

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned, flags)
{
	nodemask_t nodes;
	int err;
	unsigned short mode_flags;

	mode_flags = mode & MPOL_MODE_FLAGS;
	mode &= ~MPOL_MODE_FLAGS;
	if (mode >= MPOL_MAX)
		return -EINVAL;
	if ((mode_flags & MPOL_F_STATIC_NODES) &&
	    (mode_flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
}

/* Set the process memory policy */
SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	int err;
	nodemask_t nodes;
	unsigned short flags;

	flags = mode & MPOL_MODE_FLAGS;
	mode &= ~MPOL_MODE_FLAGS;
	if ((unsigned int)mode >= MPOL_MAX)
		return -EINVAL;
	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_set_mempolicy(mode, flags, &nodes);
}

SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
		const unsigned long __user *, old_nodes,
		const unsigned long __user *, new_nodes)
{
	const struct cred *cred = current_cred(), *tcred;
	struct mm_struct *mm;
	struct task_struct *task;
	nodemask_t old;
	nodemask_t new;
	nodemask_t task_nodes;
	int err;

	err = get_nodes(&old, old_nodes, maxnode);
	if (err)
		return err;

	err = get_nodes(&new, new_nodes, maxnode);
	if (err)
		return err;

	/* Find the mm_struct */
	read_lock(&tasklist_lock);
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		read_unlock(&tasklist_lock);
		return -ESRCH;
	}
	mm = get_task_mm(task);
	read_unlock(&tasklist_lock);

	if (!mm)
		return -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	rcu_read_lock();
	tcred = __task_cred(task);
	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
	    !capable(CAP_SYS_NICE)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out;
	}
	rcu_read_unlock();

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out;
	}

	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
		err = -EINVAL;
		goto out;
	}

	err = security_task_movememory(task);
	if (err)
		goto out;

	err = do_migrate_pages(mm, &old, &new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
out:
	mmput(mm);
	return err;
}


/* Retrieve NUMA policy */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		unsigned long __user *, nmask, unsigned long, maxnode,
		unsigned long, addr, unsigned long, flags)
{
	int err;
	int uninitialized_var(pval);
	nodemask_t nodes;

	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/*
 * get_vma_policy(@task, @vma, @addr)
 * @task - task for fallback if vma policy == default
 * @vma  - virtual memory area whose policy is sought
 * @addr - address in @vma for shared policy lookup
 *
 * Returns effective policy for a VMA at specified address.
 * Falls back to @task or system default policy, as necessary.
 * Current or other task's task mempolicy and non-shared vma policies
 * are protected by the task's mmap_sem, which must be held for read by
 * the caller.
 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
 * count--added by the get_policy() vm_op, as appropriate--to protect against
 * freeing by another task.  It is the caller's responsibility to free the
 * extra reference for shared policies.
 */
static struct mempolicy *get_vma_policy(struct task_struct *task,
		struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy) {
			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
									addr);
			if (vpol)
				pol = vpol;
		} else if (vma->vm_policy)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}

/*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation
 */
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
	/* Lower zones don't get a nodemask applied for MPOL_BIND */
	if (unlikely(policy->mode == MPOL_BIND) &&
			gfp_zone(gfp) >= policy_zone &&
			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
		return &policy->v.nodes;

	return NULL;
}

/* Return a zonelist indicated by gfp for node representing a mempolicy */
static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
{
	int nd = numa_node_id();

	switch (policy->mode) {
	case MPOL_PREFERRED:
		if (!(policy->flags & MPOL_F_LOCAL))
			nd = policy->v.preferred_node;
		break;
	case MPOL_BIND:
		/*
		 * Normally, MPOL_BIND allocations are node-local within the
		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
		 * current node is part of the mask, we use the zonelist for
		 * the first node in the mask instead.
		 */
		if (unlikely(gfp & __GFP_THISNODE) &&
				unlikely(!node_isset(nd, policy->v.nodes)))
			nd = first_node(policy->v.nodes);
		break;
	case MPOL_INTERLEAVE: /* should not happen */
		break;
	default:
		BUG();
	}
	return node_zonelist(nd, gfp);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);
	if (next < MAX_NUMNODES)
		me->il_next = next;
	return nid;
}

/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 * @policy must be protected from freeing by the caller.  If @policy is
 * the current task's mempolicy, this protection is implicit, as only the
 * task can change its policy.  The system default policy requires no
 * such protection.
 */
unsigned slab_node(struct mempolicy *policy)
{
	if (!policy || policy->flags & MPOL_F_LOCAL)
		return numa_node_id();

	switch (policy->mode) {
	case MPOL_PREFERRED:
		/*
		 * handled MPOL_F_LOCAL above
		 */
		return policy->v.preferred_node;

	case MPOL_INTERLEAVE:
		return interleave_nodes(policy);

	case MPOL_BIND: {
		/*
		 * Follow bind policy behavior and start allocation at the
		 * first node.
		 */
		struct zonelist *zonelist;
		struct zone *zone;
		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
		(void)first_zones_zonelist(zonelist, highest_zoneidx,
							&policy->v.nodes,
							&zone);
		return zone->node;
	}

	default:
		BUG();
	}
}

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target;
	int c;
	int nid = -1;

	if (!nnodes)
		return numa_node_id();
	target = (unsigned int)off % nnodes;
	c = 0;
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
	return nid;
}
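
/*
 * Worked example (added for illustration, not in the original source):
 * with pol->v.nodes = {0,2,5} (nnodes = 3) and off = 7, target is
 * 7 % 3 = 1 and the loop above lands on node 2, the second node of the
 * mask; the same offset therefore always maps to the same node.
 */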

/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
		 struct vm_area_struct *vma, unsigned long addr, int shift)
{
	if (vma) {
		unsigned long off;

		/*
		 * for small pages, there is no difference between
		 * shift and PAGE_SHIFT, so the bit-shift is safe.
		 * for huge pages, since vm_pgoff is in units of small
		 * pages, we need to shift off the always 0 bits to get
		 * a useful offset.
		 */
		BUG_ON(shift < PAGE_SHIFT);
		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
		off += (addr - vma->vm_start) >> shift;
		return offset_il_node(pol, vma, off);
	} else
		return interleave_nodes(pol);
}

#ifdef CONFIG_HUGETLBFS
/*
 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
 * @vma = virtual memory area whose policy is sought
 * @addr = address in @vma for shared policy lookup and interleave policy
 * @gfp_flags = for requested zone
 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
 *
 * Returns a zonelist suitable for a huge page allocation and a pointer
 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
 * @nodemask for filtering the zonelist.
 */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
				gfp_t gfp_flags, struct mempolicy **mpol,
				nodemask_t **nodemask)
{
	struct zonelist *zl;

	*mpol = get_vma_policy(current, vma, addr);
	*nodemask = NULL;	/* assume !MPOL_BIND */

	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
				huge_page_shift(hstate_vma(vma))), gfp_flags);
	} else {
		zl = policy_zonelist(gfp_flags, *mpol);
		if ((*mpol)->mode == MPOL_BIND)
			*nodemask = &(*mpol)->v.nodes;
	}
	return zl;
}

/*
 * init_nodemask_of_mempolicy
 *
 * If the current task's mempolicy is "default" [NULL], return 'false'
 * to indicate default policy.  Otherwise, extract the policy nodemask
 * for 'bind' or 'interleave' policy into the argument nodemask, or
 * initialize the argument nodemask to contain the single node for
 * 'preferred' or 'local' policy and return 'true' to indicate presence
 * of non-default mempolicy.
 *
 * We don't bother with reference counting the mempolicy [mpol_get/put]
 * because the current task is examining its own mempolicy and a task's
 * mempolicy is only ever changed by the task itself.
 *
 * N.B., it is the caller's responsibility to free a returned nodemask.
 */
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
	struct mempolicy *mempolicy;
	int nid;

	if (!(mask && current->mempolicy))
		return false;

	mempolicy = current->mempolicy;
	switch (mempolicy->mode) {
	case MPOL_PREFERRED:
		if (mempolicy->flags & MPOL_F_LOCAL)
			nid = numa_node_id();
		else
			nid = mempolicy->v.preferred_node;
		init_nodemask_of_node(mask, nid);
		break;

	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		*mask = mempolicy->v.nodes;
		break;

	default:
		BUG();
	}

	return true;
}
#endif

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	zl = node_zonelist(nid, gfp);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
	return page;
}

/**
 * alloc_page_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *	%GFP_USER    user allocation.
 *	%GFP_KERNEL  kernel allocations,
 *	%GFP_HIGHMEM highmem/user allocations,
 *	%GFP_FS      allocation should not call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
 * mm_struct of the VMA to prevent it from going away.  Should be used for
 * all allocations for pages that will be mapped into
 * user space.  Returns NULL when no page can be allocated.
 *
 * Should be called with the mm_sem of the vma held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);
	struct zonelist *zl;

	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
		mpol_cond_put(pol);
		return alloc_page_interleave(gfp, 0, nid);
	}
	zl = policy_zonelist(gfp, pol);
	if (unlikely(mpol_needs_cond_ref(pol))) {
		/*
		 * slow path: ref counted shared policy
		 */
		struct page *page = __alloc_pages_nodemask(gfp, 0,
						zl, policy_nodemask(gfp, pol));
		__mpol_put(pol);
		return page;
	}
	/*
	 * fast path: default or task policy
	 */
	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
}

/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *	%GFP_USER    user allocation,
 *	%GFP_KERNEL  kernel allocation,
 *	%GFP_HIGHMEM highmem allocation,
 *	%GFP_FS      don't call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool.  When not in
 * interrupt context, apply the current process' NUMA policy.
 * Returns NULL when no page can be allocated.
 *
 * Don't call cpuset_update_task_memory_state() unless
 * 1) it's ok to take cpuset_sem (can WAIT), and
 * 2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
		pol = &default_policy;

	/*
	 * No reference counting needed for current->mempolicy
	 * nor system default_policy
	 */
	if (pol->mode == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages_nodemask(gfp, order,
			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);
1746
4225399a 1747/*
846a16bf 1748 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1749 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1750 * with the mems_allowed returned by cpuset_mems_allowed(). This
1751 * keeps mempolicies cpuset-relative after the task's cpuset moves. See
1752 * also kernel/cpuset.c update_nodemask().
1753 */
4225399a 1754
1755/* Slow path of a mempolicy duplicate */
1756struct mempolicy *__mpol_dup(struct mempolicy *old)
1757{
1758 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1759
1760 if (!new)
1761 return ERR_PTR(-ENOMEM);
99ee4ca7 1762 rcu_read_lock();
1763 if (current_cpuset_is_being_rebound()) {
1764 nodemask_t mems = cpuset_mems_allowed(current);
1765 mpol_rebind_policy(old, &mems);
1766 }
99ee4ca7 1767 rcu_read_unlock();
1768 *new = *old;
1769 atomic_set(&new->refcnt, 1);
1770 return new;
1771}
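/*
 * Illustrative sketch, not part of mempolicy.c: the pattern used when a
 * policy is inherited, e.g. across fork() -- duplicate the source policy
 * so the new owner holds its own reference.  my_copy_policy() is a
 * hypothetical helper; mpol_dup() is the <linux/mempolicy.h> wrapper
 * around __mpol_dup() above.
 */
static int my_copy_policy(struct mempolicy *src, struct mempolicy **dst)
{
	struct mempolicy *pol = mpol_dup(src);	/* a NULL src stays NULL */

	if (IS_ERR(pol))
		return PTR_ERR(pol);
	*dst = pol;		/* release later with mpol_put(*dst) */
	return 0;
}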
1772
1773/*
1774 * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
1775 * eliminate the MPOL_F_* flags that require a conditional ref and
1776 * [NOTE!!!] drop the extra ref. It is not safe to reference *frompol
1777 * directly after return; use the returned value.
1778 *
1779 * Allows use of a mempolicy for, e.g., multiple allocations with a single
1780 * policy lookup, even if the policy needs/has extra ref on lookup.
1781 * shmem_readahead needs this.
1782 */
1783struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1784 struct mempolicy *frompol)
1785{
1786 if (!mpol_needs_cond_ref(frompol))
1787 return frompol;
1788
1789 *tompol = *frompol;
1790 tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */
1791 __mpol_put(frompol);
1792 return tompol;
1793}
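/*
 * Illustrative sketch, not part of mempolicy.c: how a caller such as shmem
 * readahead can combine a shared-policy lookup with mpol_cond_copy() (the
 * <linux/mempolicy.h> wrapper around __mpol_cond_copy()) so the policy can
 * be used for several allocations without keeping the extra reference.
 * my_readahead_policy() and its arguments are hypothetical.
 */
static struct mempolicy *my_readahead_policy(struct shared_policy *sp,
					     unsigned long idx,
					     struct mempolicy *onstack)
{
	/* the lookup may hand back a ref-counted MPOL_F_SHARED policy ... */
	struct mempolicy *pol = mpol_shared_policy_lookup(sp, idx);

	/* ... the conditional copy drops that ref; use the return value */
	return mpol_cond_copy(onstack, pol);
}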
1794
1795static int mpol_match_intent(const struct mempolicy *a,
1796 const struct mempolicy *b)
1797{
1798 if (a->flags != b->flags)
1799 return 0;
1800 if (!mpol_store_user_nodemask(a))
1801 return 1;
1802 return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1803}
1804
1805/* Slow path of a mempolicy comparison */
1806int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1807{
1808 if (!a || !b)
1809 return 0;
45c4745a 1810 if (a->mode != b->mode)
1da177e4 1811 return 0;
45c4745a 1812 if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
f5b087b5 1813 return 0;
45c4745a 1814 switch (a->mode) {
1815 case MPOL_BIND:
1816 /* Fall through */
1da177e4 1817 case MPOL_INTERLEAVE:
dfcd3c0d 1818 return nodes_equal(a->v.nodes, b->v.nodes);
1da177e4 1819 case MPOL_PREFERRED:
1820 return a->v.preferred_node == b->v.preferred_node &&
1821 a->flags == b->flags;
1822 default:
1823 BUG();
1824 return 0;
1825 }
1826}
1827
1828/*
1829 * Shared memory backing store policy support.
1830 *
1831 * Remember policies even when nobody has shared memory mapped.
1832 * The policies are kept in a red-black tree linked from the inode.
1833 * They are protected by the sp->lock spinlock, which should be held
1834 * for any accesses to the tree.
1835 */
1836
1837/* lookup first element intersecting start-end */
1838/* Caller holds sp->lock */
1839static struct sp_node *
1840sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1841{
1842 struct rb_node *n = sp->root.rb_node;
1843
1844 while (n) {
1845 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1846
1847 if (start >= p->end)
1848 n = n->rb_right;
1849 else if (end <= p->start)
1850 n = n->rb_left;
1851 else
1852 break;
1853 }
1854 if (!n)
1855 return NULL;
1856 for (;;) {
1857 struct sp_node *w = NULL;
1858 struct rb_node *prev = rb_prev(n);
1859 if (!prev)
1860 break;
1861 w = rb_entry(prev, struct sp_node, nd);
1862 if (w->end <= start)
1863 break;
1864 n = prev;
1865 }
1866 return rb_entry(n, struct sp_node, nd);
1867}
1868
1869/* Insert a new shared policy into the list. */
1870/* Caller holds sp->lock */
1871static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1872{
1873 struct rb_node **p = &sp->root.rb_node;
1874 struct rb_node *parent = NULL;
1875 struct sp_node *nd;
1876
1877 while (*p) {
1878 parent = *p;
1879 nd = rb_entry(parent, struct sp_node, nd);
1880 if (new->start < nd->start)
1881 p = &(*p)->rb_left;
1882 else if (new->end > nd->end)
1883 p = &(*p)->rb_right;
1884 else
1885 BUG();
1886 }
1887 rb_link_node(&new->nd, parent, p);
1888 rb_insert_color(&new->nd, &sp->root);
140d5a49 1889 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
45c4745a 1890 new->policy ? new->policy->mode : 0);
1891}
1892
1893/* Find shared policy intersecting idx */
1894struct mempolicy *
1895mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1896{
1897 struct mempolicy *pol = NULL;
1898 struct sp_node *sn;
1899
1900 if (!sp->root.rb_node)
1901 return NULL;
1902 spin_lock(&sp->lock);
1903 sn = sp_lookup(sp, idx, idx+1);
1904 if (sn) {
1905 mpol_get(sn->policy);
1906 pol = sn->policy;
1907 }
1908 spin_unlock(&sp->lock);
1909 return pol;
1910}
1911
1912static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1913{
140d5a49 1914 pr_debug("deleting %lx-%lx\n", n->start, n->end);
1da177e4 1915 rb_erase(&n->nd, &sp->root);
f0be3d32 1916 mpol_put(n->policy);
1917 kmem_cache_free(sn_cache, n);
1918}
1919
1920static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1921 struct mempolicy *pol)
1922{
1923 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1924
1925 if (!n)
1926 return NULL;
1927 n->start = start;
1928 n->end = end;
1929 mpol_get(pol);
aab0b102 1930 pol->flags |= MPOL_F_SHARED; /* for unref */
1931 n->policy = pol;
1932 return n;
1933}
1934
1935/* Replace a policy range. */
1936static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1937 unsigned long end, struct sp_node *new)
1938{
1939 struct sp_node *n, *new2 = NULL;
1940
1941restart:
1942 spin_lock(&sp->lock);
1943 n = sp_lookup(sp, start, end);
1944 /* Take care of old policies in the same range. */
1945 while (n && n->start < end) {
1946 struct rb_node *next = rb_next(&n->nd);
1947 if (n->start >= start) {
1948 if (n->end <= end)
1949 sp_delete(sp, n);
1950 else
1951 n->start = end;
1952 } else {
1953 /* Old policy spanning whole new range. */
1954 if (n->end > end) {
1955 if (!new2) {
1956 spin_unlock(&sp->lock);
1957 new2 = sp_alloc(end, n->end, n->policy);
1958 if (!new2)
1959 return -ENOMEM;
1960 goto restart;
1961 }
1962 n->end = start;
1963 sp_insert(sp, new2);
1964 new2 = NULL;
1965 break;
1966 } else
1967 n->end = start;
1968 }
1969 if (!next)
1970 break;
1971 n = rb_entry(next, struct sp_node, nd);
1972 }
1973 if (new)
1974 sp_insert(sp, new);
1975 spin_unlock(&sp->lock);
1976 if (new2) {
f0be3d32 1977 mpol_put(new2->policy);
1978 kmem_cache_free(sn_cache, new2);
1979 }
1980 return 0;
1981}
1982
1983/**
1984 * mpol_shared_policy_init - initialize shared policy for inode
1985 * @sp: pointer to inode shared policy
1986 * @mpol: struct mempolicy to install
1987 *
1988 * Install non-NULL @mpol in inode's shared policy rb-tree.
1989 * On entry, the current task has a reference on a non-NULL @mpol.
1990 * This must be released on exit.
4bfc4495 1991 * This is called during get_inode() calls, so we can use GFP_KERNEL.
1992 */
1993void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1994{
1995 int ret;
1996
1997 sp->root = RB_ROOT; /* empty tree == default mempolicy */
1998 spin_lock_init(&sp->lock);
1999
2000 if (mpol) {
2001 struct vm_area_struct pvma;
2002 struct mempolicy *new;
4bfc4495 2003 NODEMASK_SCRATCH(scratch);
71fe804b 2004
2005 if (!scratch)
2006 return;
2007 /* contextualize the tmpfs mount point mempolicy */
2008 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2009 if (IS_ERR(new)) {
2010 mpol_put(mpol); /* drop our ref on sb mpol */
4bfc4495 2011 NODEMASK_SCRATCH_FREE(scratch);
71fe804b 2012 return; /* no valid nodemask intersection */
2013 }
2014
2015 task_lock(current);
4bfc4495 2016 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2017 task_unlock(current);
2018 mpol_put(mpol); /* drop our ref on sb mpol */
2019 if (ret) {
4bfc4495 2020 NODEMASK_SCRATCH_FREE(scratch);
2021 mpol_put(new);
2022 return;
2023 }
2024
2025 /* Create pseudo-vma that contains just the policy */
2026 memset(&pvma, 0, sizeof(struct vm_area_struct));
2027 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2028 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2029 mpol_put(new); /* drop initial ref */
4bfc4495 2030 NODEMASK_SCRATCH_FREE(scratch);
2031 }
2032}
2033
2034int mpol_set_shared_policy(struct shared_policy *info,
2035 struct vm_area_struct *vma, struct mempolicy *npol)
2036{
2037 int err;
2038 struct sp_node *new = NULL;
2039 unsigned long sz = vma_pages(vma);
2040
028fec41 2041 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1da177e4 2042 vma->vm_pgoff,
45c4745a 2043 sz, npol ? npol->mode : -1,
028fec41 2044 npol ? npol->flags : -1,
140d5a49 2045 npol ? nodes_addr(npol->v.nodes)[0] : -1);
2046
2047 if (npol) {
2048 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2049 if (!new)
2050 return -ENOMEM;
2051 }
2052 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2053 if (err && new)
2054 kmem_cache_free(sn_cache, new);
2055 return err;
2056}
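/*
 * Illustrative user-space sketch, not part of the kernel: an mbind() on a
 * MAP_SHARED|MAP_ANONYMOUS (shmem-backed) mapping reaches
 * mpol_set_shared_policy() through the vma's set_policy operation, so the
 * policy is remembered in the inode's shared-policy tree.  Build with
 * -lnuma for the mbind() wrapper; node 0 is assumed to exist.
 */
#include <numaif.h>		/* mbind(), MPOL_BIND */
#include <sys/mman.h>
#include <stddef.h>

static void *bind_shared_buffer(size_t len)
{
	unsigned long nodemask = 1UL << 0;	/* nodes {0} */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return NULL;
	if (mbind(p, len, MPOL_BIND, &nodemask,
		  sizeof(nodemask) * 8, 0)) {
		munmap(p, len);
		return NULL;
	}
	return p;
}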
2057
2058/* Free a backing policy store on inode delete. */
2059void mpol_free_shared_policy(struct shared_policy *p)
2060{
2061 struct sp_node *n;
2062 struct rb_node *next;
2063
2064 if (!p->root.rb_node)
2065 return;
2066 spin_lock(&p->lock);
2067 next = rb_first(&p->root);
2068 while (next) {
2069 n = rb_entry(next, struct sp_node, nd);
2070 next = rb_next(&n->nd);
90c5029e 2071 rb_erase(&n->nd, &p->root);
f0be3d32 2072 mpol_put(n->policy);
2073 kmem_cache_free(sn_cache, n);
2074 }
2075 spin_unlock(&p->lock);
2076}
2077
2078/* assumes fs == KERNEL_DS */
2079void __init numa_policy_init(void)
2080{
2081 nodemask_t interleave_nodes;
2082 unsigned long largest = 0;
2083 int nid, prefer = 0;
2084
2085 policy_cache = kmem_cache_create("numa_policy",
2086 sizeof(struct mempolicy),
20c2df83 2087 0, SLAB_PANIC, NULL);
2088
2089 sn_cache = kmem_cache_create("shared_policy_node",
2090 sizeof(struct sp_node),
20c2df83 2091 0, SLAB_PANIC, NULL);
1da177e4 2092
2093 /*
2094 * Set interleaving policy for system init. Interleaving is only
2095 * enabled across suitably sized nodes (default is >= 16MB); otherwise
2096 * fall back to the largest node if they're all smaller.
2097 */
2098 nodes_clear(interleave_nodes);
56bbd65d 2099 for_each_node_state(nid, N_HIGH_MEMORY) {
2100 unsigned long total_pages = node_present_pages(nid);
2101
2102 /* Preserve the largest node */
2103 if (largest < total_pages) {
2104 largest = total_pages;
2105 prefer = nid;
2106 }
2107
2108 /* Interleave this node? */
2109 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2110 node_set(nid, interleave_nodes);
2111 }
2112
2113 /* All too small, use the largest */
2114 if (unlikely(nodes_empty(interleave_nodes)))
2115 node_set(prefer, interleave_nodes);
1da177e4 2116
028fec41 2117 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2118 printk(KERN_WARNING "numa_policy_init: interleaving failed\n");
2119}
2120
8bccd85f 2121/* Reset policy of current process to default */
2122void numa_default_policy(void)
2123{
028fec41 2124 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1da177e4 2125}
68860ec1 2126
2127/*
2128 * Parse and format mempolicy from/to strings
2129 */
2130
1a75a6c8 2131/*
fc36b8d3 2132 * "local" is a pseudo-policy: MPOL_PREFERRED with the MPOL_F_LOCAL flag
3f226aa1 2133 * Used only for mpol_parse_str() and mpol_to_str()
1a75a6c8 2134 */
53f2556b 2135#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
15ad7cdc 2136static const char * const policy_types[] =
53f2556b 2137 { "default", "prefer", "bind", "interleave", "local" };
1a75a6c8 2138
2139
2140#ifdef CONFIG_TMPFS
2141/**
2142 * mpol_parse_str - parse string to mempolicy
2143 * @str: string containing mempolicy to parse
2144 * @mpol: pointer to struct mempolicy pointer, returned on success.
2145 * @no_context: flag whether to "contextualize" the mempolicy
2146 *
2147 * Format of input:
2148 * <mode>[=<flags>][:<nodelist>]
2149 *
2150 * If @no_context is true, save the input nodemask in w.user_nodemask in
2151 * the returned mempolicy. This will be used to "clone" the mempolicy in
2152 * a specific context [cpuset] at a later time. Used to parse tmpfs mpol
2153 * mount option. Note that if 'static' or 'relative' mode flags were
2154 * specified, the input nodemask will already have been saved. Saving
2155 * it again is redundant, but safe.
2156 *
2157 * On success, returns 0, else 1
095f1fc4 2158 */
71fe804b 2159int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
095f1fc4 2160{
2161 struct mempolicy *new = NULL;
2162 unsigned short uninitialized_var(mode);
2163 unsigned short uninitialized_var(mode_flags);
2164 nodemask_t nodes;
2165 char *nodelist = strchr(str, ':');
2166 char *flags = strchr(str, '=');
2167 int i;
2168 int err = 1;
2169
2170 if (nodelist) {
2171 /* NUL-terminate mode or flags string */
2172 *nodelist++ = '\0';
71fe804b 2173 if (nodelist_parse(nodelist, nodes))
095f1fc4 2174 goto out;
71fe804b 2175 if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
095f1fc4 2176 goto out;
2177 } else
2178 nodes_clear(nodes);
2179
2180 if (flags)
2181 *flags++ = '\0'; /* terminate mode string */
2182
3f226aa1 2183 for (i = 0; i <= MPOL_LOCAL; i++) {
095f1fc4 2184 if (!strcmp(str, policy_types[i])) {
71fe804b 2185 mode = i;
2186 break;
2187 }
2188 }
3f226aa1 2189 if (i > MPOL_LOCAL)
2190 goto out;
2191
71fe804b 2192 switch (mode) {
095f1fc4 2193 case MPOL_PREFERRED:
2194 /*
2195 * Insist on a nodelist of one node only
2196 */
2197 if (nodelist) {
2198 char *rest = nodelist;
2199 while (isdigit(*rest))
2200 rest++;
2201 if (*rest)
2202 goto out;
2203 }
2204 break;
2205 case MPOL_INTERLEAVE:
2206 /*
2207 * Default to online nodes with memory if no nodelist
2208 */
2209 if (!nodelist)
71fe804b 2210 nodes = node_states[N_HIGH_MEMORY];
3f226aa1 2211 break;
71fe804b 2212 case MPOL_LOCAL:
3f226aa1 2213 /*
71fe804b 2214 * Don't allow a nodelist; mpol_new() checks flags
3f226aa1 2215 */
71fe804b 2216 if (nodelist)
3f226aa1 2217 goto out;
71fe804b 2218 mode = MPOL_PREFERRED;
3f226aa1 2219 break;
2220 case MPOL_DEFAULT:
2221 /*
2222 * Insist on an empty nodelist
2223 */
2224 if (!nodelist)
2225 err = 0;
2226 goto out;
2227 case MPOL_BIND:
2228 /*
2229 * Insist on a nodelist
2230 */
2231 if (!nodelist)
2232 goto out;
2233 }
2234
71fe804b 2235 mode_flags = 0;
2236 if (flags) {
2237 /*
2238 * Currently, we only support two mutually exclusive
2239 * mode flags.
2240 */
2241 if (!strcmp(flags, "static"))
71fe804b 2242 mode_flags |= MPOL_F_STATIC_NODES;
095f1fc4 2243 else if (!strcmp(flags, "relative"))
71fe804b 2244 mode_flags |= MPOL_F_RELATIVE_NODES;
095f1fc4 2245 else
926f2ae0 2246 goto out;
095f1fc4 2247 }
2248
2249 new = mpol_new(mode, mode_flags, &nodes);
2250 if (IS_ERR(new))
2251 goto out;
2252
2253 {
58568d2a 2254 int ret;
2255 NODEMASK_SCRATCH(scratch);
2256 if (scratch) {
2257 task_lock(current);
2258 ret = mpol_set_nodemask(new, &nodes, scratch);
2259 task_unlock(current);
2260 } else
2261 ret = -ENOMEM;
2262 NODEMASK_SCRATCH_FREE(scratch);
2263 if (ret) {
4bfc4495 2264 mpol_put(new);
926f2ae0 2265 goto out;
2266 }
2267 }
2268 err = 0;
2269 if (no_context) {
2270 /* save for contextualization */
2271 new->w.user_nodemask = nodes;
2272 }
71fe804b 2273
2274out:
2275 /* Restore string for error message */
2276 if (nodelist)
2277 *--nodelist = ':';
2278 if (flags)
2279 *--flags = '=';
2280 if (!err)
2281 *mpol = new;
2282 return err;
2283}
2284#endif /* CONFIG_TMPFS */
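/*
 * Illustrative user-space sketch, not part of the kernel: strings in the
 * <mode>[=<flags>][:<nodelist>] format documented above reach
 * mpol_parse_str() through the tmpfs "mpol=" mount option.  The mount
 * point and node list are arbitrary examples; run with CAP_SYS_ADMIN.
 */
#include <sys/mount.h>

static int mount_interleaved_tmpfs(void)
{
	/* parsed by mpol_parse_str() as MPOL_INTERLEAVE over nodes 0-3 */
	return mount("tmpfs", "/mnt/tmp", "tmpfs", 0,
		     "mpol=interleave:0-3");
}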
2285
2286/**
2287 * mpol_to_str - format a mempolicy structure for printing
2288 * @buffer: to contain formatted mempolicy string
2289 * @maxlen: length of @buffer
2290 * @pol: pointer to mempolicy to be formatted
2291 * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
2292 *
2293 * Convert a mempolicy into a string.
2294 * Returns the number of characters in buffer (if positive)
2295 * or an error (negative)
2296 */
71fe804b 2297int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2298{
2299 char *p = buffer;
2300 int l;
2301 nodemask_t nodes;
bea904d5 2302 unsigned short mode;
f5b087b5 2303 unsigned short flags = pol ? pol->flags : 0;
1a75a6c8 2304
2305 /*
2306 * Sanity check: room for longest mode, flag and some nodes
2307 */
2308 VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2309
2310 if (!pol || pol == &default_policy)
2311 mode = MPOL_DEFAULT;
2312 else
2313 mode = pol->mode;
2314
2315 switch (mode) {
2316 case MPOL_DEFAULT:
2317 nodes_clear(nodes);
2318 break;
2319
2320 case MPOL_PREFERRED:
2321 nodes_clear(nodes);
fc36b8d3 2322 if (flags & MPOL_F_LOCAL)
2323 mode = MPOL_LOCAL; /* pseudo-policy */
2324 else
fc36b8d3 2325 node_set(pol->v.preferred_node, nodes);
2326 break;
2327
2328 case MPOL_BIND:
19770b32 2329 /* Fall through */
1a75a6c8 2330 case MPOL_INTERLEAVE:
2331 if (no_context)
2332 nodes = pol->w.user_nodemask;
2333 else
2334 nodes = pol->v.nodes;
2335 break;
2336
2337 default:
2338 BUG();
2339 }
2340
2341 l = strlen(policy_types[mode]);
2342 if (buffer + maxlen < p + l + 1)
2343 return -ENOSPC;
2344
2345 strcpy(p, policy_types[mode]);
2346 p += l;
2347
fc36b8d3 2348 if (flags & MPOL_MODE_FLAGS) {
2349 if (buffer + maxlen < p + 2)
2350 return -ENOSPC;
2351 *p++ = '=';
2352
2353 /*
2354 * Currently, the only defined flags are mutually exclusive
2355 */
f5b087b5 2356 if (flags & MPOL_F_STATIC_NODES)
2357 p += snprintf(p, buffer + maxlen - p, "static");
2358 else if (flags & MPOL_F_RELATIVE_NODES)
2359 p += snprintf(p, buffer + maxlen - p, "relative");
2360 }
2361
2362 if (!nodes_empty(nodes)) {
2363 if (buffer + maxlen < p + 2)
2364 return -ENOSPC;
095f1fc4 2365 *p++ = ':';
2366 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2367 }
2368 return p - buffer;
2369}
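/*
 * Illustrative sketch, not part of mempolicy.c: formatting the current
 * task's policy into the representation produced above, e.g.
 * "interleave:0-3" or "prefer=static:1".  my_dump_task_policy() and the
 * buffer size are arbitrary choices for the example.
 */
static void my_dump_task_policy(void)
{
	char buf[64];

	/* a NULL current->mempolicy is reported as "default" */
	if (mpol_to_str(buf, sizeof(buf), current->mempolicy, 0) >= 0)
		pr_debug("mempolicy: %s\n", buf);
}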
2370
2371struct numa_maps {
2372 unsigned long pages;
2373 unsigned long anon;
2374 unsigned long active;
2375 unsigned long writeback;
1a75a6c8 2376 unsigned long mapcount_max;
2377 unsigned long dirty;
2378 unsigned long swapcache;
2379 unsigned long node[MAX_NUMNODES];
2380};
2381
397874df 2382static void gather_stats(struct page *page, void *private, int pte_dirty)
2383{
2384 struct numa_maps *md = private;
2385 int count = page_mapcount(page);
2386
2387 md->pages++;
2388 if (pte_dirty || PageDirty(page))
2389 md->dirty++;
1a75a6c8 2390
2391 if (PageSwapCache(page))
2392 md->swapcache++;
1a75a6c8 2393
894bc310 2394 if (PageActive(page) || PageUnevictable(page))
2395 md->active++;
2396
2397 if (PageWriteback(page))
2398 md->writeback++;
2399
2400 if (PageAnon(page))
2401 md->anon++;
2402
2403 if (count > md->mapcount_max)
2404 md->mapcount_max = count;
2405
1a75a6c8 2406 md->node[page_to_nid(page)]++;
2407}
2408
7f709ed0 2409#ifdef CONFIG_HUGETLB_PAGE
2410static void check_huge_range(struct vm_area_struct *vma,
2411 unsigned long start, unsigned long end,
2412 struct numa_maps *md)
2413{
2414 unsigned long addr;
2415 struct page *page;
2416 struct hstate *h = hstate_vma(vma);
2417 unsigned long sz = huge_page_size(h);
397874df 2418
2419 for (addr = start; addr < end; addr += sz) {
2420 pte_t *ptep = huge_pte_offset(vma->vm_mm,
2421 addr & huge_page_mask(h));
2422 pte_t pte;
2423
2424 if (!ptep)
2425 continue;
2426
2427 pte = *ptep;
2428 if (pte_none(pte))
2429 continue;
2430
2431 page = pte_page(pte);
2432 if (!page)
2433 continue;
2434
2435 gather_stats(page, md, pte_dirty(*ptep));
2436 }
2437}
2438#else
2439static inline void check_huge_range(struct vm_area_struct *vma,
2440 unsigned long start, unsigned long end,
2441 struct numa_maps *md)
2442{
2443}
2444#endif
397874df 2445
2446/*
2447 * Display pages allocated per node and memory policy via /proc.
2448 */
2449int show_numa_map(struct seq_file *m, void *v)
2450{
99f89551 2451 struct proc_maps_private *priv = m->private;
2452 struct vm_area_struct *vma = v;
2453 struct numa_maps *md;
2454 struct file *file = vma->vm_file;
2455 struct mm_struct *mm = vma->vm_mm;
480eccf9 2456 struct mempolicy *pol;
2457 int n;
2458 char buffer[50];
2459
397874df 2460 if (!mm)
2461 return 0;
2462
2463 md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2464 if (!md)
2465 return 0;
2466
480eccf9 2467 pol = get_vma_policy(priv->task, vma, vma->vm_start);
71fe804b 2468 mpol_to_str(buffer, sizeof(buffer), pol, 0);
52cd3b07 2469 mpol_cond_put(pol);
2470
2471 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2472
2473 if (file) {
2474 seq_printf(m, " file=");
c32c2f63 2475 seq_path(m, &file->f_path, "\n\t= ");
2476 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2477 seq_printf(m, " heap");
2478 } else if (vma->vm_start <= mm->start_stack &&
2479 vma->vm_end >= mm->start_stack) {
2480 seq_printf(m, " stack");
2481 }
2482
2483 if (is_vm_hugetlb_page(vma)) {
2484 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2485 seq_printf(m, " huge");
2486 } else {
a57ebfdb 2487 check_pgd_range(vma, vma->vm_start, vma->vm_end,
56bbd65d 2488 &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2489 }
2490
2491 if (!md->pages)
2492 goto out;
1a75a6c8 2493
2494 if (md->anon)
2495 seq_printf(m," anon=%lu",md->anon);
1a75a6c8 2496
2497 if (md->dirty)
2498 seq_printf(m," dirty=%lu",md->dirty);
1a75a6c8 2499
2500 if (md->pages != md->anon && md->pages != md->dirty)
2501 seq_printf(m, " mapped=%lu", md->pages);
1a75a6c8 2502
2503 if (md->mapcount_max > 1)
2504 seq_printf(m, " mapmax=%lu", md->mapcount_max);
1a75a6c8 2505
2506 if (md->swapcache)
2507 seq_printf(m," swapcache=%lu", md->swapcache);
2508
2509 if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2510 seq_printf(m," active=%lu", md->active);
2511
2512 if (md->writeback)
2513 seq_printf(m," writeback=%lu", md->writeback);
2514
56bbd65d 2515 for_each_node_state(n, N_HIGH_MEMORY)
2516 if (md->node[n])
2517 seq_printf(m, " N%d=%lu", n, md->node[n]);
2518out:
2519 seq_putc(m, '\n');
2520 kfree(md);
2521
2522 if (m->count < m->size)
99f89551 2523 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2524 return 0;
2525}
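/*
 * Illustrative user-space sketch, not part of the kernel: the per-vma
 * lines emitted by show_numa_map() can be read back from
 * /proc/<pid>/numa_maps; a line looks roughly like
 *   00400000 default file=/bin/cat mapped=7 mapmax=2 N0=7
 * (the sample values are made up).
 */
#include <stdio.h>

static void dump_numa_maps(void)
{
	char line[512];
	FILE *f = fopen("/proc/self/numa_maps", "r");

	if (!f)
		return;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
}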