/*
 * Xen mmu operations
 *
 * This file contains the various mmu fetch and update operations.
 * The most important job they must perform is the mapping between the
 * domain's pfn and the overall machine mfns.
 *
 * Xen allows guests to directly update the pagetable, in a controlled
 * fashion.  In other words, the guest modifies the same pagetable
 * that the CPU actually uses, which eliminates the overhead of having
 * a separate shadow pagetable.
 *
 * In order to allow this, it falls on the guest domain to map its
 * notion of a "physical" pfn - which is just a domain-local linear
 * address - into a real "machine address" which the CPU's MMU can
 * use.
 *
 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
 * inserted directly into the pagetable.  When creating a new
 * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
 * when reading the content back with __(pgd|pmd|pte)_val, it converts
 * the mfn back into a pfn.
 *
 * The other constraint is that all pages which make up a pagetable
 * must be mapped read-only in the guest.  This prevents uncontrolled
 * guest updates to the pagetable.  Xen strictly enforces this, and
 * will disallow any pagetable update which will end up mapping a
 * pagetable page RW, and will disallow using any writable page as a
 * pagetable.
 *
 * Naively, when loading %cr3 with the base of a new pagetable, Xen
 * would need to validate the whole pagetable before going on.
 * Naturally, this is quite slow.  The solution is to "pin" a
 * pagetable, which enforces all the constraints on the pagetable even
 * when it is not actively in use.  This means that Xen can be assured
 * that it is still valid when you do load it into %cr3, and doesn't
 * need to revalidate it.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/bug.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/paravirt.h>
#include <asm/linkage.h>

#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>

#include <xen/page.h>
#include <xen/interface/xen.h>

#include "multicalls.h"
#include "mmu.h"

#define P2M_ENTRIES_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long))
#define TOP_ENTRIES		(MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)

/* Placeholder for holes in the address space */
static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
	{ [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };

/* Array of pointers to pages containing p2m entries */
static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
	{ [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };

/* Arrays of p2m arrays expressed in mfns used for save/restore */
static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;

static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
	__page_aligned_bss;

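/*
 * The p2m table is a two-level lookup: p2m_top[] selects the page of
 * entries covering a pfn, and the entry within that page holds the mfn.
 * Ranges without a p2m page yet all share p2m_missing, whose entries
 * are all INVALID_P2M_ENTRY (~0UL).
 */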
static inline unsigned p2m_top_index(unsigned long pfn)
{
	BUG_ON(pfn >= MAX_DOMAIN_PAGES);
	return pfn / P2M_ENTRIES_PER_PAGE;
}

static inline unsigned p2m_index(unsigned long pfn)
{
	return pfn % P2M_ENTRIES_PER_PAGE;
}

/* Build the parallel p2m_top_mfn structures */
void xen_setup_mfn_list_list(void)
{
	unsigned pfn, idx;

	for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
		unsigned topidx = p2m_top_index(pfn);

		p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
	}

	for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
		unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
		p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
	}

	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);

	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
		virt_to_mfn(p2m_top_mfn_list);
	HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
}

/* Set up p2m_top to point to the domain-builder provided p2m pages */
void __init xen_build_dynamic_phys_to_machine(void)
{
	unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
	unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
	unsigned pfn;

	for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
		unsigned topidx = p2m_top_index(pfn);

		p2m_top[topidx] = &mfn_list[pfn];
	}
}

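/* Look up the mfn for a pfn; out-of-range pfns report INVALID_P2M_ENTRY. */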
unsigned long get_phys_to_machine(unsigned long pfn)
{
	unsigned topidx, idx;

	if (unlikely(pfn >= MAX_DOMAIN_PAGES))
		return INVALID_P2M_ENTRY;

	topidx = p2m_top_index(pfn);
	idx = p2m_index(pfn);
	return p2m_top[topidx][idx];
}
EXPORT_SYMBOL_GPL(get_phys_to_machine);

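/*
 * Install a freshly allocated leaf page of INVALID_P2M_ENTRYs in place of
 * p2m_missing.  The cmpxchg() means a racing allocator simply frees its
 * copy; on success the page's mfn is also recorded for save/restore.
 */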
static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
{
	unsigned long *p;
	unsigned i;

	p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
	BUG_ON(p == NULL);

	for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
		p[i] = INVALID_P2M_ENTRY;

	if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
		free_page((unsigned long)p);
	else
		*mfnp = virt_to_mfn(p);
}

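/*
 * Record the mfn backing a pfn.  Leaf pages are allocated on demand;
 * storing INVALID_P2M_ENTRY into an unpopulated range is a no-op.
 */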
void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
	unsigned topidx, idx;

	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
		return;
	}

	if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
		BUG_ON(mfn != INVALID_P2M_ENTRY);
		return;
	}

	topidx = p2m_top_index(pfn);
	if (p2m_top[topidx] == p2m_missing) {
		/* no need to allocate a page to store an invalid entry */
		if (mfn == INVALID_P2M_ENTRY)
			return;
		alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]);
	}

	idx = p2m_index(pfn);
	p2m_top[topidx][idx] = mfn;
}

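/*
 * Translate a kernel virtual address to a machine address by walking the
 * live pagetable, so it also works for addresses (ioremap, fixmap) that
 * are outside the linear lowmem mapping.
 */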
xmaddr_t arbitrary_virt_to_machine(void *vaddr)
{
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;
	pte_t *pte = lookup_address(address, &level);
	unsigned offset = address & ~PAGE_MASK;

	BUG_ON(pte == NULL);

	return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
}

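/*
 * Flip the protection of an already-mapped lowmem kernel page by asking
 * Xen to update the linear-map pte; a direct pte write is not possible
 * once the pte page itself is part of a pinned (read-only) pagetable.
 */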
void make_lowmem_page_readonly(void *vaddr)
{
	pte_t *pte, ptev;
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;

	pte = lookup_address(address, &level);
	BUG_ON(pte == NULL);

	ptev = pte_wrprotect(*pte);

	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
		BUG();
}

void make_lowmem_page_readwrite(void *vaddr)
{
	pte_t *pte, ptev;
	unsigned long address = (unsigned long)vaddr;
	unsigned int level;

	pte = lookup_address(address, &level);
	BUG_ON(pte == NULL);

	ptev = pte_mkwrite(*pte);

	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
		BUG();
}


static bool page_pinned(void *ptr)
{
	struct page *page = virt_to_page(ptr);

	return PagePinned(page);
}

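/*
 * Add one mmu_update request to the current multicall batch, extending
 * the argument list of an in-flight MULTI_mmu_update call when possible
 * and starting a new one otherwise.
 */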
static void extend_mmu_update(const struct mmu_update *update)
{
	struct multicall_space mcs;
	struct mmu_update *u;

	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));

	if (mcs.mc != NULL)
		mcs.mc->args[1]++;
	else {
		mcs = __xen_mc_entry(sizeof(*u));
		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
	}

	u = mcs.args;
	*u = *update;
}

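/*
 * Set a pmd entry unconditionally via the hypervisor.  This is required
 * once the containing page is pinned (and therefore read-only); updates
 * are batched under lazy MMU mode.
 */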
void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
{
	struct mmu_update u;

	preempt_disable();

	xen_mc_batch();

	/* ptr may be ioremapped for 64-bit pagetable setup */
	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
	u.val = pmd_val_ma(val);
	extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
	/* If page is not pinned, we can just update the entry
	   directly */
	if (!page_pinned(ptr)) {
		*ptr = val;
		return;
	}

	xen_set_pmd_hyper(ptr, val);
}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = swapper_pg_dir + pgd_index(vaddr);
	if (pgd_none(*pgd)) {
		BUG();
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		BUG();
		return;
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		BUG();
		return;
	}
	pte = pte_offset_kernel(pmd, vaddr);
	/* <mfn,flags> stored as-is, to permit clearing entries */
	xen_set_pte(pte, mfn_pte(mfn, flags));

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

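/*
 * If the address is mapped in the current pagetable (current->mm or
 * init_mm), the cheaper update_va_mapping hypercall can be used, batched
 * when in lazy MMU mode; otherwise fall back to setting the pte itself.
 */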
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, pte_t pteval)
{
	/* updates to init_mm may be done without lock */
	if (mm == &init_mm)
		preempt_disable();

	if (mm == current->mm || mm == &init_mm) {
		if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
			struct multicall_space mcs;
			mcs = xen_mc_entry(0);

			MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
			xen_mc_issue(PARAVIRT_LAZY_MMU);
			goto out;
		} else
			if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
				goto out;
	}
	xen_set_pte(ptep, pteval);

out:
	if (mm == &init_mm)
		preempt_enable();
}

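/*
 * start/commit pair for ptep_modify_prot: the commit uses
 * MMU_PT_UPDATE_PRESERVE_AD so that accessed/dirty bits set by hardware
 * between the two calls are not lost.
 */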
pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	/* Just return the pte as-is.  We preserve the bits on commit */
	return *ptep;
}

void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
				 pte_t *ptep, pte_t pte)
{
	struct mmu_update u;

	xen_mc_batch();

	u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
	u.val = pte_val_ma(pte);
	extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}

/* Assume pteval_t is equivalent to all the other *val_t types. */
static pteval_t pte_mfn_to_pfn(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT;
		pteval_t flags = val & ~PTE_MASK;
		val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
	}

	return val;
}

static pteval_t pte_pfn_to_mfn(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT;
		pteval_t flags = val & ~PTE_MASK;
		val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
	}

	return val;
}

pteval_t xen_pte_val(pte_t pte)
{
	return pte_mfn_to_pfn(pte.pte);
}

pgdval_t xen_pgd_val(pgd_t pgd)
{
	return pte_mfn_to_pfn(pgd.pgd);
}

pte_t xen_make_pte(pteval_t pte)
{
	pte = pte_pfn_to_mfn(pte);
	return native_make_pte(pte);
}

pgd_t xen_make_pgd(pgdval_t pgd)
{
	pgd = pte_pfn_to_mfn(pgd);
	return native_make_pgd(pgd);
}

pmdval_t xen_pmd_val(pmd_t pmd)
{
	return pte_mfn_to_pfn(pmd.pmd);
}

void xen_set_pud_hyper(pud_t *ptr, pud_t val)
{
	struct mmu_update u;

	preempt_disable();

	xen_mc_batch();

	/* ptr may be ioremapped for 64-bit pagetable setup */
	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
	u.val = pud_val_ma(val);
	extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

void xen_set_pud(pud_t *ptr, pud_t val)
{
	/* If page is not pinned, we can just update the entry
	   directly */
	if (!page_pinned(ptr)) {
		*ptr = val;
		return;
	}

	xen_set_pud_hyper(ptr, val);
}

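/*
 * On PAE the two pte words cannot be written atomically, so write the
 * high word first and the low word (which holds the present bit) last.
 */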
void xen_set_pte(pte_t *ptep, pte_t pte)
{
#ifdef CONFIG_X86_PAE
	ptep->pte_high = pte.pte_high;
	smp_wmb();
	ptep->pte_low = pte.pte_low;
#else
	*ptep = pte;
#endif
}

#ifdef CONFIG_X86_PAE
void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
	set_64bit((u64 *)ptep, native_pte_val(pte));
}

void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	ptep->pte_low = 0;
	smp_wmb();		/* make sure low gets written first */
	ptep->pte_high = 0;
}

void xen_pmd_clear(pmd_t *pmdp)
{
	set_pmd(pmdp, __pmd(0));
}
#endif	/* CONFIG_X86_PAE */

pmd_t xen_make_pmd(pmdval_t pmd)
{
	pmd = pte_pfn_to_mfn(pmd);
	return native_make_pmd(pmd);
}

#if PAGETABLE_LEVELS == 4
pudval_t xen_pud_val(pud_t pud)
{
	return pte_mfn_to_pfn(pud.pud);
}

pud_t xen_make_pud(pudval_t pud)
{
	pud = pte_pfn_to_mfn(pud);

	return native_make_pud(pud);
}

void xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
	struct mmu_update u;

	preempt_disable();

	xen_mc_batch();

	u.ptr = virt_to_machine(ptr).maddr;
	u.val = pgd_val_ma(val);
	extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

void xen_set_pgd(pgd_t *ptr, pgd_t val)
{
	/* If page is not pinned, we can just update the entry
	   directly */
	if (!page_pinned(ptr)) {
		*ptr = val;
		return;
	}

	xen_set_pgd_hyper(ptr, val);
}
#endif	/* PAGETABLE_LEVELS == 4 */

/*
  (Yet another) pagetable walker.  This one is intended for pinning a
  pagetable.  This means that it walks a pagetable and calls the
  callback function on each page it finds making up the page table,
  at every level.  It walks the entire pagetable, but it only bothers
  pinning pte pages which are below the limit.  In the normal case
  this will be TASK_SIZE, but at boot we need to pin up to
  FIXADDR_TOP.  But the important bit is that we don't pin beyond
  there, because then we start getting into Xen's ptes.
*/
static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
		    unsigned long limit)
{
	pgd_t *pgd = pgd_base;
	int flush = 0;
	unsigned long addr = 0;
	unsigned long pgd_next;

	BUG_ON(limit > FIXADDR_TOP);

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
		pud_t *pud;
		unsigned long pud_limit, pud_next;

		pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);

		if (!pgd_val(*pgd))
			continue;

		pud = pud_offset(pgd, 0);

		if (PTRS_PER_PUD > 1) /* not folded */
			flush |= (*func)(virt_to_page(pud), PT_PUD);

		for (; addr != pud_limit; pud++, addr = pud_next) {
			pmd_t *pmd;
			unsigned long pmd_limit;

			pud_next = pud_addr_end(addr, pud_limit);

			if (pud_next < limit)
				pmd_limit = pud_next;
			else
				pmd_limit = limit;

			if (pud_none(*pud))
				continue;

			pmd = pmd_offset(pud, 0);

			if (PTRS_PER_PMD > 1) /* not folded */
				flush |= (*func)(virt_to_page(pmd), PT_PMD);

			for (; addr != pmd_limit; pmd++) {
				addr += (PAGE_SIZE * PTRS_PER_PTE);
				if ((pmd_limit-1) < (addr-1)) {
					addr = pmd_limit;
					break;
				}

				if (pmd_none(*pmd))
					continue;

				flush |= (*func)(pmd_page(*pmd), PT_PTE);
			}
		}
	}

	flush |= (*func)(virt_to_page(pgd_base), PT_PGD);

	return flush;
}

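/*
 * Take the pte lock for a pagetable page when split pte locks are
 * configured; the matching unlock is deferred to a multicall completion
 * callback via do_unlock().
 */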
static spinlock_t *lock_pte(struct page *page)
{
	spinlock_t *ptl = NULL;

#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
	ptl = __pte_lockptr(page);
	spin_lock(ptl);
#endif

	return ptl;
}

static void do_unlock(void *v)
{
	spinlock_t *ptl = v;
	spin_unlock(ptl);
}

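/* Queue a single MMUEXT pin/unpin operation in the current multicall batch. */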
static void xen_do_pin(unsigned level, unsigned long pfn)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	mcs = __xen_mc_entry(sizeof(*op));
	op = mcs.args;
	op->cmd = level;
	op->arg1.mfn = pfn_to_mfn(pfn);
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
}

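/*
 * pgd_walk() callback: mark a pagetable page pinned and remap it
 * read-only.  Returns non-zero if the caller needs to flush kernel
 * mappings because an unpinned highmem page was found.
 */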
static int pin_page(struct page *page, enum pt_level level)
{
	unsigned pgfl = TestSetPagePinned(page);
	int flush;

	if (pgfl)
		flush = 0;		/* already pinned */
	else if (PageHighMem(page))
		/* kmaps need flushing if we found an unpinned
		   highpage */
		flush = 1;
	else {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		struct multicall_space mcs = __xen_mc_entry(0);
		spinlock_t *ptl;

		flush = 0;

		ptl = NULL;
		if (level == PT_PTE)
			ptl = lock_pte(page);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL_RO),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (level == PT_PTE)
			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);

		if (ptl) {
			/* Queue a deferred unlock for when this batch
			   is completed. */
			xen_mc_callback(do_unlock, ptl);
		}
	}

	return flush;
}

/* This is called just after a mm has been created, but it has not
   been used yet.  We need to make sure that its pagetable is all
   read-only, and can be pinned. */
void xen_pgd_pin(pgd_t *pgd)
{
	xen_mc_batch();

	if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
		/* re-enable interrupts for kmap_flush_unused */
		xen_mc_issue(0);
		kmap_flush_unused();
		xen_mc_batch();
	}

	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
	xen_mc_issue(0);
}

/*
 * On save, we need to pin all pagetables to make sure they get their
 * mfns turned into pfns.  Search the list for any unpinned pgds and pin
 * them (unpinned pgds are not currently in use, probably because the
 * process is under construction or destruction).
 */
void xen_mm_pin_all(void)
{
	unsigned long flags;
	struct page *page;

	spin_lock_irqsave(&pgd_lock, flags);

	list_for_each_entry(page, &pgd_list, lru) {
		if (!PagePinned(page)) {
			xen_pgd_pin((pgd_t *)page_address(page));
			SetPageSavePinned(page);
		}
	}

	spin_unlock_irqrestore(&pgd_lock, flags);
}

/*
 * The init_mm pagetable is really pinned as soon as it's created, but
 * that's before we have page structures to store the bits.  So do all
 * the book-keeping now.
 */
static __init int mark_pinned(struct page *page, enum pt_level level)
{
	SetPagePinned(page);
	return 0;
}

void __init xen_mark_init_mm_pinned(void)
{
	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
}

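/*
 * pgd_walk() callback for unpinning: clear the page's pinned flag and,
 * for lowmem pages, remap it read-write again via the multicall batch.
 */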
static int unpin_page(struct page *page, enum pt_level level)
{
	unsigned pgfl = TestClearPagePinned(page);

	if (pgfl && !PageHighMem(page)) {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		spinlock_t *ptl = NULL;
		struct multicall_space mcs;

		if (level == PT_PTE) {
			ptl = lock_pte(page);

			xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
		}

		mcs = __xen_mc_entry(0);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (ptl) {
			/* unlock when batch completed */
			xen_mc_callback(do_unlock, ptl);
		}
	}

	return 0;		/* never need to flush on unpin */
}

/* Release a pagetable's pages back as normal RW */
static void xen_pgd_unpin(pgd_t *pgd)
{
	xen_mc_batch();

	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

	pgd_walk(pgd, unpin_page, TASK_SIZE);

	xen_mc_issue(0);
}

/*
 * On resume, undo any pinning done at save, so that the rest of the
 * kernel doesn't see any unexpected pinned pagetables.
 */
void xen_mm_unpin_all(void)
{
	unsigned long flags;
	struct page *page;

	spin_lock_irqsave(&pgd_lock, flags);

	list_for_each_entry(page, &pgd_list, lru) {
		if (PageSavePinned(page)) {
			BUG_ON(!PagePinned(page));
			printk("unpinning pinned %p\n", page_address(page));
			xen_pgd_unpin((pgd_t *)page_address(page));
			ClearPageSavePinned(page);
		}
	}

	spin_unlock_irqrestore(&pgd_lock, flags);
}

void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
	spin_lock(&next->page_table_lock);
	xen_pgd_pin(next->pgd);
	spin_unlock(&next->page_table_lock);
}

void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
	spin_lock(&mm->page_table_lock);
	xen_pgd_pin(mm->pgd);
	spin_unlock(&mm->page_table_lock);
}


#ifdef CONFIG_SMP
/* Another cpu may still have its %cr3 pointing at the pagetable, so
   we need to repoint it somewhere else before we can unpin it. */
static void drop_other_mm_ref(void *info)
{
	struct mm_struct *mm = info;
	struct mm_struct *active_mm;

#ifdef CONFIG_X86_64
	active_mm = read_pda(active_mm);
#else
	active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
#endif

	if (active_mm == mm)
		leave_mm(smp_processor_id());

	/* If this cpu still has a stale cr3 reference, then make sure
	   it has been flushed. */
	if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
		load_cr3(swapper_pg_dir);
		arch_flush_lazy_cpu_mode();
	}
}

static void drop_mm_ref(struct mm_struct *mm)
{
	cpumask_t mask;
	unsigned cpu;

	if (current->active_mm == mm) {
		if (current->mm == mm)
			load_cr3(swapper_pg_dir);
		else
			leave_mm(smp_processor_id());
		arch_flush_lazy_cpu_mode();
	}

	/* Get the "official" set of cpus referring to our pagetable. */
	mask = mm->cpu_vm_mask;

	/* It's possible that a vcpu may have a stale reference to our
	   cr3, because it's in lazy mode and hasn't yet flushed its
	   set of pending hypercalls.  In this case, we can look at its
	   actual current cr3 value, and force it to flush if needed. */
	for_each_online_cpu(cpu) {
		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
			cpu_set(cpu, mask);
	}

	if (!cpus_empty(mask))
		smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
}
#else
static void drop_mm_ref(struct mm_struct *mm)
{
	if (current->active_mm == mm)
		load_cr3(swapper_pg_dir);
}
#endif

/*
 * While a process runs, Xen pins its pagetable, which means that the
 * hypervisor forces it to be read-only and controls all updates to it.
 * This means that all pagetable updates have to go via the hypervisor,
 * which is moderately expensive.
 *
 * Since we're pulling the pagetable down, we switch to use init_mm,
 * unpin the old process's pagetable and mark it all read-write, which
 * allows further operations on it to be simple memory accesses.
 *
 * The only subtle point is that another CPU may be still using the
 * pagetable because of lazy tlb flushing.  This means we need to
 * switch all CPUs off this pagetable before we can unpin it.
 */
void xen_exit_mmap(struct mm_struct *mm)
{
	get_cpu();		/* make sure we don't move around */
	drop_mm_ref(mm);
	put_cpu();

	spin_lock(&mm->page_table_lock);

	/* pgd may not be pinned in the error exit path of execve */
	if (page_pinned(mm->pgd))
		xen_pgd_unpin(mm->pgd);

	spin_unlock(&mm->page_table_lock);
}