net-next-2.6.git blame: arch/x86/xen/mmu.c
(blame taken at commit "x86/paravirt: implement PVOP_CALL macros for callee-save functions")
3b827c1b
JF
1/*
2 * Xen mmu operations
3 *
4 * This file contains the various mmu fetch and update operations.
5 * The most important job they must perform is the mapping between the
6 * domain's pfn and the overall machine mfns.
7 *
8 * Xen allows guests to directly update the pagetable, in a controlled
9 * fashion. In other words, the guest modifies the same pagetable
10 * that the CPU actually uses, which eliminates the overhead of having
11 * a separate shadow pagetable.
12 *
13 * In order to allow this, it falls on the guest domain to map its
14 * notion of a "physical" pfn - which is just a domain-local linear
15 * address - into a real "machine address" which the CPU's MMU can
16 * use.
17 *
18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19 * inserted directly into the pagetable. When creating a new
20 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
22 * the mfn back into a pfn.
23 *
24 * The other constraint is that all pages which make up a pagetable
25 * must be mapped read-only in the guest. This prevents uncontrolled
26 * guest updates to the pagetable. Xen strictly enforces this, and
27 * will disallow any pagetable update which will end up mapping a
28 * pagetable page RW, and will disallow using any writable page as a
29 * pagetable.
30 *
31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
32 * would need to validate the whole pagetable before going on.
33 * Naturally, this is quite slow. The solution is to "pin" a
34 * pagetable, which enforces all the constraints on the pagetable even
35 * when it is not actively in use. This means that Xen can be assured
36 * that it is still valid when you do load it into %cr3, and doesn't
37 * need to revalidate it.
38 *
39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
40 */
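/*
 * Illustrative, self-contained sketch (not part of this file): the
 * "validate once at pin time, then load cheaply" idea described in the
 * comment above, reduced to a toy in plain C.  The pagetable struct,
 * the validation cost and the pin flag are all hypothetical; the point
 * is only that a pinned table skips revalidation on later %cr3 loads.
 */
#include <stdio.h>
#include <stdbool.h>

struct demo_pgtable {
	bool pinned;		/* validated and locked read-only */
	unsigned validations;	/* how often the expensive check ran */
};

static void demo_validate(struct demo_pgtable *pt)
{
	pt->validations++;	/* stands in for walking every entry */
}

static void demo_pin(struct demo_pgtable *pt)
{
	demo_validate(pt);
	pt->pinned = true;
}

static void demo_load_cr3(struct demo_pgtable *pt)
{
	if (!pt->pinned)
		demo_validate(pt);	/* unpinned: revalidate every load */
}

int main(void)
{
	struct demo_pgtable pt = { false, 0 };

	demo_pin(&pt);
	demo_load_cr3(&pt);
	demo_load_cr3(&pt);
	printf("validations: %u\n", pt.validations);	/* 1, not 3 */
	return 0;
}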
f120f13e 41#include <linux/sched.h>
f4f97b3e 42#include <linux/highmem.h>
994025ca 43#include <linux/debugfs.h>
3b827c1b 44#include <linux/bug.h>
3b827c1b
JF
45
46#include <asm/pgtable.h>
47#include <asm/tlbflush.h>
5deb30d1 48#include <asm/fixmap.h>
3b827c1b 49#include <asm/mmu_context.h>
319f3ba5 50#include <asm/setup.h>
f4f97b3e 51#include <asm/paravirt.h>
cbcd79c2 52#include <asm/linkage.h>
3b827c1b
JF
53
54#include <asm/xen/hypercall.h>
f4f97b3e 55#include <asm/xen/hypervisor.h>
3b827c1b
JF
56
57#include <xen/page.h>
58#include <xen/interface/xen.h>
319f3ba5
JF
59#include <xen/interface/version.h>
60#include <xen/hvc-console.h>
3b827c1b 61
f4f97b3e 62#include "multicalls.h"
3b827c1b 63#include "mmu.h"
994025ca
JF
64#include "debugfs.h"
65
66#define MMU_UPDATE_HISTO 30
67
68#ifdef CONFIG_XEN_DEBUG_FS
69
70static struct {
71 u32 pgd_update;
72 u32 pgd_update_pinned;
73 u32 pgd_update_batched;
74
75 u32 pud_update;
76 u32 pud_update_pinned;
77 u32 pud_update_batched;
78
79 u32 pmd_update;
80 u32 pmd_update_pinned;
81 u32 pmd_update_batched;
82
83 u32 pte_update;
84 u32 pte_update_pinned;
85 u32 pte_update_batched;
86
87 u32 mmu_update;
88 u32 mmu_update_extended;
89 u32 mmu_update_histo[MMU_UPDATE_HISTO];
90
91 u32 prot_commit;
92 u32 prot_commit_batched;
93
94 u32 set_pte_at;
95 u32 set_pte_at_batched;
96 u32 set_pte_at_pinned;
97 u32 set_pte_at_current;
98 u32 set_pte_at_kernel;
99} mmu_stats;
100
101static u8 zero_stats;
102
103static inline void check_zero(void)
104{
105 if (unlikely(zero_stats)) {
106 memset(&mmu_stats, 0, sizeof(mmu_stats));
107 zero_stats = 0;
108 }
109}
110
111#define ADD_STATS(elem, val) \
112 do { check_zero(); mmu_stats.elem += (val); } while(0)
113
114#else /* !CONFIG_XEN_DEBUG_FS */
115
116#define ADD_STATS(elem, val) do { (void)(val); } while(0)
117
118#endif /* CONFIG_XEN_DEBUG_FS */
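/*
 * Illustrative, self-contained sketch (not part of this file): the
 * zero-on-demand counter pattern used by check_zero()/ADD_STATS above,
 * modelled in plain C with a hypothetical two-counter stats struct.
 * Setting the zero flag (as writing the debugfs "zero_stats" file does)
 * clears every counter just before the next increment is recorded.
 */
#include <stdio.h>
#include <string.h>

static struct {
	unsigned pmd_update;
	unsigned pte_update;
} demo_stats;

static unsigned char demo_zero_stats;

static void demo_check_zero(void)
{
	if (demo_zero_stats) {
		memset(&demo_stats, 0, sizeof(demo_stats));
		demo_zero_stats = 0;
	}
}

#define DEMO_ADD_STATS(elem, val) \
	do { demo_check_zero(); demo_stats.elem += (val); } while (0)

int main(void)
{
	DEMO_ADD_STATS(pte_update, 3);
	demo_zero_stats = 1;		/* like echo 1 > zero_stats */
	DEMO_ADD_STATS(pmd_update, 1);	/* resets, then counts 1 */
	printf("pte=%u pmd=%u\n", demo_stats.pte_update, demo_stats.pmd_update);
	return 0;
}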
3b827c1b 119
319f3ba5
JF
120
121/*
122 * Identity map, in addition to plain kernel map. This needs to be
123 * large enough to map the page-table pages needed to map the rest.
124 * Each page can map 2MB.
125 */
126static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
127
128#ifdef CONFIG_X86_64
129/* l3 pud for userspace vsyscall mapping */
130static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
131#endif /* CONFIG_X86_64 */
132
133/*
134 * Note about cr3 (pagetable base) values:
135 *
136 * xen_cr3 contains the current logical cr3 value; it contains the
137 * last set cr3. This may not be the current effective cr3, because
138 * its update may be being lazily deferred. However, a vcpu looking
139 * at its own cr3 can use this value knowing that everything will
140 * be self-consistent.
141 *
142 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
143 * hypercall to set the vcpu cr3 is complete (so it may be a little
144 * out of date, but it will never be set early). If one vcpu is
145 * looking at another vcpu's cr3 value, it should use this variable.
146 */
147DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
148DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
149
150
d6182fbf
JF
151/*
152 * Just beyond the highest usermode address. STACK_TOP_MAX has a
153 * redzone above it, so round it up to a PGD boundary.
154 */
155#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
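/*
 * Illustrative, self-contained sketch (not part of this file): the
 * round-up-to-a-PGD-boundary arithmetic used by USER_LIMIT above,
 * shown with a hypothetical 4KiB "boundary" so the numbers are easy to
 * check by hand: (x + size - 1) & ~(size - 1).
 */
#include <stdio.h>

#define DEMO_SIZE	0x1000UL		/* pretend PGDIR_SIZE */
#define DEMO_MASK	(~(DEMO_SIZE - 1))	/* pretend PGDIR_MASK */
#define DEMO_ALIGN_UP(x) (((x) + DEMO_SIZE - 1) & DEMO_MASK)

int main(void)
{
	printf("%#lx -> %#lx\n", 0x1234UL, DEMO_ALIGN_UP(0x1234UL)); /* 0x2000 */
	printf("%#lx -> %#lx\n", 0x2000UL, DEMO_ALIGN_UP(0x2000UL)); /* 0x2000 */
	return 0;
}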
156
157
d451bb7a 158#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
cf0923ea 159#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
d451bb7a 160
cf0923ea 161/* Placeholder for holes in the address space */
cbcd79c2 162static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
cf0923ea
JF
163 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
164
165 /* Array of pointers to pages containing p2m entries */
cbcd79c2 166static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
cf0923ea 167 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
d451bb7a 168
d5edbc1f 169/* Arrays of p2m arrays expressed in mfns used for save/restore */
cbcd79c2 170static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
d5edbc1f 171
cbcd79c2
JF
172static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
173 __page_aligned_bss;
d5edbc1f 174
d451bb7a
JF
175static inline unsigned p2m_top_index(unsigned long pfn)
176{
8006ec3e 177 BUG_ON(pfn >= MAX_DOMAIN_PAGES);
d451bb7a
JF
178 return pfn / P2M_ENTRIES_PER_PAGE;
179}
180
181static inline unsigned p2m_index(unsigned long pfn)
182{
183 return pfn % P2M_ENTRIES_PER_PAGE;
184}
185
d5edbc1f
JF
186/* Build the parallel p2m_top_mfn structures */
187void xen_setup_mfn_list_list(void)
188{
189 unsigned pfn, idx;
190
f63c2f24 191 for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
d5edbc1f
JF
192 unsigned topidx = p2m_top_index(pfn);
193
194 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
195 }
196
f63c2f24 197 for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
d5edbc1f
JF
198 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
199 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
200 }
201
202 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
203
204 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
205 virt_to_mfn(p2m_top_mfn_list);
206 HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
207}
208
209/* Set up p2m_top to point to the domain-builder provided p2m pages */
d451bb7a
JF
210void __init xen_build_dynamic_phys_to_machine(void)
211{
d451bb7a 212 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
8006ec3e 213 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
d5edbc1f 214 unsigned pfn;
d451bb7a 215
f63c2f24 216 for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
d451bb7a
JF
217 unsigned topidx = p2m_top_index(pfn);
218
219 p2m_top[topidx] = &mfn_list[pfn];
220 }
221}
222
223unsigned long get_phys_to_machine(unsigned long pfn)
224{
225 unsigned topidx, idx;
226
8006ec3e
JF
227 if (unlikely(pfn >= MAX_DOMAIN_PAGES))
228 return INVALID_P2M_ENTRY;
229
d451bb7a 230 topidx = p2m_top_index(pfn);
d451bb7a
JF
231 idx = p2m_index(pfn);
232 return p2m_top[topidx][idx];
233}
15ce6005 234EXPORT_SYMBOL_GPL(get_phys_to_machine);
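/*
 * Illustrative, self-contained sketch (not part of this file): the
 * two-level p2m layout used above, with a single shared "missing" page
 * standing in for unpopulated leaves.  Sizes are hypothetical (16
 * entries per page) so the demo stays small; the kernel code uses
 * PAGE_SIZE/sizeof(unsigned long) entries and allocates real pages.
 */
#include <stdio.h>
#include <stdlib.h>

#define DEMO_ENTRIES_PER_PAGE	16
#define DEMO_TOP_ENTRIES	8
#define DEMO_INVALID		(~0UL)

static unsigned long demo_missing[DEMO_ENTRIES_PER_PAGE] = {
	[0 ... DEMO_ENTRIES_PER_PAGE - 1] = DEMO_INVALID
};
static unsigned long *demo_top[DEMO_TOP_ENTRIES] = {
	[0 ... DEMO_TOP_ENTRIES - 1] = demo_missing
};

static unsigned long demo_get(unsigned long pfn)
{
	return demo_top[pfn / DEMO_ENTRIES_PER_PAGE][pfn % DEMO_ENTRIES_PER_PAGE];
}

static void demo_set(unsigned long pfn, unsigned long mfn)
{
	unsigned topidx = pfn / DEMO_ENTRIES_PER_PAGE;

	if (demo_top[topidx] == demo_missing) {
		/* populate the leaf on first real entry, as alloc_p2m() does */
		unsigned long *p = malloc(sizeof(demo_missing));
		unsigned i;

		if (!p)
			abort();
		for (i = 0; i < DEMO_ENTRIES_PER_PAGE; i++)
			p[i] = DEMO_INVALID;
		demo_top[topidx] = p;
	}
	demo_top[topidx][pfn % DEMO_ENTRIES_PER_PAGE] = mfn;
}

int main(void)
{
	printf("before: %lx\n", demo_get(20));	/* DEMO_INVALID */
	demo_set(20, 0x1234);
	printf("after:  %lx\n", demo_get(20));	/* 0x1234 */
	return 0;
}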
d451bb7a 235
d5edbc1f 236static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
d451bb7a
JF
237{
238 unsigned long *p;
239 unsigned i;
240
241 p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
242 BUG_ON(p == NULL);
243
f63c2f24 244 for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
d451bb7a
JF
245 p[i] = INVALID_P2M_ENTRY;
246
cf0923ea 247 if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
d451bb7a 248 free_page((unsigned long)p);
d5edbc1f
JF
249 else
250 *mfnp = virt_to_mfn(p);
d451bb7a
JF
251}
252
253void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
254{
255 unsigned topidx, idx;
256
257 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
258 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
8006ec3e
JF
259 return;
260 }
261
262 if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
263 BUG_ON(mfn != INVALID_P2M_ENTRY);
d451bb7a
JF
264 return;
265 }
266
267 topidx = p2m_top_index(pfn);
cf0923ea 268 if (p2m_top[topidx] == p2m_missing) {
d451bb7a
JF
269 /* no need to allocate a page to store an invalid entry */
270 if (mfn == INVALID_P2M_ENTRY)
271 return;
d5edbc1f 272 alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]);
d451bb7a
JF
273 }
274
275 idx = p2m_index(pfn);
276 p2m_top[topidx][idx] = mfn;
277}
278
ce803e70 279xmaddr_t arbitrary_virt_to_machine(void *vaddr)
3b827c1b 280{
ce803e70 281 unsigned long address = (unsigned long)vaddr;
da7bfc50 282 unsigned int level;
9f32d21c
CL
283 pte_t *pte;
284 unsigned offset;
3b827c1b 285
9f32d21c
CL
286 /*
287 * if the PFN is in the linear mapped vaddr range, we can just use
288 * the (quick) virt_to_machine() p2m lookup
289 */
290 if (virt_addr_valid(vaddr))
291 return virt_to_machine(vaddr);
292
293 /* otherwise we have to do a (slower) full page-table walk */
3b827c1b 294
9f32d21c
CL
295 pte = lookup_address(address, &level);
296 BUG_ON(pte == NULL);
297 offset = address & ~PAGE_MASK;
ebd879e3 298 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
3b827c1b
JF
299}
300
301void make_lowmem_page_readonly(void *vaddr)
302{
303 pte_t *pte, ptev;
304 unsigned long address = (unsigned long)vaddr;
da7bfc50 305 unsigned int level;
3b827c1b 306
f0646e43 307 pte = lookup_address(address, &level);
3b827c1b
JF
308 BUG_ON(pte == NULL);
309
310 ptev = pte_wrprotect(*pte);
311
312 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
313 BUG();
314}
315
316void make_lowmem_page_readwrite(void *vaddr)
317{
318 pte_t *pte, ptev;
319 unsigned long address = (unsigned long)vaddr;
da7bfc50 320 unsigned int level;
3b827c1b 321
f0646e43 322 pte = lookup_address(address, &level);
3b827c1b
JF
323 BUG_ON(pte == NULL);
324
325 ptev = pte_mkwrite(*pte);
326
327 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
328 BUG();
329}
330
331
7708ad64 332static bool xen_page_pinned(void *ptr)
e2426cf8
JF
333{
334 struct page *page = virt_to_page(ptr);
335
336 return PagePinned(page);
337}
338
7708ad64 339static void xen_extend_mmu_update(const struct mmu_update *update)
3b827c1b 340{
d66bf8fc
JF
341 struct multicall_space mcs;
342 struct mmu_update *u;
3b827c1b 343
400d3494
JF
344 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
345
994025ca
JF
346 if (mcs.mc != NULL) {
347 ADD_STATS(mmu_update_extended, 1);
348 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
349
400d3494 350 mcs.mc->args[1]++;
994025ca
JF
351
352 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
353 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
354 else
355 ADD_STATS(mmu_update_histo[0], 1);
356 } else {
357 ADD_STATS(mmu_update, 1);
400d3494
JF
358 mcs = __xen_mc_entry(sizeof(*u));
359 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
994025ca 360 ADD_STATS(mmu_update_histo[1], 1);
400d3494 361 }
d66bf8fc 362
d66bf8fc 363 u = mcs.args;
400d3494
JF
364 *u = *update;
365}
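/*
 * Illustrative, self-contained sketch (not part of this file): the idea
 * behind xen_extend_mmu_update() above.  Updates accumulate in a batch;
 * if the most recent batched "hypercall" is already an mmu_update, the
 * new request is appended to its argument array and its count bumped,
 * instead of opening a second call.  The batch layout and its sizes are
 * hypothetical stand-ins for the real multicall machinery.
 */
#include <stdio.h>

struct demo_update { unsigned long ptr, val; };

struct demo_call {
	struct demo_update args[8];
	int count;
};

static struct demo_call demo_batch[4];
static int demo_ncalls;

static void demo_queue_update(unsigned long ptr, unsigned long val)
{
	struct demo_call *c = demo_ncalls ? &demo_batch[demo_ncalls - 1] : NULL;

	if (c == NULL || c->count == 8) {
		/* start a new call, like __xen_mc_entry() */
		c = &demo_batch[demo_ncalls++];
		c->count = 0;
	}
	/* otherwise extend the previous call, like xen_mc_extend_args() */
	c->args[c->count].ptr = ptr;
	c->args[c->count].val = val;
	c->count++;
}

int main(void)
{
	demo_queue_update(0x1000, 1);
	demo_queue_update(0x2000, 2);
	demo_queue_update(0x3000, 3);
	printf("calls issued: %d (first carries %d updates)\n",
	       demo_ncalls, demo_batch[0].count);	/* 1 call, 3 updates */
	return 0;
}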
366
367void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
368{
369 struct mmu_update u;
370
371 preempt_disable();
372
373 xen_mc_batch();
374
ce803e70
JF
375 /* ptr may be ioremapped for 64-bit pagetable setup */
376 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
400d3494 377 u.val = pmd_val_ma(val);
7708ad64 378 xen_extend_mmu_update(&u);
d66bf8fc 379
994025ca
JF
380 ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
381
d66bf8fc
JF
382 xen_mc_issue(PARAVIRT_LAZY_MMU);
383
384 preempt_enable();
3b827c1b
JF
385}
386
e2426cf8
JF
387void xen_set_pmd(pmd_t *ptr, pmd_t val)
388{
994025ca
JF
389 ADD_STATS(pmd_update, 1);
390
e2426cf8
JF
391 /* If page is not pinned, we can just update the entry
392 directly */
7708ad64 393 if (!xen_page_pinned(ptr)) {
e2426cf8
JF
394 *ptr = val;
395 return;
396 }
397
994025ca
JF
398 ADD_STATS(pmd_update_pinned, 1);
399
e2426cf8
JF
400 xen_set_pmd_hyper(ptr, val);
401}
402
3b827c1b
JF
403/*
404 * Associate a virtual page frame with a given physical page frame
405 * and protection flags for that frame.
406 */
407void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
408{
836fe2f2 409 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
3b827c1b
JF
410}
411
412void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
413 pte_t *ptep, pte_t pteval)
414{
2bd50036
JF
415 /* updates to init_mm may be done without lock */
416 if (mm == &init_mm)
417 preempt_disable();
418
994025ca
JF
419 ADD_STATS(set_pte_at, 1);
420// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
421 ADD_STATS(set_pte_at_current, mm == current->mm);
422 ADD_STATS(set_pte_at_kernel, mm == &init_mm);
423
d66bf8fc 424 if (mm == current->mm || mm == &init_mm) {
8965c1c0 425 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
d66bf8fc
JF
426 struct multicall_space mcs;
427 mcs = xen_mc_entry(0);
428
429 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
994025ca 430 ADD_STATS(set_pte_at_batched, 1);
d66bf8fc 431 xen_mc_issue(PARAVIRT_LAZY_MMU);
2bd50036 432 goto out;
d66bf8fc
JF
433 } else
434 if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
2bd50036 435 goto out;
d66bf8fc
JF
436 }
437 xen_set_pte(ptep, pteval);
2bd50036
JF
438
439out:
440 if (mm == &init_mm)
441 preempt_enable();
3b827c1b
JF
442}
443
f63c2f24
T
444pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
445 unsigned long addr, pte_t *ptep)
947a69c9 446{
e57778a1
JF
447 /* Just return the pte as-is. We preserve the bits on commit */
448 return *ptep;
449}
450
451void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
452 pte_t *ptep, pte_t pte)
453{
400d3494 454 struct mmu_update u;
e57778a1 455
400d3494 456 xen_mc_batch();
947a69c9 457
9f32d21c 458 u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
400d3494 459 u.val = pte_val_ma(pte);
7708ad64 460 xen_extend_mmu_update(&u);
947a69c9 461
994025ca
JF
462 ADD_STATS(prot_commit, 1);
463 ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
464
e57778a1 465 xen_mc_issue(PARAVIRT_LAZY_MMU);
947a69c9
JF
466}
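/*
 * Illustrative, self-contained sketch (not part of this file): why the
 * commit above uses MMU_PT_UPDATE_PRESERVE_AD.  The live pte may pick
 * up Accessed/Dirty bits between start and commit; a preserve-A/D
 * commit ORs the live bits back in rather than overwriting them.  All
 * bit positions and helpers here are hypothetical.
 */
#include <stdio.h>

#define DEMO_RW		0x2UL
#define DEMO_ACCESSED	0x20UL
#define DEMO_DIRTY	0x40UL
#define DEMO_AD		(DEMO_ACCESSED | DEMO_DIRTY)

static unsigned long demo_pte = DEMO_RW;	/* live pte */

static unsigned long demo_prot_start(void)
{
	return demo_pte;			/* just read it, as above */
}

static void demo_prot_commit(unsigned long newval)
{
	/* preserve whatever A/D bits were set meanwhile */
	demo_pte = newval | (demo_pte & DEMO_AD);
}

int main(void)
{
	unsigned long v = demo_prot_start();

	demo_pte |= DEMO_DIRTY;			/* "hardware" writes the page */
	demo_prot_commit(v & ~DEMO_RW);		/* make it read-only */

	printf("dirty preserved: %s\n",
	       (demo_pte & DEMO_DIRTY) ? "yes" : "no");	/* yes */
	return 0;
}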
467
ebb9cfe2
JF
468/* Assume pteval_t is equivalent to all the other *val_t types. */
469static pteval_t pte_mfn_to_pfn(pteval_t val)
947a69c9 470{
ebb9cfe2 471 if (val & _PAGE_PRESENT) {
59438c9f 472 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
77be1fab 473 pteval_t flags = val & PTE_FLAGS_MASK;
d8355aca 474 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
ebb9cfe2 475 }
947a69c9 476
ebb9cfe2 477 return val;
947a69c9
JF
478}
479
ebb9cfe2 480static pteval_t pte_pfn_to_mfn(pteval_t val)
947a69c9 481{
ebb9cfe2 482 if (val & _PAGE_PRESENT) {
59438c9f 483 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
77be1fab 484 pteval_t flags = val & PTE_FLAGS_MASK;
d8355aca 485 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
947a69c9
JF
486 }
487
ebb9cfe2 488 return val;
947a69c9
JF
489}
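/*
 * Illustrative, self-contained sketch (not part of this file): the
 * mask-and-shift arithmetic of pte_mfn_to_pfn()/pte_pfn_to_mfn() above,
 * using a tiny hypothetical four-entry pfn->mfn table and 12-bit page
 * offsets instead of the real PTE_PFN_MASK/PTE_FLAGS_MASK machinery.
 */
#include <stdio.h>

#define DEMO_SHIFT	12
#define DEMO_PFN_MASK	(~0xfffUL)
#define DEMO_FLAGS_MASK	0xfffUL
#define DEMO_PRESENT	0x1UL

static const unsigned long demo_pfn_to_mfn[4] = { 7, 5, 3, 1 };

static unsigned long demo_make_pte(unsigned long val)
{
	if (val & DEMO_PRESENT) {
		unsigned long pfn = (val & DEMO_PFN_MASK) >> DEMO_SHIFT;
		unsigned long flags = val & DEMO_FLAGS_MASK;

		val = (demo_pfn_to_mfn[pfn] << DEMO_SHIFT) | flags;
	}
	return val;
}

int main(void)
{
	unsigned long pte = (2UL << DEMO_SHIFT) | DEMO_PRESENT;	/* pfn 2 */

	printf("pte with mfn: %#lx\n", demo_make_pte(pte));	/* 0x3001 */
	return 0;
}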
490
ebb9cfe2 491pteval_t xen_pte_val(pte_t pte)
947a69c9 492{
ebb9cfe2 493 return pte_mfn_to_pfn(pte.pte);
947a69c9 494}
947a69c9 495
947a69c9
JF
496pgdval_t xen_pgd_val(pgd_t pgd)
497{
ebb9cfe2 498 return pte_mfn_to_pfn(pgd.pgd);
947a69c9
JF
499}
500
501pte_t xen_make_pte(pteval_t pte)
502{
ebb9cfe2
JF
503 pte = pte_pfn_to_mfn(pte);
504 return native_make_pte(pte);
947a69c9
JF
505}
506
507pgd_t xen_make_pgd(pgdval_t pgd)
508{
ebb9cfe2
JF
509 pgd = pte_pfn_to_mfn(pgd);
510 return native_make_pgd(pgd);
947a69c9
JF
511}
512
513pmdval_t xen_pmd_val(pmd_t pmd)
514{
ebb9cfe2 515 return pte_mfn_to_pfn(pmd.pmd);
947a69c9 516}
28499143 517
e2426cf8 518void xen_set_pud_hyper(pud_t *ptr, pud_t val)
f4f97b3e 519{
400d3494 520 struct mmu_update u;
f4f97b3e 521
d66bf8fc
JF
522 preempt_disable();
523
400d3494
JF
524 xen_mc_batch();
525
ce803e70
JF
526 /* ptr may be ioremapped for 64-bit pagetable setup */
527 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
400d3494 528 u.val = pud_val_ma(val);
7708ad64 529 xen_extend_mmu_update(&u);
d66bf8fc 530
994025ca
JF
531 ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
532
d66bf8fc
JF
533 xen_mc_issue(PARAVIRT_LAZY_MMU);
534
535 preempt_enable();
f4f97b3e
JF
536}
537
e2426cf8
JF
538void xen_set_pud(pud_t *ptr, pud_t val)
539{
994025ca
JF
540 ADD_STATS(pud_update, 1);
541
e2426cf8
JF
542 /* If page is not pinned, we can just update the entry
543 directly */
7708ad64 544 if (!xen_page_pinned(ptr)) {
e2426cf8
JF
545 *ptr = val;
546 return;
547 }
548
994025ca
JF
549 ADD_STATS(pud_update_pinned, 1);
550
e2426cf8
JF
551 xen_set_pud_hyper(ptr, val);
552}
553
f4f97b3e
JF
554void xen_set_pte(pte_t *ptep, pte_t pte)
555{
994025ca
JF
556 ADD_STATS(pte_update, 1);
557// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
558 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
559
f6e58732 560#ifdef CONFIG_X86_PAE
f4f97b3e
JF
561 ptep->pte_high = pte.pte_high;
562 smp_wmb();
563 ptep->pte_low = pte.pte_low;
f6e58732
JF
564#else
565 *ptep = pte;
566#endif
f4f97b3e
JF
567}
568
f6e58732 569#ifdef CONFIG_X86_PAE
3b827c1b
JF
570void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
571{
f6e58732 572 set_64bit((u64 *)ptep, native_pte_val(pte));
3b827c1b
JF
573}
574
575void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
576{
577 ptep->pte_low = 0;
578 smp_wmb(); /* make sure low gets written first */
579 ptep->pte_high = 0;
580}
581
582void xen_pmd_clear(pmd_t *pmdp)
583{
e2426cf8 584 set_pmd(pmdp, __pmd(0));
3b827c1b 585}
f6e58732 586#endif /* CONFIG_X86_PAE */
3b827c1b 587
abf33038 588pmd_t xen_make_pmd(pmdval_t pmd)
3b827c1b 589{
ebb9cfe2 590 pmd = pte_pfn_to_mfn(pmd);
947a69c9 591 return native_make_pmd(pmd);
3b827c1b 592}
3b827c1b 593
f6e58732
JF
594#if PAGETABLE_LEVELS == 4
595pudval_t xen_pud_val(pud_t pud)
596{
597 return pte_mfn_to_pfn(pud.pud);
598}
599
600pud_t xen_make_pud(pudval_t pud)
601{
602 pud = pte_pfn_to_mfn(pud);
603
604 return native_make_pud(pud);
605}
606
d6182fbf 607pgd_t *xen_get_user_pgd(pgd_t *pgd)
f6e58732 608{
d6182fbf
JF
609 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
610 unsigned offset = pgd - pgd_page;
611 pgd_t *user_ptr = NULL;
f6e58732 612
d6182fbf
JF
613 if (offset < pgd_index(USER_LIMIT)) {
614 struct page *page = virt_to_page(pgd_page);
615 user_ptr = (pgd_t *)page->private;
616 if (user_ptr)
617 user_ptr += offset;
618 }
f6e58732 619
d6182fbf
JF
620 return user_ptr;
621}
622
623static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
624{
625 struct mmu_update u;
f6e58732
JF
626
627 u.ptr = virt_to_machine(ptr).maddr;
628 u.val = pgd_val_ma(val);
7708ad64 629 xen_extend_mmu_update(&u);
d6182fbf
JF
630}
631
632/*
633 * Raw hypercall-based set_pgd, intended for use in early boot before
634 * there's a page structure. This implies:
635 * 1. The only existing pagetable is the kernel's
636 * 2. It is always pinned
637 * 3. It has no user pagetable attached to it
638 */
639void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
640{
641 preempt_disable();
642
643 xen_mc_batch();
644
645 __xen_set_pgd_hyper(ptr, val);
f6e58732
JF
646
647 xen_mc_issue(PARAVIRT_LAZY_MMU);
648
649 preempt_enable();
650}
651
652void xen_set_pgd(pgd_t *ptr, pgd_t val)
653{
d6182fbf
JF
654 pgd_t *user_ptr = xen_get_user_pgd(ptr);
655
994025ca
JF
656 ADD_STATS(pgd_update, 1);
657
f6e58732
JF
658 /* If page is not pinned, we can just update the entry
659 directly */
7708ad64 660 if (!xen_page_pinned(ptr)) {
f6e58732 661 *ptr = val;
d6182fbf 662 if (user_ptr) {
7708ad64 663 WARN_ON(xen_page_pinned(user_ptr));
d6182fbf
JF
664 *user_ptr = val;
665 }
f6e58732
JF
666 return;
667 }
668
994025ca
JF
669 ADD_STATS(pgd_update_pinned, 1);
670 ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
671
d6182fbf
JF
672 /* If it's pinned, then we can at least batch the kernel and
673 user updates together. */
674 xen_mc_batch();
675
676 __xen_set_pgd_hyper(ptr, val);
677 if (user_ptr)
678 __xen_set_pgd_hyper(user_ptr, val);
679
680 xen_mc_issue(PARAVIRT_LAZY_MMU);
f6e58732
JF
681}
682#endif /* PAGETABLE_LEVELS == 4 */
683
f4f97b3e 684/*
5deb30d1
JF
685 * (Yet another) pagetable walker. This one is intended for pinning a
686 * pagetable. This means that it walks a pagetable and calls the
687 * callback function on each page it finds making up the page table,
688 * at every level. It walks the entire pagetable, but it only bothers
689 * pinning pte pages which are below limit. In the normal case this
690 * will be STACK_TOP_MAX, but at boot we need to pin up to
691 * FIXADDR_TOP.
692 *
693 * For 32-bit the important bit is that we don't pin beyond there,
694 * because then we start getting into Xen's ptes.
695 *
696 * For 64-bit, we must skip the Xen hole in the middle of the address
697 * space, just after the big x86-64 virtual hole.
698 */
86bbc2c2
IC
699static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
700 int (*func)(struct mm_struct *mm, struct page *,
701 enum pt_level),
702 unsigned long limit)
3b827c1b 703{
f4f97b3e 704 int flush = 0;
5deb30d1
JF
705 unsigned hole_low, hole_high;
706 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
707 unsigned pgdidx, pudidx, pmdidx;
f4f97b3e 708
5deb30d1
JF
709 /* The limit is the last byte to be touched */
710 limit--;
711 BUG_ON(limit >= FIXADDR_TOP);
3b827c1b
JF
712
713 if (xen_feature(XENFEAT_auto_translated_physmap))
f4f97b3e
JF
714 return 0;
715
5deb30d1
JF
716 /*
717 * 64-bit has a great big hole in the middle of the address
718 * space, which contains the Xen mappings. On 32-bit these
719 * will end up making a zero-sized hole, so this is a no-op.
720 */
d6182fbf 721 hole_low = pgd_index(USER_LIMIT);
5deb30d1
JF
722 hole_high = pgd_index(PAGE_OFFSET);
723
724 pgdidx_limit = pgd_index(limit);
725#if PTRS_PER_PUD > 1
726 pudidx_limit = pud_index(limit);
727#else
728 pudidx_limit = 0;
729#endif
730#if PTRS_PER_PMD > 1
731 pmdidx_limit = pmd_index(limit);
732#else
733 pmdidx_limit = 0;
734#endif
735
5deb30d1 736 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
f4f97b3e 737 pud_t *pud;
3b827c1b 738
5deb30d1
JF
739 if (pgdidx >= hole_low && pgdidx < hole_high)
740 continue;
f4f97b3e 741
5deb30d1 742 if (!pgd_val(pgd[pgdidx]))
3b827c1b 743 continue;
f4f97b3e 744
5deb30d1 745 pud = pud_offset(&pgd[pgdidx], 0);
3b827c1b
JF
746
747 if (PTRS_PER_PUD > 1) /* not folded */
eefb47f6 748 flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
f4f97b3e 749
5deb30d1 750 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
f4f97b3e 751 pmd_t *pmd;
f4f97b3e 752
5deb30d1
JF
753 if (pgdidx == pgdidx_limit &&
754 pudidx > pudidx_limit)
755 goto out;
3b827c1b 756
5deb30d1 757 if (pud_none(pud[pudidx]))
3b827c1b 758 continue;
f4f97b3e 759
5deb30d1 760 pmd = pmd_offset(&pud[pudidx], 0);
3b827c1b
JF
761
762 if (PTRS_PER_PMD > 1) /* not folded */
eefb47f6 763 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
f4f97b3e 764
5deb30d1
JF
765 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
766 struct page *pte;
767
768 if (pgdidx == pgdidx_limit &&
769 pudidx == pudidx_limit &&
770 pmdidx > pmdidx_limit)
771 goto out;
3b827c1b 772
5deb30d1 773 if (pmd_none(pmd[pmdidx]))
3b827c1b
JF
774 continue;
775
5deb30d1 776 pte = pmd_page(pmd[pmdidx]);
eefb47f6 777 flush |= (*func)(mm, pte, PT_PTE);
3b827c1b
JF
778 }
779 }
780 }
11ad93e5 781
5deb30d1 782out:
11ad93e5
JF
783 /* Do the top level last, so that the callbacks can use it as
784 a cue to do final things like tlb flushes. */
eefb47f6 785 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
f4f97b3e
JF
786
787 return flush;
3b827c1b
JF
788}
789
86bbc2c2
IC
790static int xen_pgd_walk(struct mm_struct *mm,
791 int (*func)(struct mm_struct *mm, struct page *,
792 enum pt_level),
793 unsigned long limit)
794{
795 return __xen_pgd_walk(mm, mm->pgd, func, limit);
796}
797
7708ad64
JF
798/* If we're using split pte locks, then take the page's lock and
799 return a pointer to it. Otherwise return NULL. */
eefb47f6 800static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
74260714
JF
801{
802 spinlock_t *ptl = NULL;
803
f7d0b926 804#if USE_SPLIT_PTLOCKS
74260714 805 ptl = __pte_lockptr(page);
eefb47f6 806 spin_lock_nest_lock(ptl, &mm->page_table_lock);
74260714
JF
807#endif
808
809 return ptl;
810}
811
7708ad64 812static void xen_pte_unlock(void *v)
74260714
JF
813{
814 spinlock_t *ptl = v;
815 spin_unlock(ptl);
816}
817
818static void xen_do_pin(unsigned level, unsigned long pfn)
819{
820 struct mmuext_op *op;
821 struct multicall_space mcs;
822
823 mcs = __xen_mc_entry(sizeof(*op));
824 op = mcs.args;
825 op->cmd = level;
826 op->arg1.mfn = pfn_to_mfn(pfn);
827 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
828}
829
eefb47f6
JF
830static int xen_pin_page(struct mm_struct *mm, struct page *page,
831 enum pt_level level)
f4f97b3e 832{
d60cd46b 833 unsigned pgfl = TestSetPagePinned(page);
f4f97b3e
JF
834 int flush;
835
836 if (pgfl)
837 flush = 0; /* already pinned */
838 else if (PageHighMem(page))
839 /* kmaps need flushing if we found an unpinned
840 highpage */
841 flush = 1;
842 else {
843 void *pt = lowmem_page_address(page);
844 unsigned long pfn = page_to_pfn(page);
845 struct multicall_space mcs = __xen_mc_entry(0);
74260714 846 spinlock_t *ptl;
f4f97b3e
JF
847
848 flush = 0;
849
11ad93e5
JF
850 /*
851 * We need to hold the pagetable lock between the time
852 * we make the pagetable RO and when we actually pin
853 * it. If we don't, then other users may come in and
854 * attempt to update the pagetable by writing it,
855 * which will fail because the memory is RO but not
856 * pinned, so Xen won't do the trap'n'emulate.
857 *
858 * If we're using split pte locks, we can't hold the
859 * entire pagetable's worth of locks during the
860 * traverse, because we may wrap the preempt count (8
861 * bits). The solution is to mark RO and pin each PTE
862 * page while holding the lock. This means the number
863 * of locks we end up holding is never more than a
864 * batch size (~32 entries, at present).
865 *
866 * If we're not using split pte locks, we needn't pin
867 * the PTE pages independently, because we're
868 * protected by the overall pagetable lock.
869 */
74260714
JF
870 ptl = NULL;
871 if (level == PT_PTE)
eefb47f6 872 ptl = xen_pte_lock(page, mm);
74260714 873
f4f97b3e
JF
874 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
875 pfn_pte(pfn, PAGE_KERNEL_RO),
74260714
JF
876 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
877
11ad93e5 878 if (ptl) {
74260714
JF
879 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
880
74260714
JF
881 /* Queue a deferred unlock for when this batch
882 is completed. */
7708ad64 883 xen_mc_callback(xen_pte_unlock, ptl);
74260714 884 }
f4f97b3e
JF
885 }
886
887 return flush;
888}
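/*
 * Illustrative, self-contained sketch (not part of this file): the
 * ordering enforced by xen_pin_page() above for split pte locks -- take
 * the page's lock, mark it read-only, queue the pin, and only release
 * the lock from a callback once the whole batch has been issued.  The
 * page, lock and batch types are hypothetical single-threaded stand-ins.
 */
#include <stdio.h>
#include <stdbool.h>

struct demo_page { bool locked, ro, pinned; };

static void (*demo_cb)(struct demo_page *);
static struct demo_page *demo_cb_arg;

static void demo_unlock_cb(struct demo_page *pg)
{
	pg->locked = false;
}

static void demo_pin_page(struct demo_page *pg)
{
	pg->locked = true;		/* xen_pte_lock() */
	pg->ro = true;			/* MULTI_update_va_mapping(..RO..) */
	pg->pinned = true;		/* xen_do_pin() */
	demo_cb = demo_unlock_cb;	/* xen_mc_callback(xen_pte_unlock) */
	demo_cb_arg = pg;
}

static void demo_issue_batch(void)
{
	if (demo_cb)
		demo_cb(demo_cb_arg);	/* unlock only after the batch */
	demo_cb = NULL;
}

int main(void)
{
	struct demo_page pg = { false, false, false };

	demo_pin_page(&pg);
	printf("still locked before issue: %d\n", pg.locked);	/* 1 */
	demo_issue_batch();
	printf("unlocked after issue:      %d\n", !pg.locked);	/* 1 */
	return 0;
}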
3b827c1b 889
f4f97b3e
JF
890/* This is called just after a mm has been created, but it has not
891 been used yet. We need to make sure that its pagetable is all
892 read-only, and can be pinned. */
eefb47f6 893static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
3b827c1b 894{
d05fdf31
JF
895 vm_unmap_aliases();
896
f4f97b3e 897 xen_mc_batch();
3b827c1b 898
86bbc2c2 899 if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
d05fdf31 900 /* re-enable interrupts for flushing */
f87e4cac 901 xen_mc_issue(0);
d05fdf31 902
f4f97b3e 903 kmap_flush_unused();
d05fdf31 904
f87e4cac
JF
905 xen_mc_batch();
906 }
f4f97b3e 907
d6182fbf
JF
908#ifdef CONFIG_X86_64
909 {
910 pgd_t *user_pgd = xen_get_user_pgd(pgd);
911
912 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
913
914 if (user_pgd) {
eefb47f6 915 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
f63c2f24
T
916 xen_do_pin(MMUEXT_PIN_L4_TABLE,
917 PFN_DOWN(__pa(user_pgd)));
d6182fbf
JF
918 }
919 }
920#else /* CONFIG_X86_32 */
5deb30d1
JF
921#ifdef CONFIG_X86_PAE
922 /* Need to make sure unshared kernel PMD is pinnable */
47cb2ed9 923 xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
eefb47f6 924 PT_PMD);
5deb30d1 925#endif
28499143 926 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
d6182fbf 927#endif /* CONFIG_X86_64 */
f4f97b3e 928 xen_mc_issue(0);
3b827c1b
JF
929}
930
eefb47f6
JF
931static void xen_pgd_pin(struct mm_struct *mm)
932{
933 __xen_pgd_pin(mm, mm->pgd);
934}
935
0e91398f
JF
936/*
937 * On save, we need to pin all pagetables to make sure they get their
938 * mfns turned into pfns. Search the list for any unpinned pgds and pin
939 * them (unpinned pgds are not currently in use, probably because the
940 * process is under construction or destruction).
eefb47f6
JF
941 *
942 * Expected to be called in stop_machine() ("equivalent to taking
943 * every spinlock in the system"), so the locking doesn't really
944 * matter all that much.
0e91398f
JF
945 */
946void xen_mm_pin_all(void)
947{
948 unsigned long flags;
949 struct page *page;
74260714 950
0e91398f 951 spin_lock_irqsave(&pgd_lock, flags);
f4f97b3e 952
0e91398f
JF
953 list_for_each_entry(page, &pgd_list, lru) {
954 if (!PagePinned(page)) {
eefb47f6 955 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
0e91398f
JF
956 SetPageSavePinned(page);
957 }
958 }
959
960 spin_unlock_irqrestore(&pgd_lock, flags);
3b827c1b
JF
961}
962
c1f2f09e
EH
963/*
964 * The init_mm pagetable is really pinned as soon as it's created, but
965 * that's before we have page structures to store the bits. So do all
966 * the book-keeping now.
967 */
eefb47f6
JF
968static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
969 enum pt_level level)
3b827c1b 970{
f4f97b3e
JF
971 SetPagePinned(page);
972 return 0;
973}
3b827c1b 974
f4f97b3e
JF
975void __init xen_mark_init_mm_pinned(void)
976{
eefb47f6 977 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
f4f97b3e 978}
3b827c1b 979
eefb47f6
JF
980static int xen_unpin_page(struct mm_struct *mm, struct page *page,
981 enum pt_level level)
f4f97b3e 982{
d60cd46b 983 unsigned pgfl = TestClearPagePinned(page);
3b827c1b 984
f4f97b3e
JF
985 if (pgfl && !PageHighMem(page)) {
986 void *pt = lowmem_page_address(page);
987 unsigned long pfn = page_to_pfn(page);
74260714
JF
988 spinlock_t *ptl = NULL;
989 struct multicall_space mcs;
990
11ad93e5
JF
991 /*
992 * Do the converse to pin_page. If we're using split
993 * pte locks, we must be holding the lock while
994 * the pte page is unpinned but still RO to prevent
995 * concurrent updates from seeing it in this
996 * partially-pinned state.
997 */
74260714 998 if (level == PT_PTE) {
eefb47f6 999 ptl = xen_pte_lock(page, mm);
74260714 1000
11ad93e5
JF
1001 if (ptl)
1002 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
74260714
JF
1003 }
1004
1005 mcs = __xen_mc_entry(0);
f4f97b3e
JF
1006
1007 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
1008 pfn_pte(pfn, PAGE_KERNEL),
74260714
JF
1009 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
1010
1011 if (ptl) {
1012 /* unlock when batch completed */
7708ad64 1013 xen_mc_callback(xen_pte_unlock, ptl);
74260714 1014 }
f4f97b3e
JF
1015 }
1016
1017 return 0; /* never need to flush on unpin */
3b827c1b
JF
1018}
1019
f4f97b3e 1020/* Release a pagetable's pages back as normal RW */
eefb47f6 1021static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
f4f97b3e 1022{
f4f97b3e
JF
1023 xen_mc_batch();
1024
74260714 1025 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
f4f97b3e 1026
d6182fbf
JF
1027#ifdef CONFIG_X86_64
1028 {
1029 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1030
1031 if (user_pgd) {
f63c2f24
T
1032 xen_do_pin(MMUEXT_UNPIN_TABLE,
1033 PFN_DOWN(__pa(user_pgd)));
eefb47f6 1034 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
d6182fbf
JF
1035 }
1036 }
1037#endif
1038
5deb30d1
JF
1039#ifdef CONFIG_X86_PAE
1040 /* Need to make sure unshared kernel PMD is unpinned */
47cb2ed9 1041 xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
eefb47f6 1042 PT_PMD);
5deb30d1 1043#endif
d6182fbf 1044
86bbc2c2 1045 __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
f4f97b3e
JF
1046
1047 xen_mc_issue(0);
1048}
3b827c1b 1049
eefb47f6
JF
1050static void xen_pgd_unpin(struct mm_struct *mm)
1051{
1052 __xen_pgd_unpin(mm, mm->pgd);
1053}
1054
0e91398f
JF
1055/*
1056 * On resume, undo any pinning done at save, so that the rest of the
1057 * kernel doesn't see any unexpected pinned pagetables.
1058 */
1059void xen_mm_unpin_all(void)
1060{
1061 unsigned long flags;
1062 struct page *page;
1063
1064 spin_lock_irqsave(&pgd_lock, flags);
1065
1066 list_for_each_entry(page, &pgd_list, lru) {
1067 if (PageSavePinned(page)) {
1068 BUG_ON(!PagePinned(page));
eefb47f6 1069 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
0e91398f
JF
1070 ClearPageSavePinned(page);
1071 }
1072 }
1073
1074 spin_unlock_irqrestore(&pgd_lock, flags);
1075}
1076
3b827c1b
JF
1077void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1078{
f4f97b3e 1079 spin_lock(&next->page_table_lock);
eefb47f6 1080 xen_pgd_pin(next);
f4f97b3e 1081 spin_unlock(&next->page_table_lock);
3b827c1b
JF
1082}
1083
1084void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1085{
f4f97b3e 1086 spin_lock(&mm->page_table_lock);
eefb47f6 1087 xen_pgd_pin(mm);
f4f97b3e 1088 spin_unlock(&mm->page_table_lock);
3b827c1b
JF
1089}
1090
3b827c1b 1091
f87e4cac
JF
1092#ifdef CONFIG_SMP
1093/* Another cpu may still have its %cr3 pointing at the pagetable, so
1094 we need to repoint it somewhere else before we can unpin it. */
1095static void drop_other_mm_ref(void *info)
1096{
1097 struct mm_struct *mm = info;
ce87b3d3 1098 struct mm_struct *active_mm;
3b827c1b 1099
9eb912d1 1100 active_mm = percpu_read(cpu_tlbstate.active_mm);
ce87b3d3
JF
1101
1102 if (active_mm == mm)
f87e4cac 1103 leave_mm(smp_processor_id());
9f79991d
JF
1104
1105 /* If this cpu still has a stale cr3 reference, then make sure
1106 it has been flushed. */
6dbde353 1107 if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
9f79991d
JF
1108 load_cr3(swapper_pg_dir);
1109 arch_flush_lazy_cpu_mode();
1110 }
f87e4cac 1111}
3b827c1b 1112
7708ad64 1113static void xen_drop_mm_ref(struct mm_struct *mm)
f87e4cac 1114{
e4d98207 1115 cpumask_var_t mask;
9f79991d
JF
1116 unsigned cpu;
1117
f87e4cac
JF
1118 if (current->active_mm == mm) {
1119 if (current->mm == mm)
1120 load_cr3(swapper_pg_dir);
1121 else
1122 leave_mm(smp_processor_id());
9f79991d
JF
1123 arch_flush_lazy_cpu_mode();
1124 }
1125
1126 /* Get the "official" set of cpus referring to our pagetable. */
e4d98207
MT
1127 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1128 for_each_online_cpu(cpu) {
1129 if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask)
1130 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1131 continue;
1132 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1133 }
1134 return;
1135 }
1136 cpumask_copy(mask, &mm->cpu_vm_mask);
9f79991d
JF
1137
1138 /* It's possible that a vcpu may have a stale reference to our
1139 cr3, because it's in lazy mode and hasn't yet flushed
1140 its set of pending hypercalls. In this case, we can
1141 look at its actual current cr3 value, and force it to flush
1142 if needed. */
1143 for_each_online_cpu(cpu) {
1144 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
e4d98207 1145 cpumask_set_cpu(cpu, mask);
3b827c1b
JF
1146 }
1147
e4d98207
MT
1148 if (!cpumask_empty(mask))
1149 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1150 free_cpumask_var(mask);
f87e4cac
JF
1151}
1152#else
7708ad64 1153static void xen_drop_mm_ref(struct mm_struct *mm)
f87e4cac
JF
1154{
1155 if (current->active_mm == mm)
1156 load_cr3(swapper_pg_dir);
1157}
1158#endif
1159
1160/*
1161 * While a process runs, Xen pins its pagetables, which means that the
1162 * hypervisor forces it to be read-only, and it controls all updates
1163 * to it. This means that all pagetable updates have to go via the
1164 * hypervisor, which is moderately expensive.
1165 *
1166 * Since we're pulling the pagetable down, we switch to init_mm,
1167 * unpin the old process pagetable and mark it all read-write, which
1168 * allows further operations on it to be simple memory accesses.
1169 *
1170 * The only subtle point is that another CPU may be still using the
1171 * pagetable because of lazy tlb flushing. This means we need to
1172 * switch all CPUs off this pagetable before we can unpin it.
1173 */
1174void xen_exit_mmap(struct mm_struct *mm)
1175{
1176 get_cpu(); /* make sure we don't move around */
7708ad64 1177 xen_drop_mm_ref(mm);
f87e4cac 1178 put_cpu();
3b827c1b 1179
f120f13e 1180 spin_lock(&mm->page_table_lock);
df912ea4
JF
1181
1182 /* pgd may not be pinned in the error exit path of execve */
7708ad64 1183 if (xen_page_pinned(mm->pgd))
eefb47f6 1184 xen_pgd_unpin(mm);
74260714 1185
f120f13e 1186 spin_unlock(&mm->page_table_lock);
3b827c1b 1187}
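/*
 * Illustrative, self-contained sketch (not part of this file): the scan
 * performed by xen_drop_mm_ref() above -- find every cpu whose current
 * cr3 still points at the dying pagetable and repoint it to the kernel
 * pagetable before the unpin.  The cpu count and cr3 values are
 * hypothetical; the real code sends IPIs instead of poking arrays.
 */
#include <stdio.h>

#define DEMO_NCPUS	4
#define DEMO_SWAPPER	0x1000UL

static unsigned long demo_cr3[DEMO_NCPUS] = {
	0x5000, DEMO_SWAPPER, 0x5000, 0x7000
};

static void demo_drop_refs(unsigned long dying)
{
	int cpu;

	for (cpu = 0; cpu < DEMO_NCPUS; cpu++)
		if (demo_cr3[cpu] == dying)
			demo_cr3[cpu] = DEMO_SWAPPER;	/* leave_mm()/load_cr3() */
}

int main(void)
{
	int cpu;

	demo_drop_refs(0x5000);
	for (cpu = 0; cpu < DEMO_NCPUS; cpu++)
		printf("cpu%d cr3=%#lx\n", cpu, demo_cr3[cpu]);
	return 0;
}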
994025ca 1188
319f3ba5
JF
1189static __init void xen_pagetable_setup_start(pgd_t *base)
1190{
1191}
1192
1193static __init void xen_pagetable_setup_done(pgd_t *base)
1194{
1195 xen_setup_shared_info();
1196}
1197
1198static void xen_write_cr2(unsigned long cr2)
1199{
1200 percpu_read(xen_vcpu)->arch.cr2 = cr2;
1201}
1202
1203static unsigned long xen_read_cr2(void)
1204{
1205 return percpu_read(xen_vcpu)->arch.cr2;
1206}
1207
1208unsigned long xen_read_cr2_direct(void)
1209{
1210 return percpu_read(xen_vcpu_info.arch.cr2);
1211}
1212
1213static void xen_flush_tlb(void)
1214{
1215 struct mmuext_op *op;
1216 struct multicall_space mcs;
1217
1218 preempt_disable();
1219
1220 mcs = xen_mc_entry(sizeof(*op));
1221
1222 op = mcs.args;
1223 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1224 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1225
1226 xen_mc_issue(PARAVIRT_LAZY_MMU);
1227
1228 preempt_enable();
1229}
1230
1231static void xen_flush_tlb_single(unsigned long addr)
1232{
1233 struct mmuext_op *op;
1234 struct multicall_space mcs;
1235
1236 preempt_disable();
1237
1238 mcs = xen_mc_entry(sizeof(*op));
1239 op = mcs.args;
1240 op->cmd = MMUEXT_INVLPG_LOCAL;
1241 op->arg1.linear_addr = addr & PAGE_MASK;
1242 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1243
1244 xen_mc_issue(PARAVIRT_LAZY_MMU);
1245
1246 preempt_enable();
1247}
1248
1249static void xen_flush_tlb_others(const struct cpumask *cpus,
1250 struct mm_struct *mm, unsigned long va)
1251{
1252 struct {
1253 struct mmuext_op op;
1254 DECLARE_BITMAP(mask, NR_CPUS);
1255 } *args;
1256 struct multicall_space mcs;
1257
1258 BUG_ON(cpumask_empty(cpus));
1259 BUG_ON(!mm);
1260
1261 mcs = xen_mc_entry(sizeof(*args));
1262 args = mcs.args;
1263 args->op.arg2.vcpumask = to_cpumask(args->mask);
1264
1265 /* Remove us, and any offline CPUS. */
1266 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1267 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1268 if (unlikely(cpumask_empty(to_cpumask(args->mask))))
1269 goto issue;
1270
1271 if (va == TLB_FLUSH_ALL) {
1272 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1273 } else {
1274 args->op.cmd = MMUEXT_INVLPG_MULTI;
1275 args->op.arg1.linear_addr = va;
1276 }
1277
1278 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1279
1280issue:
1281 xen_mc_issue(PARAVIRT_LAZY_MMU);
1282}
1283
1284static unsigned long xen_read_cr3(void)
1285{
1286 return percpu_read(xen_cr3);
1287}
1288
1289static void set_current_cr3(void *v)
1290{
1291 percpu_write(xen_current_cr3, (unsigned long)v);
1292}
1293
1294static void __xen_write_cr3(bool kernel, unsigned long cr3)
1295{
1296 struct mmuext_op *op;
1297 struct multicall_space mcs;
1298 unsigned long mfn;
1299
1300 if (cr3)
1301 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1302 else
1303 mfn = 0;
1304
1305 WARN_ON(mfn == 0 && kernel);
1306
1307 mcs = __xen_mc_entry(sizeof(*op));
1308
1309 op = mcs.args;
1310 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1311 op->arg1.mfn = mfn;
1312
1313 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1314
1315 if (kernel) {
1316 percpu_write(xen_cr3, cr3);
1317
1318 /* Update xen_current_cr3 once the batch has actually
1319 been submitted. */
1320 xen_mc_callback(set_current_cr3, (void *)cr3);
1321 }
1322}
1323
1324static void xen_write_cr3(unsigned long cr3)
1325{
1326 BUG_ON(preemptible());
1327
1328 xen_mc_batch(); /* disables interrupts */
1329
1330 /* Update while interrupts are disabled, so its atomic with
1331 respect to ipis */
1332 percpu_write(xen_cr3, cr3);
1333
1334 __xen_write_cr3(true, cr3);
1335
1336#ifdef CONFIG_X86_64
1337 {
1338 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1339 if (user_pgd)
1340 __xen_write_cr3(false, __pa(user_pgd));
1341 else
1342 __xen_write_cr3(false, 0);
1343 }
1344#endif
1345
1346 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1347}
1348
1349static int xen_pgd_alloc(struct mm_struct *mm)
1350{
1351 pgd_t *pgd = mm->pgd;
1352 int ret = 0;
1353
1354 BUG_ON(PagePinned(virt_to_page(pgd)));
1355
1356#ifdef CONFIG_X86_64
1357 {
1358 struct page *page = virt_to_page(pgd);
1359 pgd_t *user_pgd;
1360
1361 BUG_ON(page->private != 0);
1362
1363 ret = -ENOMEM;
1364
1365 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1366 page->private = (unsigned long)user_pgd;
1367
1368 if (user_pgd != NULL) {
1369 user_pgd[pgd_index(VSYSCALL_START)] =
1370 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1371 ret = 0;
1372 }
1373
1374 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1375 }
1376#endif
1377
1378 return ret;
1379}
1380
1381static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1382{
1383#ifdef CONFIG_X86_64
1384 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1385
1386 if (user_pgd)
1387 free_page((unsigned long)user_pgd);
1388#endif
1389}
1390
1391
1392/* Early in boot, while setting up the initial pagetable, assume
1393 everything is pinned. */
1394static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1395{
1396#ifdef CONFIG_FLATMEM
1397 BUG_ON(mem_map); /* should only be used early */
1398#endif
1399 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1400}
1401
1402/* Early release_pte assumes that all pts are pinned, since there's
1403 only init_mm and anything attached to that is pinned. */
1404static void xen_release_pte_init(unsigned long pfn)
1405{
1406 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1407}
1408
1409static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1410{
1411 struct mmuext_op op;
1412 op.cmd = cmd;
1413 op.arg1.mfn = pfn_to_mfn(pfn);
1414 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1415 BUG();
1416}
1417
1418/* This needs to make sure the new pte page is pinned iff it's being
1419 attached to a pinned pagetable. */
1420static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1421{
1422 struct page *page = pfn_to_page(pfn);
1423
1424 if (PagePinned(virt_to_page(mm->pgd))) {
1425 SetPagePinned(page);
1426
1427 vm_unmap_aliases();
1428 if (!PageHighMem(page)) {
1429 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1430 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1431 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1432 } else {
1433 /* make sure there are no stray mappings of
1434 this page */
1435 kmap_flush_unused();
1436 }
1437 }
1438}
1439
1440static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1441{
1442 xen_alloc_ptpage(mm, pfn, PT_PTE);
1443}
1444
1445static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1446{
1447 xen_alloc_ptpage(mm, pfn, PT_PMD);
1448}
1449
1450/* This should never happen until we're OK to use struct page */
1451static void xen_release_ptpage(unsigned long pfn, unsigned level)
1452{
1453 struct page *page = pfn_to_page(pfn);
1454
1455 if (PagePinned(page)) {
1456 if (!PageHighMem(page)) {
1457 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1458 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1459 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1460 }
1461 ClearPagePinned(page);
1462 }
1463}
1464
1465static void xen_release_pte(unsigned long pfn)
1466{
1467 xen_release_ptpage(pfn, PT_PTE);
1468}
1469
1470static void xen_release_pmd(unsigned long pfn)
1471{
1472 xen_release_ptpage(pfn, PT_PMD);
1473}
1474
1475#if PAGETABLE_LEVELS == 4
1476static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1477{
1478 xen_alloc_ptpage(mm, pfn, PT_PUD);
1479}
1480
1481static void xen_release_pud(unsigned long pfn)
1482{
1483 xen_release_ptpage(pfn, PT_PUD);
1484}
1485#endif
1486
1487void __init xen_reserve_top(void)
1488{
1489#ifdef CONFIG_X86_32
1490 unsigned long top = HYPERVISOR_VIRT_START;
1491 struct xen_platform_parameters pp;
1492
1493 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1494 top = pp.virt_start;
1495
1496 reserve_top_address(-top);
1497#endif /* CONFIG_X86_32 */
1498}
1499
1500/*
1501 * Like __va(), but returns address in the kernel mapping (which is
1502 * all we have until the physical memory mapping has been set up).
1503 */
1504static void *__ka(phys_addr_t paddr)
1505{
1506#ifdef CONFIG_X86_64
1507 return (void *)(paddr + __START_KERNEL_map);
1508#else
1509 return __va(paddr);
1510#endif
1511}
1512
1513/* Convert a machine address to physical address */
1514static unsigned long m2p(phys_addr_t maddr)
1515{
1516 phys_addr_t paddr;
1517
1518 maddr &= PTE_PFN_MASK;
1519 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1520
1521 return paddr;
1522}
1523
1524/* Convert a machine address to kernel virtual */
1525static void *m2v(phys_addr_t maddr)
1526{
1527 return __ka(m2p(maddr));
1528}
1529
1530static void set_page_prot(void *addr, pgprot_t prot)
1531{
1532 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1533 pte_t pte = pfn_pte(pfn, prot);
1534
1535 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1536 BUG();
1537}
1538
1539static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1540{
1541 unsigned pmdidx, pteidx;
1542 unsigned ident_pte;
1543 unsigned long pfn;
1544
1545 ident_pte = 0;
1546 pfn = 0;
1547 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1548 pte_t *pte_page;
1549
1550 /* Reuse or allocate a page of ptes */
1551 if (pmd_present(pmd[pmdidx]))
1552 pte_page = m2v(pmd[pmdidx].pmd);
1553 else {
1554 /* Check for free pte pages */
1555 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1556 break;
1557
1558 pte_page = &level1_ident_pgt[ident_pte];
1559 ident_pte += PTRS_PER_PTE;
1560
1561 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1562 }
1563
1564 /* Install mappings */
1565 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1566 pte_t pte;
1567
1568 if (pfn > max_pfn_mapped)
1569 max_pfn_mapped = pfn;
1570
1571 if (!pte_none(pte_page[pteidx]))
1572 continue;
1573
1574 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1575 pte_page[pteidx] = pte;
1576 }
1577 }
1578
1579 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1580 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1581
1582 set_page_prot(pmd, PAGE_KERNEL_RO);
1583}
1584
1585#ifdef CONFIG_X86_64
1586static void convert_pfn_mfn(void *v)
1587{
1588 pte_t *pte = v;
1589 int i;
1590
1591 /* All levels are converted the same way, so just treat them
1592 as ptes. */
1593 for (i = 0; i < PTRS_PER_PTE; i++)
1594 pte[i] = xen_make_pte(pte[i].pte);
1595}
1596
1597/*
1598 * Set up the initial kernel pagetable.
1599 *
1600 * We can construct this by grafting the Xen provided pagetable into
1601 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1602 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1603 * means that only the kernel has a physical mapping to start with -
1604 * but that's enough to get __va working. We need to fill in the rest
1605 * of the physical mapping once some sort of allocator has been set
1606 * up.
1607 */
1608__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1609 unsigned long max_pfn)
1610{
1611 pud_t *l3;
1612 pmd_t *l2;
1613
1614 /* Zap identity mapping */
1615 init_level4_pgt[0] = __pgd(0);
1616
1617 /* Pre-constructed entries are in pfn, so convert to mfn */
1618 convert_pfn_mfn(init_level4_pgt);
1619 convert_pfn_mfn(level3_ident_pgt);
1620 convert_pfn_mfn(level3_kernel_pgt);
1621
1622 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1623 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1624
1625 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1626 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1627
1628 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1629 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1630 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1631
1632 /* Set up identity map */
1633 xen_map_identity_early(level2_ident_pgt, max_pfn);
1634
1635 /* Make pagetable pieces RO */
1636 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1637 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1638 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1639 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1640 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1641 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1642
1643 /* Pin down new L4 */
1644 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1645 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1646
1647 /* Unpin Xen-provided one */
1648 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1649
1650 /* Switch over */
1651 pgd = init_level4_pgt;
1652
1653 /*
1654 * At this stage there can be no user pgd, and no page
1655 * structure to attach it to, so make sure we just set kernel
1656 * pgd.
1657 */
1658 xen_mc_batch();
1659 __xen_write_cr3(true, __pa(pgd));
1660 xen_mc_issue(PARAVIRT_LAZY_CPU);
1661
1662 reserve_early(__pa(xen_start_info->pt_base),
1663 __pa(xen_start_info->pt_base +
1664 xen_start_info->nr_pt_frames * PAGE_SIZE),
1665 "XEN PAGETABLES");
1666
1667 return pgd;
1668}
1669#else /* !CONFIG_X86_64 */
1670static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1671
1672__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1673 unsigned long max_pfn)
1674{
1675 pmd_t *kernel_pmd;
1676
1677 init_pg_tables_start = __pa(pgd);
1678 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1679 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1680
1681 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1682 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1683
1684 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1685
1686 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1687 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1688 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1689
1690 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1691 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1692 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1693
1694 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1695
1696 xen_write_cr3(__pa(swapper_pg_dir));
1697
1698 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1699
1700 return swapper_pg_dir;
1701}
1702#endif /* CONFIG_X86_64 */
1703
1704static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1705{
1706 pte_t pte;
1707
1708 phys >>= PAGE_SHIFT;
1709
1710 switch (idx) {
1711 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1712#ifdef CONFIG_X86_F00F_BUG
1713 case FIX_F00F_IDT:
1714#endif
1715#ifdef CONFIG_X86_32
1716 case FIX_WP_TEST:
1717 case FIX_VDSO:
1718# ifdef CONFIG_HIGHMEM
1719 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1720# endif
1721#else
1722 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1723#endif
1724#ifdef CONFIG_X86_LOCAL_APIC
1725 case FIX_APIC_BASE: /* maps dummy local APIC */
1726#endif
1727 pte = pfn_pte(phys, prot);
1728 break;
1729
1730 default:
1731 pte = mfn_pte(phys, prot);
1732 break;
1733 }
1734
1735 __native_set_fixmap(idx, pte);
1736
1737#ifdef CONFIG_X86_64
1738 /* Replicate changes to map the vsyscall page into the user
1739 pagetable vsyscall mapping. */
1740 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1741 unsigned long vaddr = __fix_to_virt(idx);
1742 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1743 }
1744#endif
1745}
1746
1747__init void xen_post_allocator_init(void)
1748{
1749 pv_mmu_ops.set_pte = xen_set_pte;
1750 pv_mmu_ops.set_pmd = xen_set_pmd;
1751 pv_mmu_ops.set_pud = xen_set_pud;
1752#if PAGETABLE_LEVELS == 4
1753 pv_mmu_ops.set_pgd = xen_set_pgd;
1754#endif
1755
1756 /* This will work as long as patching hasn't happened yet
1757 (which it hasn't) */
1758 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1759 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1760 pv_mmu_ops.release_pte = xen_release_pte;
1761 pv_mmu_ops.release_pmd = xen_release_pmd;
1762#if PAGETABLE_LEVELS == 4
1763 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1764 pv_mmu_ops.release_pud = xen_release_pud;
1765#endif
1766
1767#ifdef CONFIG_X86_64
1768 SetPagePinned(virt_to_page(level3_user_vsyscall));
1769#endif
1770 xen_mark_init_mm_pinned();
1771}
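/*
 * Illustrative, self-contained sketch (not part of this file): the
 * "swap in the real implementation once the allocator is up" pattern
 * used by xen_post_allocator_init() above, reduced to a hypothetical
 * two-function ops struct in plain C.
 */
#include <stdio.h>

struct demo_ops {
	void (*alloc_pte)(unsigned long pfn);
};

static void demo_alloc_pte_init(unsigned long pfn)
{
	printf("early alloc_pte for pfn %lu\n", pfn);
}

static void demo_alloc_pte(unsigned long pfn)
{
	printf("normal alloc_pte for pfn %lu\n", pfn);
}

static struct demo_ops demo_mmu_ops = {
	.alloc_pte = demo_alloc_pte_init,	/* boot-time version */
};

static void demo_post_allocator_init(void)
{
	demo_mmu_ops.alloc_pte = demo_alloc_pte;	/* switch over */
}

int main(void)
{
	demo_mmu_ops.alloc_pte(1);
	demo_post_allocator_init();
	demo_mmu_ops.alloc_pte(2);
	return 0;
}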
1772
1773
1774const struct pv_mmu_ops xen_mmu_ops __initdata = {
1775 .pagetable_setup_start = xen_pagetable_setup_start,
1776 .pagetable_setup_done = xen_pagetable_setup_done,
1777
1778 .read_cr2 = xen_read_cr2,
1779 .write_cr2 = xen_write_cr2,
1780
1781 .read_cr3 = xen_read_cr3,
1782 .write_cr3 = xen_write_cr3,
1783
1784 .flush_tlb_user = xen_flush_tlb,
1785 .flush_tlb_kernel = xen_flush_tlb,
1786 .flush_tlb_single = xen_flush_tlb_single,
1787 .flush_tlb_others = xen_flush_tlb_others,
1788
1789 .pte_update = paravirt_nop,
1790 .pte_update_defer = paravirt_nop,
1791
1792 .pgd_alloc = xen_pgd_alloc,
1793 .pgd_free = xen_pgd_free,
1794
1795 .alloc_pte = xen_alloc_pte_init,
1796 .release_pte = xen_release_pte_init,
1797 .alloc_pmd = xen_alloc_pte_init,
1798 .alloc_pmd_clone = paravirt_nop,
1799 .release_pmd = xen_release_pte_init,
1800
1801#ifdef CONFIG_HIGHPTE
1802 .kmap_atomic_pte = xen_kmap_atomic_pte,
1803#endif
1804
1805#ifdef CONFIG_X86_64
1806 .set_pte = xen_set_pte,
1807#else
1808 .set_pte = xen_set_pte_init,
1809#endif
1810 .set_pte_at = xen_set_pte_at,
1811 .set_pmd = xen_set_pmd_hyper,
1812
1813 .ptep_modify_prot_start = __ptep_modify_prot_start,
1814 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1815
1816 .pte_val = xen_pte_val,
1817 .pgd_val = xen_pgd_val,
1818
1819 .make_pte = xen_make_pte,
1820 .make_pgd = xen_make_pgd,
1821
1822#ifdef CONFIG_X86_PAE
1823 .set_pte_atomic = xen_set_pte_atomic,
1824 .set_pte_present = xen_set_pte_at,
1825 .pte_clear = xen_pte_clear,
1826 .pmd_clear = xen_pmd_clear,
1827#endif /* CONFIG_X86_PAE */
1828 .set_pud = xen_set_pud_hyper,
1829
1830 .make_pmd = xen_make_pmd,
1831 .pmd_val = xen_pmd_val,
1832
1833#if PAGETABLE_LEVELS == 4
1834 .pud_val = xen_pud_val,
1835 .make_pud = xen_make_pud,
1836 .set_pgd = xen_set_pgd_hyper,
1837
1838 .alloc_pud = xen_alloc_pte_init,
1839 .release_pud = xen_release_pte_init,
1840#endif /* PAGETABLE_LEVELS == 4 */
1841
1842 .activate_mm = xen_activate_mm,
1843 .dup_mmap = xen_dup_mmap,
1844 .exit_mmap = xen_exit_mmap,
1845
1846 .lazy_mode = {
1847 .enter = paravirt_enter_lazy_mmu,
1848 .leave = xen_leave_lazy,
1849 },
1850
1851 .set_fixmap = xen_set_fixmap,
1852};
1853
1854
994025ca
JF
1855#ifdef CONFIG_XEN_DEBUG_FS
1856
1857static struct dentry *d_mmu_debug;
1858
1859static int __init xen_mmu_debugfs(void)
1860{
1861 struct dentry *d_xen = xen_init_debugfs();
1862
1863 if (d_xen == NULL)
1864 return -ENOMEM;
1865
1866 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
1867
1868 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
1869
1870 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
1871 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
1872 &mmu_stats.pgd_update_pinned);
1873 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
1874 &mmu_stats.pgd_update_batched);
1875
1876 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
1877 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
1878 &mmu_stats.pud_update_pinned);
1879 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
1880 &mmu_stats.pud_update_batched);
1881
1882 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
1883 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
1884 &mmu_stats.pmd_update_pinned);
1885 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
1886 &mmu_stats.pmd_update_batched);
1887
1888 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
1889// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
1890// &mmu_stats.pte_update_pinned);
1891 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
1892 &mmu_stats.pte_update_batched);
1893
1894 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
1895 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
1896 &mmu_stats.mmu_update_extended);
1897 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
1898 mmu_stats.mmu_update_histo, 20);
1899
1900 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
1901 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
1902 &mmu_stats.set_pte_at_batched);
1903 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
1904 &mmu_stats.set_pte_at_current);
1905 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
1906 &mmu_stats.set_pte_at_kernel);
1907
1908 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
1909 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
1910 &mmu_stats.prot_commit_batched);
1911
1912 return 0;
1913}
1914fs_initcall(xen_mmu_debugfs);
1915
1916#endif /* CONFIG_XEN_DEBUG_FS */