/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

#ifdef CONFIG_GART_IOMMU
extern int swiotlb;
#endif

extern char _stext[];

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguous in
 * physical memory, so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

void show_mem(void)
{
        long i, total = 0, reserved = 0;
        long shared = 0, cached = 0;
        pg_data_t *pgdat;
        struct page *page;

        printk(KERN_INFO "Mem-info:\n");
        show_free_areas();
        printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

        for_each_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
                        page = pfn_to_page(pgdat->node_start_pfn + i);
                        total++;
                        if (PageReserved(page))
                                reserved++;
                        else if (PageSwapCache(page))
                                cached++;
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
        }
        printk(KERN_INFO "%lu pages of RAM\n", total);
        printk(KERN_INFO "%lu reserved pages\n", reserved);
        printk(KERN_INFO "%lu pages shared\n", shared);
        printk(KERN_INFO "%lu pages swap cached\n", cached);
}

/* References to section boundaries */

extern char _text, _etext, _edata, __bss_start, _end[];
extern char __init_begin, __init_end;

int after_bootmem;

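/*
 * Allocate a zeroed page for use as a page table.  Before mem_init() has
 * run (after_bootmem == 0) the page comes from the bootmem allocator;
 * afterwards it comes from the normal page allocator.
 */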
static void *spp_getpage(void)
{
        void *ptr;
        if (after_bootmem)
                ptr = (void *) get_zeroed_page(GFP_ATOMIC);
        else
                ptr = alloc_bootmem_pages(PAGE_SIZE);
        if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
                panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");

        Dprintk("spp_getpage %p\n", ptr);
        return ptr;
}

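/*
 * Install a single kernel pte mapping vaddr to phys with the given
 * protection, allocating intermediate pmd/pte tables through spp_getpage()
 * as needed.  The top-level (pgd) entry for the fixmap area is expected to
 * have been set up in head.S already.  Used by __set_fixmap() below.
 */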
static void set_pte_phys(unsigned long vaddr,
                         unsigned long phys, pgprot_t prot)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;

        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

        pgd = pgd_offset_k(vaddr);
        if (pgd_none(*pgd)) {
                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                pmd = (pmd_t *) spp_getpage();
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
                if (pmd != pmd_offset(pud, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
                        return;
                }
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }
        new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

        pte = pte_offset_kernel(pmd, vaddr);
        if (!pte_none(*pte) &&
            pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
                pte_ERROR(*pte);
        set_pte(pte, new_pte);

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

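/*
 * Illustrative use of __set_fixmap() (the slot and address names below are
 * only examples, not necessarily actual callers in this tree):
 *
 *        __set_fixmap(FIX_APIC_BASE, apic_phys, PAGE_KERNEL_NOCACHE);
 *
 * would map the page at physical address apic_phys at the fixed virtual
 * address reserved for the FIX_APIC_BASE slot, with an uncached kernel
 * mapping.
 */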
/* NOTE: this is meant to be run only at boot */
void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
                return;
        }
        set_pte_phys(address, phys, prot);
}

unsigned long __initdata table_start, table_end;

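/*
 * Early page-table pages are taken from the physical pages starting at
 * table_start (see find_early_table_space()).  Since the full direct mapping
 * of memory does not exist yet, a freshly allocated page is reached through
 * one of two temporary 2MB windows: temp_boot_pmds[] are pmd slots prepared
 * by the early boot code, and alloc_low_page() points one of them at the
 * 2MB region containing the new page, returning a pointer inside the window.
 * unmap_low_page() tears the window down again.
 */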
extern pmd_t temp_boot_pmds[];

static struct temp_map {
        pmd_t *pmd;
        void *address;
        int allocated;
} temp_mappings[] __initdata = {
        { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
        { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) },
        {}
};

static __init void *alloc_low_page(int *index, unsigned long *phys)
{
        struct temp_map *ti;
        int i;
        unsigned long pfn = table_end++, paddr;
        void *adr;

        if (pfn >= end_pfn)
                panic("alloc_low_page: ran out of memory");
        for (i = 0; temp_mappings[i].allocated; i++) {
                if (!temp_mappings[i].pmd)
                        panic("alloc_low_page: ran out of temp mappings");
        }
        ti = &temp_mappings[i];
        paddr = (pfn << PAGE_SHIFT) & PMD_MASK;
        set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE));
        ti->allocated = 1;
        __flush_tlb();
        adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK);
        *index = i;
        *phys = pfn * PAGE_SIZE;
        return adr;
}

static __init void unmap_low_page(int i)
{
        struct temp_map *ti = &temp_mappings[i];
        set_pmd(ti->pmd, __pmd(0));
        ti->allocated = 0;
}

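/*
 * Fill one pud worth of the kernel direct mapping, covering physical
 * addresses [address, end).  Mappings are made with 2MB (PSE) pmd entries;
 * 1GB ranges that do not appear in the e820 map at all are skipped, and the
 * NX/GLOBAL bits are filtered through __supported_pte_mask.
 */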
static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
{
        long i, j;

        i = pud_index(address);
        pud = pud + i;
        for (; i < PTRS_PER_PUD; pud++, i++) {
                int map;
                unsigned long paddr, pmd_phys;
                pmd_t *pmd;

                paddr = address + i*PUD_SIZE;
                if (paddr >= end) {
                        for (; i < PTRS_PER_PUD; i++, pud++)
                                set_pud(pud, __pud(0));
                        break;
                }

                if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
                        set_pud(pud, __pud(0));
                        continue;
                }

                pmd = alloc_low_page(&map, &pmd_phys);
                set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
                for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
                        unsigned long pe;

                        if (paddr >= end) {
                                for (; j < PTRS_PER_PMD; j++, pmd++)
                                        set_pmd(pmd, __pmd(0));
                                break;
                        }
                        pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr;
                        pe &= __supported_pte_mask;
                        set_pmd(pmd, __pmd(pe));
                }
                unmap_low_page(map);
        }
        __flush_tlb();
}

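/*
 * Worst-case sizing for the early page tables: one pud entry per 1GB and one
 * pmd entry per 2MB of address space, rounded up to whole pages.  As a rough
 * illustrative example (assuming 8-byte table entries), end = 4GB gives
 * puds = 4 and pmds = 2048, i.e.
 * round_up(4*8, 4096) + round_up(2048*8, 4096) = 4096 + 16384 = 20480 bytes,
 * or 5 pages.  The space is taken from the e820-free area between 0x8000 and
 * the start of the kernel text.
 */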
static void __init find_early_table_space(unsigned long end)
{
        unsigned long puds, pmds, tables;

        puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
        pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
        tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
                 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

        table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables);
        if (table_start == -1UL)
                panic("Cannot find space for the kernel page tables");

        table_start >>= PAGE_SHIFT;
        table_end = table_start;
}

/* Set up the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from
   the physical memory.  To access them they are temporarily mapped. */
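/*
 * The range is walked in PGDIR_SIZE (512GB) chunks: for each chunk a pud
 * page is allocated through the temporary mappings, filled by
 * phys_pud_init(), and then hooked into the kernel pgd with set_pgd().
 */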
void __init init_memory_mapping(unsigned long start, unsigned long end)
{
        unsigned long next;

        Dprintk("init_memory_mapping\n");

        /*
         * Find space for the kernel direct mapping tables.
         * Later we should allocate these tables in the local node of the memory
         * mapped.  Unfortunately this is done currently before the nodes are
         * discovered.
         */
        find_early_table_space(end);

        start = (unsigned long)__va(start);
        end = (unsigned long)__va(end);

        for (; start < end; start = next) {
                int map;
                unsigned long pud_phys;
                pud_t *pud = alloc_low_page(&map, &pud_phys);
                next = start + PGDIR_SIZE;
                if (next > end)
                        next = end;
                phys_pud_init(pud, __pa(start), __pa(next));
                set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
                unmap_low_page(map);
        }

        asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
        __flush_tlb_all();
        early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end,
                     table_start<<PAGE_SHIFT,
                     table_end<<PAGE_SHIFT);
}

extern struct x8664_pda cpu_pda[NR_CPUS];

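/*
 * Drop the low identity mappings that were only needed during early boot.
 * The boot CPU clears pgd entry 0 of its live page table; the secondary
 * CPUs simply switch cr3 to init_level4_pgt, where the identity mappings
 * are already zapped, and flush.
 */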
void __cpuinit zap_low_mappings(int cpu)
{
        if (cpu == 0) {
                pgd_t *pgd = pgd_offset_k(0UL);
                pgd_clear(pgd);
        } else {
                /*
                 * For AP's, zap the low identity mappings by changing the cr3
                 * to init_level4_pgt and doing local flush tlb all
                 */
                asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
        }
        __flush_tlb_all();
}

/* Compute zone sizes for the DMA and DMA32 zones in a node. */
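/*
 * Zone sizes are first computed as cumulative sizes from start_pfn and the
 * lower zones are then subtracted out.  As an illustrative example (with the
 * usual 16MB DMA and 4GB DMA32 limits and 4KB pages), a node spanning
 * pfns 0..2097151 (8GB) ends up with roughly:
 *        ZONE_DMA    = 4096 pages        (0    - 16MB)
 *        ZONE_DMA32  = 1044480 pages     (16MB - 4GB)
 *        ZONE_NORMAL = 1048576 pages     (4GB  - 8GB)
 * The hole array additionally accounts for e820 holes, the mem_map itself
 * and (for ZONE_DMA) the dma_reserve pages accumulated by
 * reserve_bootmem_generic().
 */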
__init void
size_zones(unsigned long *z, unsigned long *h,
           unsigned long start_pfn, unsigned long end_pfn)
{
        int i;
        unsigned long w;

        for (i = 0; i < MAX_NR_ZONES; i++)
                z[i] = 0;

        if (start_pfn < MAX_DMA_PFN)
                z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
        if (start_pfn < MAX_DMA32_PFN) {
                unsigned long dma32_pfn = MAX_DMA32_PFN;
                if (dma32_pfn > end_pfn)
                        dma32_pfn = end_pfn;
                z[ZONE_DMA32] = dma32_pfn - start_pfn;
        }
        z[ZONE_NORMAL] = end_pfn - start_pfn;

        /* Remove lower zones from higher ones. */
        w = 0;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                if (z[i])
                        z[i] -= w;
                w += z[i];
        }

        /* Compute holes */
        w = 0;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                unsigned long s = w;
                w += z[i];
                h[i] = e820_hole_size(s, w);
        }

        /* Add the space needed for mem_map to the holes too. */
        for (i = 0; i < MAX_NR_ZONES; i++)
                h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;

        /* The 16MB DMA zone has the kernel and other misc mappings.
           Account them too */
        if (h[ZONE_DMA]) {
                h[ZONE_DMA] += dma_reserve;
                if (h[ZONE_DMA] >= z[ZONE_DMA]) {
                        printk(KERN_WARNING
                                "Kernel too large and filling up ZONE_DMA?\n");
                        h[ZONE_DMA] = z[ZONE_DMA];
                }
        }
}

#ifndef CONFIG_NUMA
void __init paging_init(void)
{
        unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
        size_zones(zones, holes, 0, end_pfn);
        free_area_init_node(0, NODE_DATA(0), zones,
                            __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
}
#endif

/* Unmap a kernel mapping if it exists.  This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines.  address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
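/*
 * Typically used by the GART IOMMU code to take the graphics aperture out of
 * the kernel direct mapping, so that speculative CPU prefetches cannot touch
 * it (the caller list here is an assumption, not exhaustive).
 */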
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
        unsigned long end = address + size;

        BUG_ON(address & ~LARGE_PAGE_MASK);
        BUG_ON(size & ~LARGE_PAGE_MASK);

        for (; address < end; address += LARGE_PAGE_SIZE) {
                pgd_t *pgd = pgd_offset_k(address);
                pud_t *pud;
                pmd_t *pmd;
                if (pgd_none(*pgd))
                        continue;
                pud = pud_offset(pgd, address);
                if (pud_none(*pud))
                        continue;
                pmd = pmd_offset(pud, address);
                if (!pmd || pmd_none(*pmd))
                        continue;
                if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
                        /* Could handle this, but it should not happen currently. */
                        printk(KERN_ERR
                               "clear_kernel_mapping: mapping has been split. will leak memory\n");
                        pmd_ERROR(*pmd);
                }
                set_pmd(pmd, __pmd(0));
        }
        __flush_tlb_all();
}

static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
                         kcore_vsyscall;

void __init mem_init(void)
{
        long codesize, reservedpages, datasize, initsize;

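        /*
         * With no hardware IOMMU aperture and RAM above 4GB (or when an
         * IOMMU is forced), fall back to the software bounce-buffer
         * implementation (swiotlb) so 32-bit DMA devices still work.
         */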
#ifdef CONFIG_SWIOTLB
        if (!iommu_aperture &&
            (end_pfn >= 0xffffffff>>PAGE_SHIFT || force_iommu))
                swiotlb = 1;
        if (swiotlb)
                swiotlb_init();
#endif

        /* How many end-of-memory variables you have, grandma! */
        max_low_pfn = end_pfn;
        max_pfn = end_pfn;
        num_physpages = end_pfn;
        high_memory = (void *) __va(end_pfn * PAGE_SIZE);

        /* clear the zero-page */
        memset(empty_zero_page, 0, PAGE_SIZE);

        reservedpages = 0;

        /* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
        totalram_pages = numa_free_all_bootmem();
#else
        totalram_pages = free_all_bootmem();
#endif
        reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);

        after_bootmem = 1;

        codesize = (unsigned long) &_etext - (unsigned long) &_text;
        datasize = (unsigned long) &_edata - (unsigned long) &_etext;
        initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

        /* Register memory areas for /proc/kcore */
        kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
        kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
                   VMALLOC_END-VMALLOC_START);
        kclist_add(&kcore_kernel, &_stext, _end - _stext);
        kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
        kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
                   VSYSCALL_END - VSYSCALL_START);

        printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
                (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
                end_pfn << (PAGE_SHIFT-10),
                codesize >> 10,
                reservedpages << (PAGE_SHIFT-10),
                datasize >> 10,
                initsize >> 10);

#ifdef CONFIG_SMP
        /*
         * Sync boot_level4_pgt mappings with the init_level4_pgt
         * except for the low identity mappings which are already zapped
         * in init_level4_pgt. This sync-up is essential for AP's bringup
         */
        memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
#endif
}

extern char __initdata_begin[], __initdata_end[];

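/*
 * Free the memory occupied by the .init sections.  The pages are returned to
 * the page allocator; the freed range is also filled with 0xcc (the x86
 * breakpoint opcode) so that any stray reference into the discarded init
 * code or data traps immediately instead of silently running garbage.
 */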
void free_initmem(void)
{
        unsigned long addr;

        addr = (unsigned long)(&__init_begin);
        for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                set_page_count(virt_to_page(addr), 1);
                memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE);
                free_page(addr);
                totalram_pages++;
        }
        memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
        printk("Freeing unused kernel memory: %luk freed\n", (&__init_end - &__init_begin) >> 10);
}

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
        if (start < (unsigned long)&_end)
                return;
        printk("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
        for (; start < end; start += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(start));
                set_page_count(virt_to_page(start), 1);
                free_page(start);
                totalram_pages++;
        }
}
#endif

void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
        /* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
        int nid = phys_to_nid(phys);
        reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
        reserve_bootmem(phys, len);
#endif
        if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
                dma_reserve += len / PAGE_SIZE;
}

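/*
 * Check whether a kernel virtual address is backed by an actual page.  The
 * "above" test rejects non-canonical addresses (the bits above the
 * implemented virtual address width must be all zeroes or all ones); the
 * rest is a straightforward page-table walk that also copes with 2MB
 * large-page mappings.
 */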
int kern_addr_valid(unsigned long addr)
{
        unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (above != 0 && above != -1UL)
                return 0;

        pgd = pgd_offset_k(addr);
        if (pgd_none(*pgd))
                return 0;

        pud = pud_offset(pgd, addr);
        if (pud_none(*pud))
                return 0;

        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return 0;
        if (pmd_large(*pmd))
                return pfn_valid(pmd_pfn(*pmd));

        pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte))
                return 0;
        return pfn_valid(pte_pfn(*pte));
}

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>

extern int exception_trace, page_fault_trace;

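/*
 * Export the debugging knobs as sysctls under the "debug" directory, i.e.
 * /proc/sys/debug/exception-trace (and /proc/sys/debug/page-fault-trace when
 * CONFIG_CHECKING is enabled).
 */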
static ctl_table debug_table2[] = {
        { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
          proc_dointvec },
#ifdef CONFIG_CHECKING
        { 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL,
          proc_dointvec },
#endif
        { 0, }
};

static ctl_table debug_root_table2[] = {
        { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
          .child = debug_table2 },
        { 0 },
};

static __init int x8664_sysctl_init(void)
{
        register_sysctl_table(debug_root_table2, 1);
        return 0;
}
__initcall(x8664_sysctl_init);
#endif

/* A pseudo VMA to allow ptrace access for the vsyscall page.  This only
   covers the 64bit vsyscall page now.  32bit has a real VMA now and does
   not need special handling anymore. */

static struct vm_area_struct gate_vma = {
        .vm_start = VSYSCALL_START,
        .vm_end = VSYSCALL_END,
        .vm_page_prot = PAGE_READONLY
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
        if (test_tsk_thread_flag(tsk, TIF_IA32))
                return NULL;
#endif
        return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
        struct vm_area_struct *vma = get_gate_vma(task);
        if (!vma)
                return 0;
        return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context.  It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
        return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}