/*
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/bootmem.h>		/* for max_low_pfn */
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>

/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
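
/*
 * Example decode (illustrative): error_code 0x2 is a kernel-mode write to a
 * not-present page; error_code 0x7 (PF_PROT|PF_WRITE|PF_USER) is a user-mode
 * write that hit a present page whose protections forbid it.
 */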

static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
#ifdef CONFIG_X86_32
	if (!user_mode_vm(regs)) {
#else
	if (!user_mode(regs)) {
#endif
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

	/*
	 * If it was an exec (instruction fetch) fault on an NX page, then
	 * do not ignore the fault:
	 */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

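/*
 * Fill in a siginfo_t for the faulting address and deliver si_signo
 * (SIGSEGV or SIGBUS here) to the given task.
 */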
static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

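/* Returns non-zero if the page-table entry at *p cannot be read safely. */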
#ifdef CONFIG_X86_64
static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}
#endif

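/*
 * Walk the page tables for 'address' and print each level that is present,
 * for use in oops reports (separate 32-bit and 64-bit variants below).
 */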
static void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
	__typeof__(pte_val(__pte(0))) page;

	page = read_cr3();
	page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", page);
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && page & _PAGE_PRESENT) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
							 & (PTRS_PER_PMD - 1)];
		printk(KERN_CONT "*pde = %016Lx ", page);
		page &= ~_PAGE_NX;
	}
#else
	printk("*pde = %08lx ", page);
#endif

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already.
	 */
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && (page & _PAGE_PRESENT)
	    && !(page & _PAGE_PSE)) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
							 & (PTRS_PER_PTE - 1)];
		printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
	}

	printk("\n");
#else /* CONFIG_X86_64 */
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud) || pud_large(*pud))
		goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
#endif
}

#ifdef CONFIG_X86_32
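/*
 * Copy the kernel-space pmd entry covering 'address' from the init_mm
 * reference page table into the given pgd; returns the reference pmd,
 * or NULL if the reference entry is not populated.
 */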
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */

	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;
	if (!pmd_present(*pmd)) {
		set_pmd(pmd, *pmd_k);
		arch_flush_lazy_mmu_mode();
	} else
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
	return pmd_k;
}
#endif

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here.
   Does nothing for X86_32
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	static int warned;
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps to
 * illegal addresses >4GB. We catch this in the page fault handler because
 * these addresses are not reachable. Just detect this case and return.
 * Any code segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
	    (address >> 32))
		return 1;
#endif
	return 0;
}

void do_invalid_op(struct pt_regs *, unsigned long);

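/*
 * Pentium "F00F" erratum workaround: the IDT is mapped read-only so the
 * erratum triggers a page fault instead of hanging the CPU; if the fault
 * address corresponds to IDT entry 6, re-deliver it as an invalid opcode.
 */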
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long nr;
	/*
	 * Pentium F0 0F C7 C8 bug workaround.
	 */
	if (boot_cpu_data.f00f_bug) {
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}

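/*
 * Print the "BUG: unable to handle kernel ..." banner, the faulting
 * address and IP, and a dump of the page-table walk for an oops.
 */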
static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
			    unsigned long address)
{
#ifdef CONFIG_X86_32
	if (!oops_may_print())
		return;
#endif

#ifdef CONFIG_X86_PAE
	if (error_code & PF_INSTR) {
		unsigned int level;
		pte_t *pte = lookup_address(address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(KERN_CRIT "kernel tried to execute "
				"NX-protected page - exploit attempt? "
				"(uid: %d)\n", current->uid);
	}
#endif

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");
#ifdef CONFIG_X86_32
	printk(KERN_CONT " at %08lx\n", address);
#else
	printk(KERN_CONT " at %016lx\n", address);
#endif
	printk(KERN_ALERT "IP:");
	printk_address(regs->ip, 1);
	dump_pagetable(address);
}

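/*
 * 64-bit only: a reserved bit was set in a page-table entry, i.e. the
 * page tables are corrupted. Report the task and address, then oops.
 */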
#ifdef CONFIG_X86_64
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}
#endif

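/*
 * Helper for spurious_fault(): the fault was spurious only if the current
 * (already updated) entry permits the access that faulted.
 */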
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
	if ((error_code & PF_WRITE) && !pte_write(*pte))
		return 0;
	if ((error_code & PF_INSTR) && !pte_exec(*pte))
		return 0;

	return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry. This allows
 * us to lazily refresh the TLB when increasing the permissions of a
 * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
 * expensive since that implies doing a full cross-processor TLB
 * flush, even if no stale TLB entries exist on other processors.
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 */
static int spurious_fault(unsigned long address,
			  unsigned long error_code)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Reserved-bit violation or user access to kernel space? */
	if (error_code & (PF_USER | PF_RSVD))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	if (pud_large(*pud))
		return spurious_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return spurious_fault_check(error_code, (pte_t *) pmd);

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	return spurious_fault_check(error_code, pte);
}

/*
 * X86_32
 * Handle a fault on the vmalloc or module mapping area
 *
 * X86_64
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;
	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;
	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
	return 0;
#else
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Make sure we are in vmalloc area */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
#endif
}

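/* Non-zero enables the rate-limited "segfault at ..." logging below. */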
int show_unhandled_signals = 1;

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
#ifdef CONFIG_X86_64
asmlinkage
#endif
void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, si_code;
	int fault;
#ifdef CONFIG_X86_64
	unsigned long flags;
#endif

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	si_code = SEGV_MAPERR;

	if (notify_page_fault(regs))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
#ifdef CONFIG_X86_32
	if (unlikely(address >= TASK_SIZE)) {
#else
	if (unlikely(address >= TASK_SIZE64)) {
#endif
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    vmalloc_fault(address) >= 0)
			return;

		/* Can handle a stale RO->RW TLB */
		if (spurious_fault(address, error_code))
			return;

		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

#ifdef CONFIG_X86_32
	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
	   fault has been handled. */
	if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
		local_irq_enable();

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (in_atomic() || !mm)
		goto bad_area_nosemaphore;
#else /* CONFIG_X86_64 */
	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;
again:
#endif
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space. All other faults represent errors in the
	 * kernel and should generate an OOPS. Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space. Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source. If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/*
		 * Accessing the stack below %sp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work. ("enter $65535,$31" pushes
		 * 32 pointers and then decrements %sp by 65535.)
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	write = 0;
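	/*
	 * Map the hardware error code onto the vma access rights that must
	 * be present for this access to be legal.
	 */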
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}

#ifdef CONFIG_X86_32
survive:
#endif
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;

#ifdef CONFIG_X86_32
	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (v8086_mode(regs)) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
#endif
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		/*
		 * Valid to do another page fault here because this one came
		 * from user space.
		 */
		if (is_prefetch(regs, address, error_code))
			return;

		if (is_errata100(regs, address))
			return;

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
#ifdef CONFIG_X86_32
			"%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
#else
			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
#endif
			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
			tsk->comm, task_pid_nr(tsk), address, regs->ip,
			regs->sp, error_code);
			print_vma_addr(" in ", regs->ip);
			printk("\n");
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

	if (is_f00f_bug(regs, address))
		return;

no_context:
	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * X86_32
	 * Valid to do another page fault here, because if this fault
	 * had been triggered by is_prefetch fixup_exception would have
	 * handled it.
	 *
	 * X86_64
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */
#ifdef CONFIG_X86_32
	bust_spinlocks(1);
#else
	flags = oops_begin();
#endif

	show_fault_oops(regs, error_code, address);

	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;

#ifdef CONFIG_X86_32
	die("Oops", regs, error_code);
	bust_spinlocks(0);
	do_exit(SIGKILL);
#else
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);
#endif

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(tsk)) {
		yield();
#ifdef CONFIG_X86_32
		down_read(&mm->mmap_sem);
		goto survive;
#else
		goto again;
#endif
	}

	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;
#ifdef CONFIG_X86_32
	/* User space => ok to do another page fault */
	if (is_prefetch(regs, address, error_code))
		return;
#endif
	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

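/*
 * Bring the kernel-space (vmalloc) entries of every pgd on pgd_list back
 * in sync with the init_mm reference page table.
 */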
void vmalloc_sync_all(void)
{
#ifdef CONFIG_X86_32
	/*
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * if undone).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = TASK_SIZE;
	unsigned long address;

	if (SHARED_KERNEL_PMD)
		return;

	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			unsigned long flags;
			struct page *page;

			spin_lock_irqsave(&pgd_lock, flags);
			list_for_each_entry(page, &pgd_list, lru) {
				if (!vmalloc_sync_one(page_address(page),
						      address))
					break;
			}
			spin_unlock_irqrestore(&pgd_lock, flags);
			if (!page)
				set_bit(pgd_index(address), insync);
		}
		if (address == start && test_bit(pgd_index(address), insync))
			start = address + PGDIR_SIZE;
	}
#else /* CONFIG_X86_64 */
	/*
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * if undone).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			unsigned long flags;
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock_irqsave(&pgd_lock, flags);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock_irqrestore(&pgd_lock, flags);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
#endif
}