/*
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>	/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/bootmem.h>	/* for max_low_pfn */
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>

/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)

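/*
 * Give any registered kprobes a chance to claim this fault (trap 14);
 * returns non-zero when a kprobe fault handler consumed it.
 */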
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
#ifdef CONFIG_X86_32
	if (!user_mode_vm(regs)) {
#else
	if (!user_mode(regs)) {
#endif
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

#ifdef CONFIG_X86_32
	/* Catch an obscure case of prefetch inside an NX page: */
	if ((__supported_pte_mask & _PAGE_NX) && (error_code & 16))
		return 0;
#endif

	/* If it was an exec fault on an NX page, ignore */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

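/*
 * Fill in a siginfo for a fault at 'address' and deliver the signal
 * (SIGSEGV or SIGBUS) to the given task.
 */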
static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

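/*
 * Returns non-zero if the page-table entry at 'p' cannot be read safely,
 * i.e. the address itself is not mapped.
 */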
#ifdef CONFIG_X86_64
static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}
#endif

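/*
 * Walk and print the page-table entries that map 'address', for the
 * kernel oops output.
 */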
static void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
	__typeof__(pte_val(__pte(0))) page;

	page = read_cr3();
	page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", page);
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && page & _PAGE_PRESENT) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
							 & (PTRS_PER_PMD - 1)];
		printk(KERN_CONT "*pde = %016Lx ", page);
		page &= ~_PAGE_NX;
	}
#else
	printk("*pde = %08lx ", page);
#endif

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already.
	 */
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && (page & _PAGE_PRESENT)
	    && !(page & _PAGE_PSE)) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
							 & (PTRS_PER_PTE - 1)];
		printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
	}

	printk("\n");
#else /* CONFIG_X86_64 */
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud) || pud_large(*pud))
		goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
#endif
}

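/*
 * 32-bit only: copy the kernel (vmalloc-area) pmd entry for 'address'
 * from the reference page table (init_mm) into the given pgd.  Returns
 * the reference pmd, or NULL if it is not present there either.
 */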
#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */

	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;
	if (!pmd_present(*pmd)) {
		set_pmd(pmd, *pmd_k);
		arch_flush_lazy_mmu_mode();
	} else
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
	return pmd_k;
}
#endif

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here.
   Does nothing for X86_32
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	static int warned;
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps to illegal
 * addresses >4GB.  We catch this in the page fault handler because these
 * addresses are not reachable. Just detect this case and return.  Any code
 * segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
	    (address >> 32))
		return 1;
#endif
	return 0;
}

void do_invalid_op(struct pt_regs *, unsigned long);

static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long nr;
	/*
	 * Pentium F0 0F C7 C8 bug workaround.
	 */
	if (boot_cpu_data.f00f_bug) {
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}

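/*
 * Print the header of the kernel oops for an unhandled kernel-mode fault:
 * NX warning (if applicable), the faulting address, the IP, and the
 * page-table walk for the address.
 */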
static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
			    unsigned long address)
{
#ifdef CONFIG_X86_32
	if (!oops_may_print())
		return;
#endif

#ifdef CONFIG_X86_PAE
	if (error_code & PF_INSTR) {
		unsigned int level;
		pte_t *pte = lookup_address(address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(KERN_CRIT "kernel tried to execute "
				"NX-protected page - exploit attempt? "
				"(uid: %d)\n", current->uid);
	}
#endif

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");
#ifdef CONFIG_X86_32
	printk(KERN_CONT " at %08lx\n", address);
#else
	printk(KERN_CONT " at %016lx\n", address);
#endif
	printk(KERN_ALERT "IP:");
	printk_address(regs->ip, 1);
	dump_pagetable(address);
}

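/*
 * 64-bit only: a reserved bit was set in a page-table entry, which means
 * the page tables are corrupted.  Report it, dump the page tables for the
 * address, and oops.
 */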
#ifdef CONFIG_X86_64
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}
#endif

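/*
 * Check whether the pte actually permits the access that faulted: a write
 * fault on a read-only pte, or an instruction fetch from a non-executable
 * pte, means the fault was real rather than a stale-TLB artifact.
 */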
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
	if ((error_code & PF_WRITE) && !pte_write(*pte))
		return 0;
	if ((error_code & PF_INSTR) && !pte_exec(*pte))
		return 0;

	return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry.  This allows
 * us to lazily refresh the TLB when increasing the permissions of a
 * kernel page (RO -> RW or NX -> X).  Doing it eagerly is very
 * expensive since that implies doing a full cross-processor TLB
 * flush, even if no stale TLB entries exist on other processors.
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 */
static int spurious_fault(unsigned long address,
			  unsigned long error_code)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Reserved-bit violation or user access to kernel space? */
	if (error_code & (PF_USER | PF_RSVD))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	if (pud_large(*pud))
		return spurious_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return spurious_fault_check(error_code, (pte_t *) pmd);

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	return spurious_fault_check(error_code, pte);
}

/*
 * X86_32
 * Handle a fault on the vmalloc or module mapping area
 *
 * X86_64
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;
	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;
	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
	return 0;
#else
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Make sure we are in vmalloc area */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
#endif
}

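/* When non-zero, unhandled user-space segfaults are logged (rate-limited) below. */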
int show_unhandled_signals = 1;

571/*
572 * This routine handles page faults. It determines the address,
573 * and the problem, and then passes it off to one of the appropriate
574 * routines.
1da177e4 575 */
f8c2ee22
HH
576#ifdef CONFIG_X86_64
577asmlinkage
578#endif
579void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
1da177e4
LT
580{
581 struct task_struct *tsk;
582 struct mm_struct *mm;
33cb5243 583 struct vm_area_struct *vma;
1da177e4 584 unsigned long address;
f8c2ee22
HH
585 int write, si_code;
586 int fault;
587#ifdef CONFIG_X86_64
1209140c 588 unsigned long flags;
f8c2ee22 589#endif
1da177e4 590
143a5d32
PZ
591 /*
592 * We can fault from pretty much anywhere, with unknown IRQ state.
593 */
594 trace_hardirqs_fixup();
595
a9ba9a3b
AV
596 tsk = current;
597 mm = tsk->mm;
598 prefetchw(&mm->mmap_sem);
599
1da177e4 600 /* get the address */
f51c9452 601 address = read_cr2();
1da177e4 602
c4aba4a8 603 si_code = SEGV_MAPERR;
1da177e4 604
608566b4
HH
605 if (notify_page_fault(regs))
606 return;
1da177e4
LT
607
608 /*
609 * We fault-in kernel-space virtual memory on-demand. The
610 * 'reference' page table is init_mm.pgd.
611 *
612 * NOTE! We MUST NOT take any locks for this case. We may
613 * be in an interrupt or a critical region, and should
614 * only copy the information from the master page table,
615 * nothing more.
616 *
617 * This verifies that the fault happens in kernel space
618 * (error_code & 4) == 0, and that the fault was not a
8b1bde93 619 * protection error (error_code & 9) == 0.
1da177e4 620 */
f8c2ee22
HH
621#ifdef CONFIG_X86_32
622 if (unlikely(address >= TASK_SIZE)) {
cf89ec92
HH
623#else
624 if (unlikely(address >= TASK_SIZE64)) {
625#endif
f8c2ee22
HH
626 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
627 vmalloc_fault(address) >= 0)
628 return;
5b727a3b
JF
629
630 /* Can handle a stale RO->RW TLB */
631 if (spurious_fault(address, error_code))
632 return;
633
f8c2ee22
HH
634 /*
635 * Don't take the mm semaphore here. If we fixup a prefetch
636 * fault we could otherwise deadlock.
637 */
638 goto bad_area_nosemaphore;
639 }
640
cf89ec92
HH
641
642#ifdef CONFIG_X86_32
f8c2ee22
HH
643 /* It's safe to allow irq's after cr2 has been saved and the vmalloc
644 fault has been handled. */
645 if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
646 local_irq_enable();
647
648 /*
649 * If we're in an interrupt, have no user context or are running in an
650 * atomic region then we must not take the fault.
651 */
652 if (in_atomic() || !mm)
653 goto bad_area_nosemaphore;
654#else /* CONFIG_X86_64 */
65ea5b03 655 if (likely(regs->flags & X86_EFLAGS_IF))
8c914cb7
JB
656 local_irq_enable();
657
66c58156 658 if (unlikely(error_code & PF_RSVD))
1da177e4
LT
659 pgtable_bad(address, regs, error_code);
660
661 /*
33cb5243
HH
662 * If we're in an interrupt, have no user context or are running in an
663 * atomic region then we must not take the fault.
1da177e4
LT
664 */
665 if (unlikely(in_atomic() || !mm))
666 goto bad_area_nosemaphore;
667
dbe3ed1c
LT
668 /*
669 * User-mode registers count as a user access even for any
670 * potential system fault or CPU buglet.
671 */
672 if (user_mode_vm(regs))
673 error_code |= PF_USER;
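/*
 * On X86_64 the out-of-memory path below retries the whole fault from
 * this label when the faulting task is init.
 */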
f8c2ee22
HH
674again:
675#endif
1da177e4
LT
676 /* When running in the kernel we expect faults to occur only to
677 * addresses in user space. All other faults represent errors in the
676b1855 678 * kernel and should generate an OOPS. Unfortunately, in the case of an
80f7228b 679 * erroneous fault occurring in a code path which already holds mmap_sem
1da177e4
LT
680 * we will deadlock attempting to validate the fault against the
681 * address space. Luckily the kernel only validly references user
682 * space from well defined areas of code, which are listed in the
683 * exceptions table.
684 *
685 * As the vast majority of faults will be valid we will only perform
676b1855 686 * the source reference check when there is a possibility of a deadlock.
1da177e4
LT
687 * Attempt to lock the address space, if we cannot we then validate the
688 * source. If this is invalid we can skip the address space check,
689 * thus avoiding the deadlock.
690 */
691 if (!down_read_trylock(&mm->mmap_sem)) {
66c58156 692 if ((error_code & PF_USER) == 0 &&
65ea5b03 693 !search_exception_tables(regs->ip))
1da177e4
LT
694 goto bad_area_nosemaphore;
695 down_read(&mm->mmap_sem);
696 }
697
698 vma = find_vma(mm, address);
699 if (!vma)
700 goto bad_area;
f8c2ee22 701 if (vma->vm_start <= address)
1da177e4
LT
702 goto good_area;
703 if (!(vma->vm_flags & VM_GROWSDOWN))
704 goto bad_area;
33cb5243 705 if (error_code & PF_USER) {
6f4d368e
HH
706 /*
707 * Accessing the stack below %sp is always a bug.
708 * The large cushion allows instructions like enter
709 * and pusha to work. ("enter $65535,$31" pushes
710 * 32 pointers and then decrements %sp by 65535.)
03fdc2c2 711 */
65ea5b03 712 if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
1da177e4
LT
713 goto bad_area;
714 }
715 if (expand_stack(vma, address))
716 goto bad_area;
717/*
718 * Ok, we have a good vm_area for this memory access, so
719 * we can handle it..
720 */
721good_area:
c4aba4a8 722 si_code = SEGV_ACCERR;
1da177e4 723 write = 0;
66c58156 724 switch (error_code & (PF_PROT|PF_WRITE)) {
33cb5243
HH
725 default: /* 3: write, present */
726 /* fall through */
727 case PF_WRITE: /* write, not present */
728 if (!(vma->vm_flags & VM_WRITE))
729 goto bad_area;
730 write++;
731 break;
732 case PF_PROT: /* read, present */
733 goto bad_area;
734 case 0: /* read, not present */
735 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
1da177e4 736 goto bad_area;
1da177e4
LT
737 }
738
f8c2ee22
HH
739#ifdef CONFIG_X86_32
740survive:
741#endif
1da177e4
LT
742 /*
743 * If for any reason at all we couldn't handle the fault,
744 * make sure we exit gracefully rather than endlessly redo
745 * the fault.
746 */
83c54070
NP
747 fault = handle_mm_fault(mm, vma, address, write);
748 if (unlikely(fault & VM_FAULT_ERROR)) {
749 if (fault & VM_FAULT_OOM)
750 goto out_of_memory;
751 else if (fault & VM_FAULT_SIGBUS)
752 goto do_sigbus;
753 BUG();
1da177e4 754 }
83c54070
NP
755 if (fault & VM_FAULT_MAJOR)
756 tsk->maj_flt++;
757 else
758 tsk->min_flt++;
d729ab35
HH
759
760#ifdef CONFIG_X86_32
761 /*
762 * Did it hit the DOS screen memory VA from vm86 mode?
763 */
764 if (v8086_mode(regs)) {
765 unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
766 if (bit < 32)
767 tsk->thread.screen_bitmap |= 1 << bit;
768 }
769#endif
1da177e4
LT
770 up_read(&mm->mmap_sem);
771 return;
772
773/*
774 * Something tried to access memory that isn't in our memory map..
775 * Fix it, but check if it's kernel or user first..
776 */
777bad_area:
778 up_read(&mm->mmap_sem);
779
780bad_area_nosemaphore:
1da177e4 781 /* User mode accesses just cause a SIGSEGV */
66c58156 782 if (error_code & PF_USER) {
e5e3c84b
SR
783 /*
784 * It's possible to have interrupts off here.
785 */
786 local_irq_enable();
787
1156e098
HH
788 /*
789 * Valid to do another page fault here because this one came
790 * from user space.
791 */
1da177e4
LT
792 if (is_prefetch(regs, address, error_code))
793 return;
794
35f3266f 795 if (is_errata100(regs, address))
1da177e4
LT
796 return;
797
abd4f750
MAS
798 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
799 printk_ratelimit()) {
1da177e4 800 printk(
6f4d368e 801#ifdef CONFIG_X86_32
edcd8119 802 "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
6f4d368e 803#else
03252919 804 "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
6f4d368e
HH
805#endif
806 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
807 tsk->comm, task_pid_nr(tsk), address, regs->ip,
808 regs->sp, error_code);
03252919
AK
809 print_vma_addr(" in ", regs->ip);
810 printk("\n");
1da177e4 811 }
33cb5243 812
1da177e4
LT
813 tsk->thread.cr2 = address;
814 /* Kernel addresses are always protection faults */
815 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
816 tsk->thread.trap_no = 14;
c4aba4a8 817 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
1da177e4
LT
818 return;
819 }
820
29caf2f9
HH
821 if (is_f00f_bug(regs, address))
822 return;
823
1da177e4 824no_context:
1da177e4 825 /* Are we prepared to handle this kernel fault? */
33cb5243 826 if (fixup_exception(regs))
1da177e4 827 return;
1da177e4 828
33cb5243 829 /*
f8c2ee22
HH
830 * X86_32
831 * Valid to do another page fault here, because if this fault
832 * had been triggered by is_prefetch fixup_exception would have
833 * handled it.
834 *
835 * X86_64
1da177e4
LT
836 * Hall of shame of CPU/BIOS bugs.
837 */
33cb5243
HH
838 if (is_prefetch(regs, address, error_code))
839 return;
1da177e4
LT
840
841 if (is_errata93(regs, address))
33cb5243 842 return;
1da177e4
LT
843
844/*
845 * Oops. The kernel tried to access some bad page. We'll have to
846 * terminate things with extreme prejudice.
847 */
f8c2ee22
HH
848#ifdef CONFIG_X86_32
849 bust_spinlocks(1);
fd40d6e3
HH
850#else
851 flags = oops_begin();
852#endif
f8c2ee22
HH
853
854 show_fault_oops(regs, error_code, address);
1da177e4 855
f8c2ee22
HH
856 tsk->thread.cr2 = address;
857 tsk->thread.trap_no = 14;
858 tsk->thread.error_code = error_code;
fd40d6e3
HH
859
860#ifdef CONFIG_X86_32
f8c2ee22
HH
861 die("Oops", regs, error_code);
862 bust_spinlocks(0);
863 do_exit(SIGKILL);
fd40d6e3 864#else
22f5991c
JB
865 if (__die("Oops", regs, error_code))
866 regs = NULL;
1da177e4
LT
867 /* Executive summary in case the body of the oops scrolled away */
868 printk(KERN_EMERG "CR2: %016lx\n", address);
22f5991c 869 oops_end(flags, regs, SIGKILL);
f8c2ee22 870#endif
1da177e4
LT
871
872/*
873 * We ran out of memory, or some other thing happened to us that made
874 * us unable to handle the page fault gracefully.
875 */
876out_of_memory:
877 up_read(&mm->mmap_sem);
f8c2ee22
HH
878 if (is_global_init(tsk)) {
879 yield();
fd40d6e3 880#ifdef CONFIG_X86_32
f8c2ee22
HH
881 down_read(&mm->mmap_sem);
882 goto survive;
f8c2ee22 883#else
1da177e4 884 goto again;
f8c2ee22 885#endif
fd40d6e3
HH
886 }
887
1da177e4 888 printk("VM: killing process %s\n", tsk->comm);
318aa296 889 if (error_code & PF_USER)
021daae2 890 do_group_exit(SIGKILL);
1da177e4
LT
891 goto no_context;
892
893do_sigbus:
894 up_read(&mm->mmap_sem);
895
896 /* Kernel mode? Handle exceptions or die */
66c58156 897 if (!(error_code & PF_USER))
1da177e4 898 goto no_context;
f8c2ee22
HH
899#ifdef CONFIG_X86_32
900 /* User space => ok to do another page fault */
901 if (is_prefetch(regs, address, error_code))
902 return;
903#endif
1da177e4
LT
904 tsk->thread.cr2 = address;
905 tsk->thread.error_code = error_code;
906 tsk->thread.trap_no = 14;
c4aba4a8 907 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
1da177e4 908}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

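/*
 * Sync the vmalloc-area kernel mappings from the reference page table
 * into every pgd on pgd_list; each process has its own top-level page
 * table, so new kernel mappings must be propagated.  On 32-bit this is
 * skipped entirely when SHARED_KERNEL_PMD.
 */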
913void vmalloc_sync_all(void)
914{
1156e098
HH
915#ifdef CONFIG_X86_32
916 /*
917 * Note that races in the updates of insync and start aren't
918 * problematic: insync can only get set bits added, and updates to
919 * start are only improving performance (without affecting correctness
920 * if undone).
921 */
922 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
923 static unsigned long start = TASK_SIZE;
924 unsigned long address;
925
926 if (SHARED_KERNEL_PMD)
927 return;
928
929 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
930 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
931 if (!test_bit(pgd_index(address), insync)) {
932 unsigned long flags;
933 struct page *page;
934
935 spin_lock_irqsave(&pgd_lock, flags);
e3ed910d 936 list_for_each_entry(page, &pgd_list, lru) {
1156e098 937 if (!vmalloc_sync_one(page_address(page),
e3ed910d 938 address))
1156e098 939 break;
e3ed910d 940 }
1156e098
HH
941 spin_unlock_irqrestore(&pgd_lock, flags);
942 if (!page)
943 set_bit(pgd_index(address), insync);
944 }
945 if (address == start && test_bit(pgd_index(address), insync))
946 start = address + PGDIR_SIZE;
947 }
948#else /* CONFIG_X86_64 */
6f4d368e
HH
949 /*
950 * Note that races in the updates of insync and start aren't
951 * problematic: insync can only get set bits added, and updates to
952 * start are only improving performance (without affecting correctness
953 * if undone).
954 */
8c914cb7
JB
955 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
956 static unsigned long start = VMALLOC_START & PGDIR_MASK;
957 unsigned long address;
958
959 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
960 if (!test_bit(pgd_index(address), insync)) {
961 const pgd_t *pgd_ref = pgd_offset_k(address);
58d5d0d8 962 unsigned long flags;
8c914cb7
JB
963 struct page *page;
964
965 if (pgd_none(*pgd_ref))
966 continue;
58d5d0d8 967 spin_lock_irqsave(&pgd_lock, flags);
2bff7383 968 list_for_each_entry(page, &pgd_list, lru) {
8c914cb7
JB
969 pgd_t *pgd;
970 pgd = (pgd_t *)page_address(page) + pgd_index(address);
971 if (pgd_none(*pgd))
972 set_pgd(pgd, *pgd_ref);
973 else
46a82b2d 974 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
8c914cb7 975 }
58d5d0d8 976 spin_unlock_irqrestore(&pgd_lock, flags);
8c914cb7
JB
977 set_bit(pgd_index(address), insync);
978 }
979 if (address == start)
980 start = address + PGDIR_SIZE;
981 }
982 /* Check that there is no need to do the same for the modules area. */
983 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
33cb5243 984 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
8c914cb7 985 (__START_KERNEL & PGDIR_MASK)));
1156e098 986#endif
8c914cb7 987}