]>
Commit | Line | Data |
---|---|---|
5234f5eb | 1 | /* |
835c34a1 | 2 | * handle transition of Linux booting another kernel |
5234f5eb EB |
3 | * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> |
4 | * | |
5 | * This source code is licensed under the GNU General Public License, | |
6 | * Version 2. See the file COPYING for more details. | |
7 | */ | |
8 | ||
9 | #include <linux/mm.h> | |
10 | #include <linux/kexec.h> | |
5234f5eb EB |
11 | #include <linux/string.h> |
12 | #include <linux/reboot.h> | |
fd59d231 | 13 | #include <linux/numa.h> |
f43fdad8 | 14 | #include <linux/ftrace.h> |
fef3a7a1 | 15 | #include <linux/io.h> |
fee7b0d8 | 16 | #include <linux/suspend.h> |
f43fdad8 | 17 | |
5234f5eb | 18 | #include <asm/pgtable.h> |
5234f5eb EB |
19 | #include <asm/tlbflush.h> |
20 | #include <asm/mmu_context.h> | |
8bf27556 | 21 | |
53594547 HY |
22 | static int init_one_level2_page(struct kimage *image, pgd_t *pgd, |
23 | unsigned long addr) | |
24 | { | |
25 | pud_t *pud; | |
26 | pmd_t *pmd; | |
27 | struct page *page; | |
28 | int result = -ENOMEM; | |
29 | ||
30 | addr &= PMD_MASK; | |
31 | pgd += pgd_index(addr); | |
32 | if (!pgd_present(*pgd)) { | |
33 | page = kimage_alloc_control_pages(image, 0); | |
34 | if (!page) | |
35 | goto out; | |
36 | pud = (pud_t *)page_address(page); | |
37 | memset(pud, 0, PAGE_SIZE); | |
38 | set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); | |
39 | } | |
40 | pud = pud_offset(pgd, addr); | |
41 | if (!pud_present(*pud)) { | |
42 | page = kimage_alloc_control_pages(image, 0); | |
43 | if (!page) | |
44 | goto out; | |
45 | pmd = (pmd_t *)page_address(page); | |
46 | memset(pmd, 0, PAGE_SIZE); | |
47 | set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); | |
48 | } | |
49 | pmd = pmd_offset(pud, addr); | |
50 | if (!pmd_present(*pmd)) | |
51 | set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); | |
52 | result = 0; | |
53 | out: | |
54 | return result; | |
55 | } | |
56 | ||
8bf27556 | 57 | static void init_level2_page(pmd_t *level2p, unsigned long addr) |
5234f5eb EB |
58 | { |
59 | unsigned long end_addr; | |
72414d3f | 60 | |
5234f5eb | 61 | addr &= PAGE_MASK; |
8bf27556 | 62 | end_addr = addr + PUD_SIZE; |
72414d3f | 63 | while (addr < end_addr) { |
8bf27556 EB |
64 | set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); |
65 | addr += PMD_SIZE; | |
5234f5eb EB |
66 | } |
67 | } | |
68 | ||
8bf27556 | 69 | static int init_level3_page(struct kimage *image, pud_t *level3p, |
72414d3f | 70 | unsigned long addr, unsigned long last_addr) |
5234f5eb EB |
71 | { |
72 | unsigned long end_addr; | |
73 | int result; | |
72414d3f | 74 | |
5234f5eb EB |
75 | result = 0; |
76 | addr &= PAGE_MASK; | |
8bf27556 | 77 | end_addr = addr + PGDIR_SIZE; |
72414d3f | 78 | while ((addr < last_addr) && (addr < end_addr)) { |
5234f5eb | 79 | struct page *page; |
8bf27556 | 80 | pmd_t *level2p; |
72414d3f | 81 | |
5234f5eb EB |
82 | page = kimage_alloc_control_pages(image, 0); |
83 | if (!page) { | |
84 | result = -ENOMEM; | |
85 | goto out; | |
86 | } | |
8bf27556 | 87 | level2p = (pmd_t *)page_address(page); |
5234f5eb | 88 | init_level2_page(level2p, addr); |
8bf27556 EB |
89 | set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE)); |
90 | addr += PUD_SIZE; | |
5234f5eb EB |
91 | } |
92 | /* clear the unused entries */ | |
72414d3f | 93 | while (addr < end_addr) { |
8bf27556 EB |
94 | pud_clear(level3p++); |
95 | addr += PUD_SIZE; | |
5234f5eb EB |
96 | } |
97 | out: | |
98 | return result; | |
99 | } | |
100 | ||
101 | ||
8bf27556 | 102 | static int init_level4_page(struct kimage *image, pgd_t *level4p, |
72414d3f | 103 | unsigned long addr, unsigned long last_addr) |
5234f5eb EB |
104 | { |
105 | unsigned long end_addr; | |
106 | int result; | |
72414d3f | 107 | |
5234f5eb EB |
108 | result = 0; |
109 | addr &= PAGE_MASK; | |
8bf27556 | 110 | end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE); |
72414d3f | 111 | while ((addr < last_addr) && (addr < end_addr)) { |
5234f5eb | 112 | struct page *page; |
8bf27556 | 113 | pud_t *level3p; |
72414d3f | 114 | |
5234f5eb EB |
115 | page = kimage_alloc_control_pages(image, 0); |
116 | if (!page) { | |
117 | result = -ENOMEM; | |
118 | goto out; | |
119 | } | |
8bf27556 | 120 | level3p = (pud_t *)page_address(page); |
5234f5eb | 121 | result = init_level3_page(image, level3p, addr, last_addr); |
fef3a7a1 | 122 | if (result) |
5234f5eb | 123 | goto out; |
8bf27556 EB |
124 | set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); |
125 | addr += PGDIR_SIZE; | |
5234f5eb EB |
126 | } |
127 | /* clear the unused entries */ | |
72414d3f | 128 | while (addr < end_addr) { |
8bf27556 EB |
129 | pgd_clear(level4p++); |
130 | addr += PGDIR_SIZE; | |
5234f5eb | 131 | } |
72414d3f | 132 | out: |
5234f5eb EB |
133 | return result; |
134 | } | |
135 | ||
f5deb796 HY |
136 | static void free_transition_pgtable(struct kimage *image) |
137 | { | |
138 | free_page((unsigned long)image->arch.pud); | |
139 | free_page((unsigned long)image->arch.pmd); | |
140 | free_page((unsigned long)image->arch.pte); | |
141 | } | |
142 | ||
143 | static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) | |
144 | { | |
145 | pud_t *pud; | |
146 | pmd_t *pmd; | |
147 | pte_t *pte; | |
148 | unsigned long vaddr, paddr; | |
149 | int result = -ENOMEM; | |
150 | ||
151 | vaddr = (unsigned long)relocate_kernel; | |
152 | paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); | |
153 | pgd += pgd_index(vaddr); | |
154 | if (!pgd_present(*pgd)) { | |
155 | pud = (pud_t *)get_zeroed_page(GFP_KERNEL); | |
156 | if (!pud) | |
157 | goto err; | |
158 | image->arch.pud = pud; | |
159 | set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); | |
160 | } | |
161 | pud = pud_offset(pgd, vaddr); | |
162 | if (!pud_present(*pud)) { | |
163 | pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); | |
164 | if (!pmd) | |
165 | goto err; | |
166 | image->arch.pmd = pmd; | |
167 | set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); | |
168 | } | |
169 | pmd = pmd_offset(pud, vaddr); | |
170 | if (!pmd_present(*pmd)) { | |
171 | pte = (pte_t *)get_zeroed_page(GFP_KERNEL); | |
172 | if (!pte) | |
173 | goto err; | |
174 | image->arch.pte = pte; | |
175 | set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); | |
176 | } | |
177 | pte = pte_offset_kernel(pmd, vaddr); | |
178 | set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); | |
179 | return 0; | |
180 | err: | |
181 | free_transition_pgtable(image); | |
182 | return result; | |
183 | } | |
184 | ||
5234f5eb EB |
185 | |
186 | static int init_pgtable(struct kimage *image, unsigned long start_pgtable) | |
187 | { | |
8bf27556 | 188 | pgd_t *level4p; |
f5deb796 | 189 | int result; |
8bf27556 | 190 | level4p = (pgd_t *)__va(start_pgtable); |
f5deb796 | 191 | result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); |
53594547 HY |
192 | if (result) |
193 | return result; | |
194 | /* | |
195 | * image->start may be outside 0 ~ max_pfn, for example when | |
196 | * jump back to original kernel from kexeced kernel | |
197 | */ | |
198 | result = init_one_level2_page(image, level4p, image->start); | |
f5deb796 HY |
199 | if (result) |
200 | return result; | |
201 | return init_transition_pgtable(image, level4p); | |
5234f5eb EB |
202 | } |
203 | ||
204 | static void set_idt(void *newidt, u16 limit) | |
205 | { | |
36c4fd23 | 206 | struct desc_ptr curidt; |
5234f5eb EB |
207 | |
208 | /* x86-64 supports unaliged loads & stores */ | |
36c4fd23 EB |
209 | curidt.size = limit; |
210 | curidt.address = (unsigned long)newidt; | |
5234f5eb EB |
211 | |
212 | __asm__ __volatile__ ( | |
36c4fd23 EB |
213 | "lidtq %0\n" |
214 | : : "m" (curidt) | |
5234f5eb EB |
215 | ); |
216 | }; | |
217 | ||
218 | ||
219 | static void set_gdt(void *newgdt, u16 limit) | |
220 | { | |
36c4fd23 | 221 | struct desc_ptr curgdt; |
5234f5eb EB |
222 | |
223 | /* x86-64 supports unaligned loads & stores */ | |
36c4fd23 EB |
224 | curgdt.size = limit; |
225 | curgdt.address = (unsigned long)newgdt; | |
5234f5eb EB |
226 | |
227 | __asm__ __volatile__ ( | |
36c4fd23 EB |
228 | "lgdtq %0\n" |
229 | : : "m" (curgdt) | |
5234f5eb EB |
230 | ); |
231 | }; | |
232 | ||
233 | static void load_segments(void) | |
234 | { | |
235 | __asm__ __volatile__ ( | |
36c4fd23 EB |
236 | "\tmovl %0,%%ds\n" |
237 | "\tmovl %0,%%es\n" | |
238 | "\tmovl %0,%%ss\n" | |
239 | "\tmovl %0,%%fs\n" | |
240 | "\tmovl %0,%%gs\n" | |
2ec5e3a8 | 241 | : : "a" (__KERNEL_DS) : "memory" |
5234f5eb | 242 | ); |
5234f5eb EB |
243 | } |
244 | ||
5234f5eb EB |
245 | int machine_kexec_prepare(struct kimage *image) |
246 | { | |
4bfaaef0 | 247 | unsigned long start_pgtable; |
5234f5eb EB |
248 | int result; |
249 | ||
250 | /* Calculate the offsets */ | |
72414d3f | 251 | start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; |
5234f5eb EB |
252 | |
253 | /* Setup the identity mapped 64bit page table */ | |
254 | result = init_pgtable(image, start_pgtable); | |
72414d3f | 255 | if (result) |
5234f5eb | 256 | return result; |
5234f5eb | 257 | |
5234f5eb EB |
258 | return 0; |
259 | } | |
260 | ||
/*
 * Undo machine_kexec_prepare(): release the transition page-table pages
 * recorded in image->arch.
 */
void machine_kexec_cleanup(struct kimage *image)
{
	free_transition_pgtable(image);
}
265 | ||
266 | /* | |
267 | * Do not allocate memory (or fail in any way) in machine_kexec(). | |
268 | * We are past the point of no return, committed to rebooting now. | |
269 | */ | |
3ab83521 | 270 | void machine_kexec(struct kimage *image) |
5234f5eb | 271 | { |
4bfaaef0 MD |
272 | unsigned long page_list[PAGES_NR]; |
273 | void *control_page; | |
fee7b0d8 | 274 | int save_ftrace_enabled; |
5234f5eb | 275 | |
fee7b0d8 HY |
276 | #ifdef CONFIG_KEXEC_JUMP |
277 | if (kexec_image->preserve_context) | |
278 | save_processor_state(); | |
279 | #endif | |
280 | ||
281 | save_ftrace_enabled = __ftrace_enabled_save(); | |
f43fdad8 | 282 | |
5234f5eb EB |
283 | /* Interrupts aren't acceptable while we reboot */ |
284 | local_irq_disable(); | |
285 | ||
fee7b0d8 HY |
286 | if (image->preserve_context) { |
287 | #ifdef CONFIG_X86_IO_APIC | |
288 | /* | |
289 | * We need to put APICs in legacy mode so that we can | |
290 | * get timer interrupts in second kernel. kexec/kdump | |
291 | * paths already have calls to disable_IO_APIC() in | |
292 | * one form or other. kexec jump path also need | |
293 | * one. | |
294 | */ | |
295 | disable_IO_APIC(); | |
296 | #endif | |
297 | } | |
298 | ||
4bfaaef0 | 299 | control_page = page_address(image->control_code_page) + PAGE_SIZE; |
fee7b0d8 | 300 | memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); |
4bfaaef0 | 301 | |
e3ebadd9 | 302 | page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); |
fee7b0d8 | 303 | page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; |
4bfaaef0 MD |
304 | page_list[PA_TABLE_PAGE] = |
305 | (unsigned long)__pa(page_address(image->control_code_page)); | |
5234f5eb | 306 | |
fee7b0d8 HY |
307 | if (image->type == KEXEC_TYPE_DEFAULT) |
308 | page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) | |
309 | << PAGE_SHIFT); | |
310 | ||
fef3a7a1 HY |
311 | /* |
312 | * The segment registers are funny things, they have both a | |
2a8a3d5b EB |
313 | * visible and an invisible part. Whenever the visible part is |
314 | * set to a specific selector, the invisible part is loaded | |
315 | * with from a table in memory. At no other time is the | |
316 | * descriptor table in memory accessed. | |
5234f5eb EB |
317 | * |
318 | * I take advantage of this here by force loading the | |
319 | * segments, before I zap the gdt with an invalid value. | |
320 | */ | |
321 | load_segments(); | |
fef3a7a1 HY |
322 | /* |
323 | * The gdt & idt are now invalid. | |
5234f5eb EB |
324 | * If you want to load them you must set up your own idt & gdt. |
325 | */ | |
fef3a7a1 HY |
326 | set_gdt(phys_to_virt(0), 0); |
327 | set_idt(phys_to_virt(0), 0); | |
4bfaaef0 | 328 | |
5234f5eb | 329 | /* now call it */ |
fee7b0d8 HY |
330 | image->start = relocate_kernel((unsigned long)image->head, |
331 | (unsigned long)page_list, | |
332 | image->start, | |
333 | image->preserve_context); | |
334 | ||
335 | #ifdef CONFIG_KEXEC_JUMP | |
336 | if (kexec_image->preserve_context) | |
337 | restore_processor_state(); | |
338 | #endif | |
339 | ||
340 | __ftrace_enabled_restore(save_ftrace_enabled); | |
5234f5eb | 341 | } |
2c8c0e6b | 342 | |
fd59d231 KO |
343 | void arch_crash_save_vmcoreinfo(void) |
344 | { | |
629c8b4c | 345 | VMCOREINFO_SYMBOL(phys_base); |
69243f91 | 346 | VMCOREINFO_SYMBOL(init_level4_pgt); |
92df5c3e KO |
347 | |
348 | #ifdef CONFIG_NUMA | |
349 | VMCOREINFO_SYMBOL(node_data); | |
350 | VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); | |
351 | #endif | |
fd59d231 KO |
352 | } |
353 |