/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */
#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}

void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (test_and_clear_bit_pda(0, isidle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
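/*
 * Illustrative registration (a sketch, not part of this file): a
 * subsystem that wants to know when this CPU enters or leaves idle can
 * hook the chain above.  my_idle_cb(), my_idle_nb and the accounting
 * helpers are hypothetical names.
 *
 *	static int my_idle_cb(struct notifier_block *nb,
 *			      unsigned long action, void *unused)
 *	{
 *		if (action == IDLE_START)
 *			start_accounting_idle_time();
 *		else if (action == IDLE_END)
 *			stop_accounting_idle_time();
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call = my_idle_cb,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 */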
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	/* mask all interrupts, flush any and all caches, and halt */
	wbinvd_halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
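/*
 * Note: TS_POLLING (set at the top of cpu_idle()) tells remote CPUs
 * that this task polls need_resched() instead of sleeping with
 * interrupts off, so a waker can set TIF_NEED_RESCHED and skip the
 * reschedule IPI; the inner loop above then notices the flag on its
 * own.
 */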
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->sp,
		regs->flags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}
/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
#ifdef CONFIG_X86_DS
	/* Free any DS contexts that have not been properly released. */
	if (unlikely(t->ds_ctx)) {
		/* we clear debugctl to make sure DS is not used. */
		update_debugctlmsr(0);
		ds_free(t->ds_ctx);
	}
#endif /* CONFIG_X86_DS */
}
void flush_thread(void)
{
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state.
	 */
	tsk->fpu_counter = 0;
	clear_fpu(tsk);
	clear_used_math();
}
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}
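/*
 * These two helpers back the "small base via the GDT" fast path in
 * do_arch_prctl() below: an fs/gs base below 4GB is planted in one of
 * the thread's TLS descriptors, so switching it later only takes a
 * segment-selector reload instead of a (slower) MSR write.
 */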
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
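/*
 * Sketch of the child's kernel stack as set up above: the pt_regs copy
 * sits at the very top of the THREAD_SIZE area, and the child returns
 * to user mode through it via ret_from_fork.
 *
 *	task_stack_page(p)              childregs        p->thread.sp0
 *	|                               |                |
 *	v                               v                v
 *	+-------------------------------+----------------+
 *	| thread_info, kernel stack ... | struct pt_regs |
 *	+-------------------------------+----------------+
 *	<------------------- THREAD_SIZE ---------------->
 */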
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
	load_gs_index(0);
	regs->ip = new_ip;
	regs->sp = new_sp;
	write_pda(oldrsp, new_sp);
	regs->cs = __USER_CS;
	regs->ss = __USER_DS;
	regs->flags = 0x200;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);
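/*
 * start_thread() runs at exec time: the selector and GS reloads drop
 * any segment state inherited from the old image, and ip/sp are aimed
 * at the new binary's entry point and initial user stack.
 */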
static void hard_disable_TSC(void)
{
	write_cr4(read_cr4() | X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	write_cr4(read_cr4() & ~X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}

int get_tsc_mode(unsigned long adr)
{
	unsigned int val = PR_TSC_ENABLE;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}
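/*
 * Illustrative use from userspace (a sketch, not part of this file),
 * reached via the PR_GET_TSC/PR_SET_TSC prctl commands:
 *
 *	#include <sys/prctl.h>
 *
 *	prctl(PR_SET_TSC, PR_TSC_SIGSEGV);	// rdtsc now faults
 *	prctl(PR_SET_TSC, PR_TSC_ENABLE);	// rdtsc allowed again
 *
 * The setting is per-thread; __switch_to_xtra() below flips CR4.TSD
 * whenever two tasks that disagree on TIF_NOTSC are switched.
 */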
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
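/*
 * For example, loaddebug(next, 7) expands to
 * set_debugreg(next->debugreg7, 7).
 */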
static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;
	unsigned long debugctl;

	prev = &prev_p->thread;
	next = &next_p->thread;

	debugctl = prev->debugctlmsr;

#ifdef CONFIG_X86_DS
	{
		unsigned long ds_prev = 0, ds_next = 0;

		if (prev->ds_ctx)
			ds_prev = (unsigned long)prev->ds_ctx->ds;
		if (next->ds_ctx)
			ds_next = (unsigned long)next->ds_ctx->ds;

		if (ds_next != ds_prev) {
			/*
			 * We clear debugctl to make sure DS
			 * is not in use when we change it:
			 */
			debugctl = 0;
			update_debugctlmsr(0);
			wrmsrl(MSR_IA32_DS_AREA, ds_next);
		}
	}
#endif /* CONFIG_X86_DS */

	if (next->debugctlmsr != debugctl)
		update_debugctlmsr(next->debugctlmsr);

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}

#ifdef CONFIG_X86_PTRACE_BTS
	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);

	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
#endif /* CONFIG_X86_PTRACE_BTS */
}
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_leave_lazy_cpu_mode();

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload. Also
	 * reload when it has changed. When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear 64bit base, since overloaded base is always
		 * mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = read_pda(oldrsp);
	write_pda(oldrsp, next->usersp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 *
	 * tsk_used_math() checks prevent calling math_state_restore(),
	 * which can sleep in the case of !tsk_used_math()
	 */
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}
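/*
 * Note: once write_pda(pcurrent, next_p) has executed above, the
 * PDA-based current macro already resolves to next_p, so the tail of
 * __switch_to() deliberately uses the explicit prev_p/next_p pointers
 * rather than current.
 */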
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		       NULL, NULL);
}
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
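/*
 * Frame layout assumed by the walk above: with frame pointers enabled,
 * *fp holds the caller's saved frame pointer and *(fp + 8) the return
 * address, so the loop climbs the frame-pointer chain until it finds a
 * return address outside the scheduler (the task's wait channel).
 */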
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
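/*
 * Illustrative call from userspace (a sketch, not part of this file);
 * glibc provides no wrapper, so arch_prctl is typically reached via
 * syscall(2):
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, &base);
 */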
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}
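/*
 * Example: with randomization enabled, the stack top is lowered by
 * 0..8191 bytes and then rounded down to a 16-byte boundary by the
 * mask above, which keeps the 16-byte stack alignment the x86-64 ABI
 * expects.
 */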
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}