2 * linux/arch/x86-64/kernel/process.c
4 * Copyright (C) 1995 Linus Torvalds
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
12 * CPU hotplug support - ashok.raj@intel.com
13 * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
17 * This file handles the architecture-dependent parts of process handling.
22 #include <linux/cpu.h>
23 #include <linux/errno.h>
24 #include <linux/sched.h>
25 #include <linux/kernel.h>
27 #include <linux/elfcore.h>
28 #include <linux/smp.h>
29 #include <linux/slab.h>
30 #include <linux/user.h>
31 #include <linux/module.h>
32 #include <linux/a.out.h>
33 #include <linux/interrupt.h>
34 #include <linux/delay.h>
35 #include <linux/ptrace.h>
36 #include <linux/utsname.h>
37 #include <linux/random.h>
38 #include <linux/kprobes.h>
40 #include <asm/uaccess.h>
41 #include <asm/pgtable.h>
42 #include <asm/system.h>
44 #include <asm/processor.h>
46 #include <asm/mmu_context.h>
48 #include <asm/prctl.h>
49 #include <asm/kdebug.h>
51 #include <asm/proto.h>
54 asmlinkage extern void ret_from_fork(void);
56 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
58 static atomic_t hlt_counter = ATOMIC_INIT(0);
60 unsigned long boot_option_idle_override = 0;
61 EXPORT_SYMBOL(boot_option_idle_override);
64 * Power-management idle function, if any.
66 void (*pm_idle)(void);
67 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
69 void disable_hlt(void)
71 atomic_inc(&hlt_counter);
74 EXPORT_SYMBOL(disable_hlt);
78 atomic_dec(&hlt_counter);
81 EXPORT_SYMBOL(enable_hlt);
84 * We use this if we don't have any better
87 void default_idle(void)
89 if (!atomic_read(&hlt_counter)) {
99 * On SMP it's slightly faster (but much more power-consuming!)
100 * to poll the ->need_resched flag instead of waiting for the
101 * cross-CPU IPI to arrive. Use this option with caution.
103 static void poll_idle (void)
110 * Deal with another CPU just having chosen a thread to
113 oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
116 set_thread_flag(TIF_POLLING_NRFLAG);
123 "i" (_TIF_NEED_RESCHED),
124 "m" (current_thread_info()->flags));
125 clear_thread_flag(TIF_POLLING_NRFLAG);
131 void cpu_idle_wait(void)
133 unsigned int cpu, this_cpu = get_cpu();
136 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
140 for_each_online_cpu(cpu) {
141 per_cpu(cpu_idle_state, cpu) = 1;
145 __get_cpu_var(cpu_idle_state) = 0;
150 for_each_online_cpu(cpu) {
151 if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
154 cpus_and(map, map, cpu_online_map);
155 } while (!cpus_empty(map));
157 EXPORT_SYMBOL_GPL(cpu_idle_wait);
159 #ifdef CONFIG_HOTPLUG_CPU
160 DECLARE_PER_CPU(int, cpu_state);
163 /* We don't actually take CPU down, just spin without interrupts. */
164 static inline void play_dead(void)
170 __get_cpu_var(cpu_state) = CPU_DEAD;
176 static inline void play_dead(void)
180 #endif /* CONFIG_HOTPLUG_CPU */
183 * The idle thread. There's no useful work to be
184 * done, so just try to conserve power and have a
185 * low exit latency (ie sit in a loop waiting for
186 * somebody to say that they'd like to reschedule)
190 /* endless idle loop with no priority at all */
192 while (!need_resched()) {
195 if (__get_cpu_var(cpu_idle_state))
196 __get_cpu_var(cpu_idle_state) = 0;
202 if (cpu_is_offline(smp_processor_id()))
207 preempt_enable_no_resched();
214 * This uses the new MONITOR/MWAIT instructions on P4 processors with PNI,
215 * which can obviate the IPI otherwise needed to trigger checking of need_resched.
216 * We execute MONITOR against need_resched and enter an optimized wait state
217 * through MWAIT. Whenever someone changes need_resched, we are woken
218 * up from MWAIT (without an IPI).
220 static void mwait_idle(void)
224 if (!need_resched()) {
225 set_thread_flag(TIF_POLLING_NRFLAG);
227 __monitor((void *)&current_thread_info()->flags, 0, 0);
231 } while (!need_resched());
232 clear_thread_flag(TIF_POLLING_NRFLAG);
236 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
239 if (cpu_has(c, X86_FEATURE_MWAIT)) {
241 * Skip, if setup has overridden idle.
242 * One CPU supports mwait => All CPUs support mwait
246 printk("using mwait in idle threads.\n");
249 pm_idle = mwait_idle;
254 static int __init idle_setup (char *str)
256 if (!strncmp(str, "poll", 4)) {
257 printk("using polling idle threads.\n");
261 boot_option_idle_override = 1;
265 __setup("idle=", idle_setup);
267 /* Prints also some state that isn't saved in the pt_regs */
268 void __show_regs(struct pt_regs * regs)
270 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
271 unsigned int fsindex,gsindex;
272 unsigned int ds,cs,es;
276 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
277 current->pid, current->comm, print_tainted(),
278 system_utsname.release,
279 (int)strcspn(system_utsname.version, " "),
280 system_utsname.version);
281 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
282 printk_address(regs->rip);
283 printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
284 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
285 regs->rax, regs->rbx, regs->rcx);
286 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
287 regs->rdx, regs->rsi, regs->rdi);
288 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
289 regs->rbp, regs->r8, regs->r9);
290 printk("R10: %016lx R11: %016lx R12: %016lx\n",
291 regs->r10, regs->r11, regs->r12);
292 printk("R13: %016lx R14: %016lx R15: %016lx\n",
293 regs->r13, regs->r14, regs->r15);
295 asm("movl %%ds,%0" : "=r" (ds));
296 asm("movl %%cs,%0" : "=r" (cs));
297 asm("movl %%es,%0" : "=r" (es));
298 asm("movl %%fs,%0" : "=r" (fsindex));
299 asm("movl %%gs,%0" : "=r" (gsindex));
301 rdmsrl(MSR_FS_BASE, fs);
302 rdmsrl(MSR_GS_BASE, gs);
303 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
305 asm("movq %%cr0, %0": "=r" (cr0));
306 asm("movq %%cr2, %0": "=r" (cr2));
307 asm("movq %%cr3, %0": "=r" (cr3));
308 asm("movq %%cr4, %0": "=r" (cr4));
310 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
311 fs,fsindex,gs,gsindex,shadowgs);
312 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
313 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
316 void show_regs(struct pt_regs *regs)
318 printk("CPU %d:", smp_processor_id());
320 show_trace(&regs->rsp);
324 * Free current thread data structures etc..
326 void exit_thread(void)
328 struct task_struct *me = current;
329 struct thread_struct *t = &me->thread;
332 * Remove function-return probe instances associated with this task
333 * and put them back on the free list. Do not insert an exit probe for
334 * this function, it will be disabled by kprobe_flush_task if you do.
336 kprobe_flush_task(me);
338 if (me->thread.io_bitmap_ptr) {
339 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
341 kfree(t->io_bitmap_ptr);
342 t->io_bitmap_ptr = NULL;
344 * Careful, clear this in the TSS too:
346 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
347 t->io_bitmap_max = 0;
352 void flush_thread(void)
354 struct task_struct *tsk = current;
355 struct thread_info *t = current_thread_info();
358 * Remove function-return probe instances associated with this task
359 * and put them back on the free list. Do not insert an exit probe for
360 * this function, it will be disabled by kprobe_flush_task if you do.
362 kprobe_flush_task(tsk);
364 if (t->flags & _TIF_ABI_PENDING)
365 t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
367 tsk->thread.debugreg0 = 0;
368 tsk->thread.debugreg1 = 0;
369 tsk->thread.debugreg2 = 0;
370 tsk->thread.debugreg3 = 0;
371 tsk->thread.debugreg6 = 0;
372 tsk->thread.debugreg7 = 0;
373 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
375 * Forget coprocessor state..
381 void release_thread(struct task_struct *dead_task)
384 if (dead_task->mm->context.size) {
385 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
387 dead_task->mm->context.ldt,
388 dead_task->mm->context.size);
394 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
396 struct user_desc ud = {
403 struct n_desc_struct *desc = (void *)t->thread.tls_array;
405 desc->a = LDT_entry_a(&ud);
406 desc->b = LDT_entry_b(&ud);
409 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
411 struct desc_struct *desc = (void *)t->thread.tls_array;
414 (((u32)desc->base1) << 16) |
415 (((u32)desc->base2) << 24);
419 * This gets called before we allocate a new thread and copy
420 * the current task into it.
422 void prepare_to_copy(struct task_struct *tsk)
427 int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
428 unsigned long unused,
429 struct task_struct * p, struct pt_regs * regs)
432 struct pt_regs * childregs;
433 struct task_struct *me = current;
435 childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
440 childregs->rsp = rsp;
442 childregs->rsp = (unsigned long)childregs;
445 p->thread.rsp = (unsigned long) childregs;
446 p->thread.rsp0 = (unsigned long) (childregs+1);
447 p->thread.userrsp = me->thread.userrsp;
449 set_ti_thread_flag(p->thread_info, TIF_FORK);
451 p->thread.fs = me->thread.fs;
452 p->thread.gs = me->thread.gs;
454 asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
455 asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
456 asm("mov %%es,%0" : "=m" (p->thread.es));
457 asm("mov %%ds,%0" : "=m" (p->thread.ds));
459 if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
460 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
461 if (!p->thread.io_bitmap_ptr) {
462 p->thread.io_bitmap_max = 0;
465 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES);
469 * Set a new TLS for the child thread?
471 if (clone_flags & CLONE_SETTLS) {
472 #ifdef CONFIG_IA32_EMULATION
473 if (test_thread_flag(TIF_IA32))
474 err = ia32_child_tls(p, childregs);
477 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
483 if (err && p->thread.io_bitmap_ptr) {
484 kfree(p->thread.io_bitmap_ptr);
485 p->thread.io_bitmap_max = 0;
491 * This special macro can be used to load a debugging register
493 #define loaddebug(thread,r) set_debug(thread->debugreg ## r, r)
496 * switch_to(x,y) should switch tasks from x to y.
498 * This could still be optimized:
499 * - fold all the options into a flag word and test it with a single test.
500 * - could test fs/gs bitsliced
502 struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
504 struct thread_struct *prev = &prev_p->thread,
505 *next = &next_p->thread;
506 int cpu = smp_processor_id();
507 struct tss_struct *tss = &per_cpu(init_tss, cpu);
512 * Reload rsp0, LDT and the page table pointer:
514 tss->rsp0 = next->rsp0;
518 * This won't pick up thread selector changes, but I guess that is ok.
520 asm volatile("mov %%es,%0" : "=m" (prev->es));
521 if (unlikely(next->es | prev->es))
522 loadsegment(es, next->es);
524 asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
525 if (unlikely(next->ds | prev->ds))
526 loadsegment(ds, next->ds);
535 asm volatile("movl %%fs,%0" : "=r" (fsindex));
536 /* A segment register != 0 always requires a reload.
537 Also reload when it has changed.
538 When the prev process used a 64bit base, always reload
539 to avoid an information leak. */
540 if (unlikely(fsindex | next->fsindex | prev->fs)) {
541 loadsegment(fs, next->fsindex);
542 /* check if the user used a selector != 0
543 * if yes clear 64bit base, since overloaded base
544 * is always mapped to the Null selector
549 /* when next process has a 64bit base use it */
551 wrmsrl(MSR_FS_BASE, next->fs);
552 prev->fsindex = fsindex;
556 asm volatile("movl %%gs,%0" : "=r" (gsindex));
557 if (unlikely(gsindex | next->gsindex | prev->gs)) {
558 load_gs_index(next->gsindex);
563 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
564 prev->gsindex = gsindex;
568 * Switch the PDA context.
570 prev->userrsp = read_pda(oldrsp);
571 write_pda(oldrsp, next->userrsp);
572 write_pda(pcurrent, next_p);
573 write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);
576 * Now maybe reload the debug registers
578 if (unlikely(next->debugreg7)) {
590 * Handle the IO bitmap
592 if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
593 if (next->io_bitmap_ptr)
595 * Copy the relevant range of the IO bitmap.
596 * Normally this is 128 bytes or less:
598 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
599 max(prev->io_bitmap_max, next->io_bitmap_max));
602 * Clear any possible leftover bits:
604 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
612 * sys_execve() executes a new program.
615 long sys_execve(char __user *name, char __user * __user *argv,
616 char __user * __user *envp, struct pt_regs regs)
621 filename = getname(name);
622 error = PTR_ERR(filename);
623 if (IS_ERR(filename))
625 error = do_execve(filename, argv, envp, &regs);
628 current->ptrace &= ~PT_DTRACE;
629 task_unlock(current);
635 void set_personality_64bit(void)
637 /* inherit personality from parent */
639 /* Make sure to be in 64bit mode */
640 clear_thread_flag(TIF_IA32);
642 /* TBD: overwrites user setup. Should have two bits.
643 But 64bit processes have always behaved this way,
644 so it's not too bad. The main problem is just that
645 32bit children are affected again. */
646 current->personality &= ~READ_IMPLIES_EXEC;
649 asmlinkage long sys_fork(struct pt_regs *regs)
651 return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
654 asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
658 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
662 * This is trivial, and on the face of it looks like it
663 * could equally well be done in user mode.
665 * Not so, for quite unobvious reasons - register pressure.
666 * In user mode vfork() cannot have a stack frame, and if
667 * done by calling the "clone()" system call directly, you
668 * do not have enough call-clobbered registers to hold all
669 * the information you need.
671 asmlinkage long sys_vfork(struct pt_regs *regs)
673 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
677 unsigned long get_wchan(struct task_struct *p)
683 if (!p || p == current || p->state==TASK_RUNNING)
685 stack = (unsigned long)p->thread_info;
686 if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
688 fp = *(u64 *)(p->thread.rsp);
690 if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE)
692 rip = *(u64 *)(fp+8);
693 if (!in_sched_functions(rip))
696 } while (count++ < 16);
700 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
703 int doit = task == current;
708 if (addr >= TASK_SIZE_OF(task))
711 /* handle small bases via the GDT because that's faster to
713 if (addr <= 0xffffffff) {
714 set_32bit_tls(task, GS_TLS, addr);
716 load_TLS(&task->thread, cpu);
717 load_gs_index(GS_TLS_SEL);
719 task->thread.gsindex = GS_TLS_SEL;
722 task->thread.gsindex = 0;
723 task->thread.gs = addr;
726 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
732 /* Not strictly needed for fs, but do it for symmetry
734 if (addr >= TASK_SIZE_OF(task))
737 /* handle small bases via the GDT because that's faster to
739 if (addr <= 0xffffffff) {
740 set_32bit_tls(task, FS_TLS, addr);
742 load_TLS(&task->thread, cpu);
743 asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL));
745 task->thread.fsindex = FS_TLS_SEL;
748 task->thread.fsindex = 0;
749 task->thread.fs = addr;
751 /* set the selector to 0 to not confuse
753 asm volatile("movl %0,%%fs" :: "r" (0));
754 ret = checking_wrmsrl(MSR_FS_BASE, addr);
761 if (task->thread.fsindex == FS_TLS_SEL)
762 base = read_32bit_tls(task, FS_TLS);
764 rdmsrl(MSR_FS_BASE, base);
766 base = task->thread.fs;
767 ret = put_user(base, (unsigned long __user *)addr);
772 if (task->thread.gsindex == GS_TLS_SEL)
773 base = read_32bit_tls(task, GS_TLS);
775 rdmsrl(MSR_KERNEL_GS_BASE, base);
777 base = task->thread.gs;
778 ret = put_user(base, (unsigned long __user *)addr);
790 long sys_arch_prctl(int code, unsigned long addr)
792 return do_arch_prctl(current, code, addr);
796 * Capture the user space registers if the task is not running (in user space)
798 int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
800 struct pt_regs *pp, ptregs;
802 pp = (struct pt_regs *)(tsk->thread.rsp0);
809 elf_core_copy_regs(regs, &ptregs);
814 unsigned long arch_align_stack(unsigned long sp)
816 if (randomize_va_space)
817 sp -= get_random_int() % 8192;