source: src/linux/universal/linux-3.2/arch/x86/kernel/process_64.c @ 18171

Last change on this file since 18171 was 18171, checked in by BrainSlayer, 17 months ago

This kernel will be maintained for all targets, so target-specific kernel trees will no longer be necessary in the future.

File size: 16.1 KB
Line 
1/*
2 *  Copyright (C) 1995  Linus Torvalds
3 *
4 *  Pentium III FXSR, SSE support
5 *      Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 *  X86-64 port
8 *      Andi Kleen.
9 *
10 *      CPU hotplug support - ashok.raj@intel.com
11 */
12
13/*
14 * This file handles the architecture-dependent parts of process handling.
15 */
16
17#include <linux/stackprotector.h>
18#include <linux/cpu.h>
19#include <linux/errno.h>
20#include <linux/sched.h>
21#include <linux/fs.h>
22#include <linux/kernel.h>
23#include <linux/mm.h>
24#include <linux/elfcore.h>
25#include <linux/smp.h>
26#include <linux/slab.h>
27#include <linux/user.h>
28#include <linux/interrupt.h>
29#include <linux/delay.h>
30#include <linux/module.h>
31#include <linux/ptrace.h>
32#include <linux/notifier.h>
33#include <linux/kprobes.h>
34#include <linux/kdebug.h>
35#include <linux/tick.h>
36#include <linux/prctl.h>
37#include <linux/uaccess.h>
38#include <linux/io.h>
39#include <linux/ftrace.h>
40#include <linux/cpuidle.h>
41
42#include <asm/pgtable.h>
43#include <asm/system.h>
44#include <asm/processor.h>
45#include <asm/i387.h>
46#include <asm/mmu_context.h>
47#include <asm/prctl.h>
48#include <asm/desc.h>
49#include <asm/proto.h>
50#include <asm/ia32.h>
51#include <asm/idle.h>
52#include <asm/syscalls.h>
53#include <asm/debugreg.h>
54#include <asm/nmi.h>
55
/* Declared extern here; the definition is not in this file. */
asmlinkage extern void ret_from_fork(void);

/*
 * Per-CPU value written with the new stack pointer in start_thread_common()
 * and saved to / loaded from thread.usersp in __switch_to().
 */
DEFINE_PER_CPU(unsigned long, old_rsp);
/* Per-CPU flag: set by enter_idle(), test-and-cleared by __exit_idle(). */
static DEFINE_PER_CPU(unsigned char, is_idle);

/* Notifier chain called with IDLE_START and IDLE_END events. */
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
62
/*
 * Add @n to the idle notifier chain. Callbacks on the chain are invoked
 * with IDLE_START from enter_idle() and with IDLE_END from __exit_idle().
 */
void idle_notifier_register(struct notifier_block *n)
{
        atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);
68
/* Remove @n from the idle notifier chain. */
void idle_notifier_unregister(struct notifier_block *n)
{
        atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
74
/*
 * Mark this CPU as idle and notify the idle chain with IDLE_START.
 * The flag is set before the chain runs; __exit_idle() clears it again.
 */
void enter_idle(void)
{
        percpu_write(is_idle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
80
81static void __exit_idle(void)
82{
83        if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
84                return;
85        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
86}
87
88/* Called from interrupts to signify idle end */
89void exit_idle(void)
90{
91        /* idle loop has pid 0 */
92        if (current->pid)
93                return;
94        __exit_idle();
95}
96
#ifndef CONFIG_SMP
/*
 * Stub used when CONFIG_SMP is not set. cpu_idle() calls play_dead() only
 * when the current CPU is reported offline; reaching this stub is a BUG().
 */
static inline void play_dead(void)
{
        BUG();
}
#endif
103
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 *
 * This function never returns.
 */
void cpu_idle(void)
{
        current_thread_info()->status |= TS_POLLING;

        /*
         * If we're the non-boot CPU, nothing set the stack canary up
         * for us.  CPU0 already has it initialized but no harm in
         * doing it again.  This is a good place for updating it, as
         * we wont ever return from this function (so the invalid
         * canaries already on the stack wont ever trigger).
         */
        boot_init_stack_canary();

        /* endless idle loop with no priority at all */
        while (1) {
                tick_nohz_stop_sched_tick(1);
                /* Inner loop: stay idle until a reschedule is requested. */
                while (!need_resched()) {

                        rmb();

                        if (cpu_is_offline(smp_processor_id()))
                                play_dead();
                        /*
                         * Idle routines should keep interrupts disabled
                         * from here on, until they go to idle.
                         * Otherwise, idle callbacks can misfire.
                         */
                        local_touch_nmi();
                        local_irq_disable();
                        enter_idle();
                        /* Don't trace irqs off for idle */
                        stop_critical_timings();
                        /* Fall back to pm_idle() if cpuidle returns non-zero. */
                        if (cpuidle_idle_call())
                                pm_idle();
                        start_critical_timings();

                        /* In many cases the interrupt that ended idle
                           has already called exit_idle. But some idle
                           loops can be woken up without interrupt. */
                        __exit_idle();
                }

                /*
                 * A reschedule was requested: restart the tick and call
                 * schedule() with preemption enabled, then disable
                 * preemption again before going back to idling.
                 */
                tick_nohz_restart_sched_tick();
                preempt_enable_no_resched();
                schedule();
                preempt_disable();
        }
}
158
159/* Prints also some state that isn't saved in the pt_regs */
160void __show_regs(struct pt_regs *regs, int all)
161{
162        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
163        unsigned long d0, d1, d2, d3, d6, d7;
164        unsigned int fsindex, gsindex;
165        unsigned int ds, cs, es;
166
167        show_regs_common();
168        printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
169        printk_address(regs->ip, 1);
170        printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
171                        regs->sp, regs->flags);
172        printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
173               regs->ax, regs->bx, regs->cx);
174        printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
175               regs->dx, regs->si, regs->di);
176        printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
177               regs->bp, regs->r8, regs->r9);
178        printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
179               regs->r10, regs->r11, regs->r12);
180        printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
181               regs->r13, regs->r14, regs->r15);
182
183        asm("movl %%ds,%0" : "=r" (ds));
184        asm("movl %%cs,%0" : "=r" (cs));
185        asm("movl %%es,%0" : "=r" (es));
186        asm("movl %%fs,%0" : "=r" (fsindex));
187        asm("movl %%gs,%0" : "=r" (gsindex));
188
189        rdmsrl(MSR_FS_BASE, fs);
190        rdmsrl(MSR_GS_BASE, gs);
191        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
192
193        if (!all)
194                return;
195
196        cr0 = read_cr0();
197        cr2 = read_cr2();
198        cr3 = read_cr3();
199        cr4 = read_cr4();
200
201        printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
202               fs, fsindex, gs, gsindex, shadowgs);
203        printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
204                        es, cr0);
205        printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
206                        cr4);
207
208        get_debugreg(d0, 0);
209        get_debugreg(d1, 1);
210        get_debugreg(d2, 2);
211        printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
212        get_debugreg(d3, 3);
213        get_debugreg(d6, 6);
214        get_debugreg(d7, 7);
215        printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
216}
217
218void release_thread(struct task_struct *dead_task)
219{
220        if (dead_task->mm) {
221                if (dead_task->mm->context.size) {
222                        printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
223                                        dead_task->comm,
224                                        dead_task->mm->context.ldt,
225                                        dead_task->mm->context.size);
226                        BUG();
227                }
228        }
229}
230
231static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
232{
233        struct user_desc ud = {
234                .base_addr = addr,
235                .limit = 0xfffff,
236                .seg_32bit = 1,
237                .limit_in_pages = 1,
238                .useable = 1,
239        };
240        struct desc_struct *desc = t->thread.tls_array;
241        desc += tls;
242        fill_ldt(desc, &ud);
243}
244
245static inline u32 read_32bit_tls(struct task_struct *t, int tls)
246{
247        return get_desc_base(&t->thread.tls_array[tls]);
248}
249
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 *
 * The only work done here is the unlazy_fpu() call on @tsk.
 */
void prepare_to_copy(struct task_struct *tsk)
{
        unlazy_fpu(tsk);
}
258
259int copy_thread(unsigned long clone_flags, unsigned long sp,
260                unsigned long unused,
261        struct task_struct *p, struct pt_regs *regs)
262{
263        int err;
264        struct pt_regs *childregs;
265        struct task_struct *me = current;
266
267        childregs = ((struct pt_regs *)
268                        (THREAD_SIZE + task_stack_page(p))) - 1;
269        *childregs = *regs;
270
271        childregs->ax = 0;
272        if (user_mode(regs))
273                childregs->sp = sp;
274        else
275                childregs->sp = (unsigned long)childregs;
276
277        p->thread.sp = (unsigned long) childregs;
278        p->thread.sp0 = (unsigned long) (childregs+1);
279        p->thread.usersp = me->thread.usersp;
280
281        set_tsk_thread_flag(p, TIF_FORK);
282
283        p->thread.io_bitmap_ptr = NULL;
284
285        savesegment(gs, p->thread.gsindex);
286        p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
287        savesegment(fs, p->thread.fsindex);
288        p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
289        savesegment(es, p->thread.es);
290        savesegment(ds, p->thread.ds);
291
292        err = -ENOMEM;
293        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
294
295        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
296                p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
297                if (!p->thread.io_bitmap_ptr) {
298                        p->thread.io_bitmap_max = 0;
299                        return -ENOMEM;
300                }
301                memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
302                                IO_BITMAP_BYTES);
303                set_tsk_thread_flag(p, TIF_IO_BITMAP);
304        }
305
306        /*
307         * Set a new TLS for the child thread?
308         */
309        if (clone_flags & CLONE_SETTLS) {
310#ifdef CONFIG_IA32_EMULATION
311                if (test_thread_flag(TIF_IA32))
312                        err = do_set_thread_area(p, -1,
313                                (struct user_desc __user *)childregs->si, 0);
314                else
315#endif
316                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
317                if (err)
318                        goto out;
319        }
320        err = 0;
321out:
322        if (err && p->thread.io_bitmap_ptr) {
323                kfree(p->thread.io_bitmap_ptr);
324                p->thread.io_bitmap_max = 0;
325        }
326
327        return err;
328}
329
/*
 * Common part of start_thread() and start_thread_ia32(): reset the data
 * segment registers and point the register frame @regs at a new program.
 *
 * @regs:   register frame to update
 * @new_ip: instruction pointer stored in @regs
 * @new_sp: stack pointer stored in @regs and in the per-CPU old_rsp
 * @_cs:    code segment selector stored in @regs
 * @_ss:    stack segment selector stored in @regs
 * @_ds:    selector loaded into both %ds and %es
 */
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
                    unsigned long new_sp,
                    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
        /* %fs and %gs are reset to the null selector. */
        loadsegment(fs, 0);
        loadsegment(es, _ds);
        loadsegment(ds, _ds);
        load_gs_index(0);
        regs->ip                = new_ip;
        regs->sp                = new_sp;
        percpu_write(old_rsp, new_sp);
        regs->cs                = _cs;
        regs->ss                = _ss;
        /* Only the interrupt-enable flag is set in the new flags value. */
        regs->flags             = X86_EFLAGS_IF;
        /*
         * Free the old FP and other extended state
         */
        free_thread_xstate(current);
}
350
/*
 * Set up @regs to start executing at @new_ip with stack @new_sp, using
 * __USER_CS for the code segment, __USER_DS for the stack segment and the
 * null selector for %ds/%es.
 */
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            __USER_CS, __USER_DS, 0);
}
357
#ifdef CONFIG_IA32_EMULATION
/*
 * IA32 emulation variant of start_thread(): uses __USER32_CS for the code
 * segment and __USER32_DS for the stack segment and for %ds/%es.
 */
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            __USER32_CS, __USER32_DS, __USER32_DS);
}
#endif
365
/*
 *      switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer not supported too.
 *
 * @prev_p: task being switched out
 * @next_p: task being switched in
 *
 * Returns @prev_p.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread;
        struct thread_struct *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
        unsigned fsindex, gsindex;
        bool preload_fpu;

        /*
         * If the task has used fpu the last 5 timeslices, just do a full
         * restore of the math state immediately to avoid the trap; the
         * chances of needing FPU soon are obviously high now
         */
        preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

        /* we're going to use this soon, after a few expensive things */
        if (preload_fpu)
                prefetch(next->fpu.state);

        /*
         * Reload sp0 for the next task (only load_sp0() is called here).
         */
        load_sp0(tss, next);

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         */
        savesegment(es, prev->es);
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        savesegment(ds, prev->ds);
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);


        /* We must save %fs and %gs before load_TLS() because
         * %fs and %gs may be cleared by load_TLS().
         *
         * (e.g. xen_load_tls())
         */
        savesegment(fs, fsindex);
        savesegment(gs, gsindex);

        load_TLS(next, cpu);

        /* Must be after DS reload */
        __unlazy_fpu(prev_p);

        /* Make sure cpu is ready for new context */
        if (preload_fpu)
                clts();

        /*
         * Leave lazy mode, flushing any hypercalls made here.
         * This must be done before restoring TLS segments so
         * the GDT and LDT are properly updated, and must be
         * done before math_state_restore, so the TS bit is up
         * to date.
         */
        arch_end_context_switch(next_p);

        /*
         * Switch FS and GS.
         *
         * Segment register != 0 always requires a reload.  Also
         * reload when it has changed.  When prev process used 64bit
         * base always reload to avoid an information leak.
         */
        if (unlikely(fsindex | next->fsindex | prev->fs)) {
                loadsegment(fs, next->fsindex);
                /*
                 * Check if the user used a selector != 0; if yes
                 *  clear 64bit base, since overloaded base is always
                 *  mapped to the Null selector
                 */
                if (fsindex)
                        prev->fs = 0;
        }
        /* when next process has a 64bit base use it */
        if (next->fs)
                wrmsrl(MSR_FS_BASE, next->fs);
        prev->fsindex = fsindex;

        /* Same procedure for GS; the base is written to MSR_KERNEL_GS_BASE. */
        if (unlikely(gsindex | next->gsindex | prev->gs)) {
                load_gs_index(next->gsindex);
                if (gsindex)
                        prev->gs = 0;
        }
        if (next->gs)
                wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
        prev->gsindex = gsindex;

        /*
         * Switch the per-CPU state: save/restore old_rsp via thread.usersp,
         * then publish the new current_task and kernel_stack values.
         */
        prev->usersp = percpu_read(old_rsp);
        percpu_write(old_rsp, next->usersp);
        percpu_write(current_task, next_p);

        percpu_write(kernel_stack,
                  (unsigned long)task_stack_page(next_p) +
                  THREAD_SIZE - KERNEL_STACK_OFFSET);

        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         */
        if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
                     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
                __switch_to_xtra(prev_p, next_p, tss);

        /*
         * Preload the FPU context, now that we've determined that the
         * task is likely to be using it.
         */
        if (preload_fpu)
                __math_state_restore();

        return prev_p;
}
499
500void set_personality_64bit(void)
501{
502        /* inherit personality from parent */
503
504        /* Make sure to be in 64bit mode */
505        clear_thread_flag(TIF_IA32);
506
507        /* Ensure the corresponding mm is not marked. */
508        if (current->mm)
509                current->mm->context.ia32_compat = 0;
510
511        /* TBD: overwrites user setup. Should have two bits.
512           But 64bit processes have always behaved this way,
513           so it's not too bad. The main problem is just that
514           32bit childs are affected again. */
515        current->personality &= ~READ_IMPLIES_EXEC;
516}
517
518void set_personality_ia32(void)
519{
520        /* inherit personality from parent */
521
522        /* Make sure to be in 32bit mode */
523        set_thread_flag(TIF_IA32);
524        current->personality |= force_personality32;
525
526        /* Mark the associated mm as containing 32-bit tasks. */
527        if (current->mm)
528                current->mm->context.ia32_compat = 1;
529
530        /* Prepare the first "return" to user space */
531        current_thread_info()->status |= TS_COMPAT;
532}
533
534unsigned long get_wchan(struct task_struct *p)
535{
536        unsigned long stack;
537        u64 fp, ip;
538        int count = 0;
539
540        if (!p || p == current || p->state == TASK_RUNNING)
541                return 0;
542        stack = (unsigned long)task_stack_page(p);
543        if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
544                return 0;
545        fp = *(u64 *)(p->thread.sp);
546        do {
547                if (fp < (unsigned long)stack ||
548                    fp >= (unsigned long)stack+THREAD_SIZE)
549                        return 0;
550                ip = *(u64 *)(fp+8);
551                if (!in_sched_functions(ip))
552                        return ip;
553                fp = *(u64 *)fp;
554        } while (count++ < 16);
555        return 0;
556}
557
/*
 * Get or set the FS/GS base of @task.
 *
 * @task: task to operate on; segment registers and MSRs are only touched
 *        when @task is the current task
 * @code: ARCH_SET_GS, ARCH_SET_FS, ARCH_GET_FS or ARCH_GET_GS
 * @addr: the new base for the SET codes; for the GET codes, the user
 *        address that receives the base
 *
 * Returns 0 on success, -EPERM if a new base is not below
 * TASK_SIZE_OF(task), -EINVAL for an unknown @code, or the result of
 * checking_wrmsrl() / put_user().
 */
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        /* Only touch live CPU state when operating on ourselves. */
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        /* Large base: null selector plus the base MSR. */
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /* Not strictly needed for fs, but do it for symmetry
                   with gs */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                loadsegment(fs, FS_TLS_SEL);
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /* set the selector to 0 to not confuse
                                   __switch_to */
                                loadsegment(fs, 0);
                                ret = checking_wrmsrl(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                /*
                 * Base comes from the TLS descriptor, the live MSR (current
                 * task) or the saved thread.fs value, in that order.
                 */
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        /*
                         * For the current task the MSR is read only when
                         * the live %gs selector is non-zero; otherwise the
                         * saved thread.gs value is reported.
                         */
                        savesegment(gs, gsindex);
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}
652
/*
 * arch_prctl entry point: applies do_arch_prctl() to the current task and
 * returns its result.
 */
long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}
657
658unsigned long KSTK_ESP(struct task_struct *task)
659{
660        return (test_tsk_thread_flag(task, TIF_IA32)) ?
661                        (task_pt_regs(task)->sp) : ((task)->thread.usersp);
662}
Note: See TracBrowser for help on using the repository browser.