From 7d8d8cfdee9a7bd6f9682f253fa98efdd8048a9e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 20 Nov 2019 23:06:41 -0800 Subject: [PATCH] x86/doublefault/32: Rewrite the x86_32 #DF handler and unify with 64-bit The old x86_32 doublefault_fn() was old and crufty, and it did not even try to recover. do_double_fault() is much nicer. Rewrite the 32-bit double fault code to sanitize CPU state and call do_double_fault(). This is mostly an exercise i386 archaeology. With this patch applied, 32-bit double faults get a real stack trace, just like 64-bit double faults. [ mingo: merged the patch to a later kernel base. ] Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 42 +++++++++++++++ arch/x86/include/asm/traps.h | 3 ++ arch/x86/kernel/doublefault_32.c | 107 +++++++++++++++++++++++++++------------ arch/x86/kernel/traps.c | 19 ++++++- 4 files changed, 138 insertions(+), 33 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 5832b11f01bb..632432bb723d 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1537,6 +1537,48 @@ SYM_CODE_START(debug) jmp common_exception SYM_CODE_END(debug) +#ifdef CONFIG_DOUBLEFAULT +SYM_CODE_START(double_fault) +1: + /* + * This is a task gate handler, not an interrupt gate handler. + * The error code is on the stack, but the stack is otherwise + * empty. Interrupts are off. Our state is sane with the following + * exceptions: + * + * - CR0.TS is set. "TS" literally means "task switched". + * - EFLAGS.NT is set because we're a "nested task". + * - The doublefault TSS has back_link set and has been marked busy. + * - TR points to the doublefault TSS and the normal TSS is busy. + * - CR3 is the normal kernel PGD. This would be delightful, except + * that the CPU didn't bother to save the old CR3 anywhere. This + * would make it very awkward to return back to the context we came + * from. + * + * The rest of EFLAGS is sanitized for us, so we don't need to + * worry about AC or DF. + * + * Don't even bother popping the error code. It's always zero, + * and ignoring it makes us a bit more robust against buggy + * hypervisor task gate implementations. + * + * We will manually undo the task switch instead of doing a + * task-switching IRET. + */ + + clts /* clear CR0.TS */ + pushl $X86_EFLAGS_FIXED + popfl /* clear EFLAGS.NT */ + + call doublefault_shim + + /* We don't support returning, so we have no IRET here. */ +1: + hlt + jmp 1b +SYM_CODE_END(double_fault) +#endif + /* * NMI is doubly nasty. It can happen on the first instruction of * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index b25e633033c3..ffa0dc8a535e 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -69,6 +69,9 @@ dotraplinkage void do_overflow(struct pt_regs *regs, long error_code); dotraplinkage void do_bounds(struct pt_regs *regs, long error_code); dotraplinkage void do_invalid_op(struct pt_regs *regs, long error_code); dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code); +#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT) +dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2); +#endif dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *regs, long error_code); dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code); dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code); diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index 4eecfe4825ed..3793646f0fb5 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -9,42 +9,83 @@ #include #include #include +#include +extern void double_fault(void); #define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM) -static void doublefault_fn(void) -{ - struct desc_ptr gdt_desc = {0, 0}; - unsigned long gdt, tss; +#define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x) - BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE); +static void set_df_gdt_entry(unsigned int cpu); - native_store_gdt(&gdt_desc); - gdt = gdt_desc.address; +/* + * Called by double_fault with CR0.TS and EFLAGS.NT cleared. The CPU thinks + * we're running the doublefault task. Cannot return. + */ +asmlinkage notrace void __noreturn doublefault_shim(void) +{ + unsigned long cr2; + struct pt_regs regs; - printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size); + BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE); - if (ptr_ok(gdt)) { - gdt += GDT_ENTRY_TSS << 3; - tss = get_desc_base((struct desc_struct *)gdt); - printk(KERN_EMERG "double fault, tss at %08lx\n", tss); + cr2 = native_read_cr2(); - if (ptr_ok(tss)) { - struct x86_hw_tss *t = (struct x86_hw_tss *)tss; + /* Reset back to the normal kernel task. */ + force_reload_TR(); + set_df_gdt_entry(smp_processor_id()); - printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", - t->ip, t->sp); + trace_hardirqs_off(); - printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", - t->ax, t->bx, t->cx, t->dx); - printk(KERN_EMERG "esi = %08lx, edi = %08lx\n", - t->si, t->di); - } - } + /* + * Fill in pt_regs. A downside of doing this in C is that the unwinder + * won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump + * won't successfully unwind to the source of the double fault. + * The main dump from do_double_fault() is fine, though, since it + * uses these regs directly. + * + * If anyone ever cares, this could be moved to asm. + */ + regs.ss = TSS(ss); + regs.__ssh = 0; + regs.sp = TSS(sp); + regs.flags = TSS(flags); + regs.cs = TSS(cs); + /* We won't go through the entry asm, so we can leave __csh as 0. */ + regs.__csh = 0; + regs.ip = TSS(ip); + regs.orig_ax = 0; + regs.gs = TSS(gs); + regs.__gsh = 0; + regs.fs = TSS(fs); + regs.__fsh = 0; + regs.es = TSS(es); + regs.__esh = 0; + regs.ds = TSS(ds); + regs.__dsh = 0; + regs.ax = TSS(ax); + regs.bp = TSS(bp); + regs.di = TSS(di); + regs.si = TSS(si); + regs.dx = TSS(dx); + regs.cx = TSS(cx); + regs.bx = TSS(bx); + + do_double_fault(®s, 0, cr2); - for (;;) - cpu_relax(); + /* + * x86_32 does not save the original CR3 anywhere on a task switch. + * This means that, even if we wanted to return, we would need to find + * some way to reconstruct CR3. We could make a credible guess based + * on cpu_tlbstate, but that would be racy and would not account for + * PTI. + * + * Instead, don't bother. We can return through + * rewind_stack_do_exit() instead. + */ + panic("cannot return from double fault\n"); } +NOKPROBE_SYMBOL(doublefault_shim); DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { .tss = { @@ -55,9 +96,8 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { .ldt = 0, .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, - .ip = (unsigned long) doublefault_fn, - /* 0x2 bit is always set */ - .flags = X86_EFLAGS_SF | 0x2, + .ip = (unsigned long) double_fault, + .flags = X86_EFLAGS_FIXED, .es = __USER_DS, .cs = __KERNEL_CS, .ss = __KERNEL_DS, @@ -71,6 +111,14 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { }, }; +static void set_df_gdt_entry(unsigned int cpu) +{ + /* Set up doublefault TSS pointer in the GDT */ + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, + &get_cpu_entry_area(cpu)->doublefault_stack.tss); + +} + void doublefault_init_cpu_tss(void) { unsigned int cpu = smp_processor_id(); @@ -84,8 +132,5 @@ void doublefault_init_cpu_tss(void) (unsigned long)&cea->doublefault_stack.stack + sizeof(doublefault_stack.stack)); - /* Set up doublefault TSS pointer in the GDT */ - __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, - &get_cpu_entry_area(cpu)->doublefault_stack.tss); - + set_df_gdt_entry(cpu); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 76381b04dc93..a9b16c3a933d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -306,8 +306,23 @@ __visible void __noreturn handle_stack_overflow(const char *message, } #endif -#ifdef CONFIG_X86_64 -/* Runs on IST stack */ +#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT) +/* + * Runs on an IST stack for x86_64 and on a special task stack for x86_32. + * + * On x86_64, this is more or less a normal kernel entry. Notwithstanding the + * SDM's warnings about double faults being unrecoverable, returning works as + * expected. Presumably what the SDM actually means is that the CPU may get + * the register state wrong on entry, so returning could be a bad idea. + * + * Various CPU engineers have promised that double faults due to an IRET fault + * while the stack is read-only are, in fact, recoverable. + * + * On x86_32, this is entered through a task gate, and regs are synthesized + * from the TSS. Returning is, in principle, okay, but changes to regs will + * be lost. If, for some reason, we need to return to a context with modified + * regs, the shim code could be adjusted to synchronize the registers. + */ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2) { static const char str[] = "double fault"; -- 2.11.0