KAISER: Kernel Address Isolation

author Richard Fellner <richard.fellner@student.tugraz.at>

Thu, 4 May 2017 12:26:50 +0000 (14:26 +0200)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Fri, 5 Jan 2018 14:44:23 +0000 (15:44 +0100)
author Richard Fellner <richard.fellner@student.tugraz.at>
Thu, 4 May 2017 12:26:50 +0000 (14:26 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 5 Jan 2018 14:44:23 +0000 (15:44 +0100)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S

index cc0f2f5..273bc5b 100644 (file)
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -35,6 +35,7 @@
  #include <asm/asm.h>
  #include <asm/smap.h>
  #include <asm/pgtable_types.h>
+#include <asm/kaiser.h>
  #include <linux/err.h>
  
  /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
@@ -135,6 +136,7 @@ ENTRY(entry_SYSCALL_64)
          * it is too small to ever cause noticeable irq latency.
          */
         SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
         /*
          * A hypervisor implementation might want to use a label
          * after the swapgs, so that it can do the swapgs
@@ -207,9 +209,10 @@ entry_SYSCALL_64_fastpath:
         testl   $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
         jnz     int_ret_from_sys_call_irqs_off  /* Go to the slow path */
  
-       RESTORE_C_REGS_EXCEPT_RCX_R11
         movq    RIP(%rsp), %rcx
         movq    EFLAGS(%rsp), %r11
+       RESTORE_C_REGS_EXCEPT_RCX_R11
+       SWITCH_USER_CR3
         movq    RSP(%rsp), %rsp
         /*
          * 64-bit SYSRET restores rip from rcx,
@@ -347,10 +350,12 @@ GLOBAL(int_ret_from_sys_call)
  syscall_return_via_sysret:
         /* rcx and r11 are already restored (see code above) */
         RESTORE_C_REGS_EXCEPT_RCX_R11
+       SWITCH_USER_CR3
         movq    RSP(%rsp), %rsp
         USERGS_SYSRET64
  
  opportunistic_sysret_failed:
+       SWITCH_USER_CR3
         SWAPGS
         jmp     restore_c_regs_and_iret
  END(entry_SYSCALL_64)
@@ -509,6 +514,7 @@ END(irq_entries_start)
          * tracking that we're in kernel mode.
          */
         SWAPGS
+       SWITCH_KERNEL_CR3
  
         /*
          * We need to tell lockdep that IRQs are off.  We can't do this until
@@ -568,6 +574,7 @@ GLOBAL(retint_user)
         mov     %rsp,%rdi
         call    prepare_exit_to_usermode
         TRACE_IRQS_IRETQ
+       SWITCH_USER_CR3
         SWAPGS
         jmp     restore_regs_and_iret
  
@@ -625,6 +632,7 @@ native_irq_return_ldt:
         pushq   %rax
         pushq   %rdi
         SWAPGS
+       SWITCH_KERNEL_CR3
         movq    PER_CPU_VAR(espfix_waddr), %rdi
         movq    %rax, (0*8)(%rdi)               /* RAX */
         movq    (2*8)(%rsp), %rax               /* RIP */
@@ -640,6 +648,7 @@ native_irq_return_ldt:
         andl    $0xffff0000, %eax
         popq    %rdi
         orq     PER_CPU_VAR(espfix_stack), %rax
+       SWITCH_USER_CR3
         SWAPGS
         movq    %rax, %rsp
         popq    %rax
@@ -1007,6 +1016,7 @@ ENTRY(paranoid_entry)
         testl   %edx, %edx
         js      1f                              /* negative -> in kernel */
         SWAPGS
+       SWITCH_KERNEL_CR3
         xorl    %ebx, %ebx
  1:     ret
  END(paranoid_entry)
@@ -1029,6 +1039,7 @@ ENTRY(paranoid_exit)
         testl   %ebx, %ebx                      /* swapgs needed? */
         jnz     paranoid_exit_no_swapgs
         TRACE_IRQS_IRETQ
+       SWITCH_USER_CR3_NO_STACK
         SWAPGS_UNSAFE_STACK
         jmp     paranoid_exit_restore
  paranoid_exit_no_swapgs:
@@ -1058,6 +1069,7 @@ ENTRY(error_entry)
          * from user mode due to an IRET fault.
          */
         SWAPGS
+       SWITCH_KERNEL_CR3
  
  .Lerror_entry_from_usermode_after_swapgs:
         /*
@@ -1110,7 +1122,7 @@ ENTRY(error_entry)
          * Switch to kernel gsbase:
          */
         SWAPGS
-
+       SWITCH_KERNEL_CR3
         /*
          * Pretend that the exception came from user mode: set up pt_regs
          * as if we faulted immediately after IRET and clear EBX so that
@@ -1210,6 +1222,7 @@ ENTRY(nmi)
          */
  
         SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
         cld
         movq    %rsp, %rdx
         movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1250,6 +1263,7 @@ ENTRY(nmi)
          * work, because we don't want to enable interrupts.  Fortunately,
          * do_nmi doesn't modify pt_regs.
          */
+       SWITCH_USER_CR3
         SWAPGS
         jmp     restore_c_regs_and_iret
  
@@ -1461,6 +1475,7 @@ end_repeat_nmi:
         testl   %ebx, %ebx                      /* swapgs needed? */
         jnz     nmi_restore
  nmi_swapgs:
+       SWITCH_USER_CR3_NO_STACK
         SWAPGS_UNSAFE_STACK
  nmi_restore:
         RESTORE_EXTRA_REGS
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S

index 15cfeba..fe19119 100644 (file)
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -13,6 +13,7 @@
  #include <asm/irqflags.h>
  #include <asm/asm.h>
  #include <asm/smap.h>
+#include <asm/kaiser.h>
  #include <linux/linkage.h>
  #include <linux/err.h>
  
@@ -50,6 +51,7 @@ ENDPROC(native_usergs_sysret32)
  ENTRY(entry_SYSENTER_compat)
         /* Interrupts are off on entry. */
         SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
         movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
  
         /*
@@ -161,6 +163,7 @@ ENDPROC(entry_SYSENTER_compat)
  ENTRY(entry_SYSCALL_compat)
         /* Interrupts are off on entry. */
         SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
  
         /* Stash user ESP and switch to the kernel stack. */
         movl    %esp, %r8d
@@ -208,6 +211,7 @@ ENTRY(entry_SYSCALL_compat)
         /* Opportunistic SYSRET */
  sysret32_from_system_call:
         TRACE_IRQS_ON                   /* User mode traces as IRQs on. */
+       SWITCH_USER_CR3
         movq    RBX(%rsp), %rbx         /* pt_regs->rbx */
         movq    RBP(%rsp), %rbp         /* pt_regs->rbp */
         movq    EFLAGS(%rsp), %r11      /* pt_regs->flags (in r11) */
@@ -269,6 +273,7 @@ ENTRY(entry_INT80_compat)
         PARAVIRT_ADJUST_EXCEPTION_FRAME
         ASM_CLAC                        /* Do this early to minimize exposure */
         SWAPGS
+       SWITCH_KERNEL_CR3_NO_STACK
  
         /*
          * User tracing code (ptrace or signal handlers) might assume that
@@ -311,6 +316,7 @@ ENTRY(entry_INT80_compat)
  
         /* Go back to user mode. */
         TRACE_IRQS_ON
+       SWITCH_USER_CR3
         SWAPGS
         jmp     restore_regs_and_iret
  END(entry_INT80_compat)
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h

index 59caa55..ee52ff8 100644 (file)
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -187,7 +187,7 @@ extern char irq_entries_start[];
  #define VECTOR_RETRIGGERED     ((void *)~0UL)
  
  typedef struct irq_desc* vector_irq_t[NR_VECTORS];
-DECLARE_PER_CPU(vector_irq_t, vector_irq);
+DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
  
  #endif /* !ASSEMBLY_ */
  
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h

new file mode 100644 (file)

index 0000000..63ee830
--- /dev/null
+++ b/arch/x86/include/asm/kaiser.h
@@ -0,0 +1,113 @@
+#ifndef _ASM_X86_KAISER_H
+#define _ASM_X86_KAISER_H
+
+/* This file contains the definitions for the KAISER feature.
+ * KAISER is a countermeasure against x86_64 side-channel attacks on the kernel virtual memory.
+ * Every process has a shadow pgd. The shadow pgd maps only a minimal part of the kernel,
+ * but includes the whole user memory. On entry to the kernel (e.g. a system call or an interrupt),
+ * the pgd is switched to the normal one. When the system returns to user mode, the shadow pgd is enabled.
+ * This way the virtual memory caches are flushed on every switch, and user mode cannot probe the whole kernel memory.
+ *
+ * The minimal kernel mapping holds the parts that must stay mapped in user mode, such as the entry/exit
+ * code for user space, or the stacks.
+ */
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_KAISER
+
+/*
+ * _SWITCH_TO_KERNEL_CR3 reg / _SWITCH_TO_USER_CR3 reg
+ *
+ * Read CR3 into \reg, clear (kernel) or set (user) bit 0x1000 and write the
+ * result back to CR3. \reg is overwritten and andq/orq modify the arithmetic
+ * flags; neither is saved here.
+ *
+ * NOTE(review): 0x1000 is presumably PAGE_SIZE, i.e. the distance between a
+ * pgd page and the shadow pgd page that native_get_shadow_pgd() selects by
+ * OR-ing PAGE_SIZE into the pgd address - confirm for the page size in use.
+ */
+.macro _SWITCH_TO_KERNEL_CR3 reg
+movq %cr3, \reg
+andq $(~0x1000), \reg
+movq \reg, %cr3
+.endm
+
+.macro _SWITCH_TO_USER_CR3 reg
+movq %cr3, \reg
+orq $(0x1000), \reg
+movq \reg, %cr3
+.endm
+
+/*
+ * SWITCH_KERNEL_CR3 / SWITCH_USER_CR3
+ *
+ * Switch CR3 using %rax as scratch register, preserving %rax on the current
+ * stack. %rax is pushed before CR3 is written and popped afterwards, so the
+ * stack in use must be accessible through both sets of page tables.
+ * The arithmetic flags are modified.
+ */
+.macro SWITCH_KERNEL_CR3
+pushq %rax
+_SWITCH_TO_KERNEL_CR3 %rax
+popq %rax
+.endm
+
+.macro SWITCH_USER_CR3
+pushq %rax
+_SWITCH_TO_USER_CR3 %rax
+popq %rax
+.endm
+
+/*
+ * SWITCH_KERNEL_CR3_NO_STACK / SWITCH_USER_CR3_NO_STACK
+ *
+ * Same CR3 switch as the stack-based variants, but %rax is preserved in the
+ * per-CPU variable unsafe_stack_register_backup instead of on the stack, for
+ * places where no usable stack is available. The variable is written before
+ * CR3 changes and read afterwards, so it must be accessible through both sets
+ * of page tables (it is declared with DECLARE_PER_CPU_USER_MAPPED).
+ * The arithmetic flags are modified.
+ *
+ * NOTE(review): PER_CPU_VAR() presumably needs the kernel GS base; the call
+ * sites place the KERNEL variant after SWAPGS and the USER variant before it.
+ */
+.macro SWITCH_KERNEL_CR3_NO_STACK
+movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+_SWITCH_TO_KERNEL_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+.endm
+
+
+.macro SWITCH_USER_CR3_NO_STACK
+
+movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+_SWITCH_TO_USER_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+
+.endm
+
+#else /* CONFIG_KAISER */
+
+/*
+ * !CONFIG_KAISER: the CR3 switch macros expand to nothing. They take no
+ * arguments, exactly like the CONFIG_KAISER definitions, so an invocation
+ * assembles the same way in both configurations.
+ */
+.macro SWITCH_KERNEL_CR3
+.endm
+.macro SWITCH_USER_CR3
+.endm
+.macro SWITCH_USER_CR3_NO_STACK
+.endm
+.macro SWITCH_KERNEL_CR3_NO_STACK
+.endm
+
+#endif /* CONFIG_KAISER */
+#else /* __ASSEMBLY__ */
+
+
+#ifdef CONFIG_KAISER
+// On a kernel/user mode switch, the address space may have to be
+// switched before any register has been saved and before a usable stack exists.
+// Changing the address space needs a scratch register, so that register
+// is saved to and restored from this per-CPU slot instead of the stack.
+//
+DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+#endif /* CONFIG_KAISER */
+
+/**
+ *  kaiser_add_mapping - map a virtual memory range into the shadow mapping
+ *  @addr: the start address of the range
+ *  @size: the size of the range
+ *  @flags: the mapping flags of the pages
+ *
+ *  The mapping is added on a global scope, so no further synchronization is needed.
+ *  The pages have to be unmapped manually once they are no longer needed.
+ */
+extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
+
+
+/**
+ *  kaiser_remove_mapping - unmap a virtual memory range from the shadow mapping
+ *  @start: the start address of the range
+ *  @size: the size of the range
+ */
+extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
+
+/**
+ *  kaiser_init - initialize the shadow mapping
+ *
+ *  Most parts of the shadow mapping can be set up at boot time.
+ *  Only the thread stacks have to be mapped at runtime.
+ *  The regions mapped here are never unmapped.
+ */
+extern void kaiser_init(void);
+
+#endif
+
+
+
+#endif /* _ASM_X86_KAISER_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h

index 6ec0c8b..6a843d2 100644 (file)
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -856,6 +856,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
  static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
  {
         memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_KAISER
+       // Copy the same 'count' entries between the shadow pgds that belong to src and dst.
+       memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t));
+#endif
  }
  
  #define PTE_SHIFT ilog2(PTRS_PER_PTE)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h

index 2ee7811..2131edd 100644 (file)
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -106,9 +106,30 @@ static inline void native_pud_clear(pud_t *pud)
         native_set_pud(pud, native_make_pud(0));
  }
  
+#ifdef CONFIG_KAISER
+/*
+ * Return the shadow counterpart of a pgd pointer: the same address with the
+ * PAGE_SIZE bit set.
+ */
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+       unsigned long shadow = (unsigned long)pgdp;
+
+       shadow |= (unsigned long)PAGE_SIZE;
+       return (pgd_t *)shadow;
+}
+
+/*
+ * Return the normal (non-shadow) counterpart of a pgd pointer: the same
+ * address with the PAGE_SIZE bit cleared.
+ */
+static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
+{
+       unsigned long normal = (unsigned long)pgdp;
+
+       normal &= ~(unsigned long)PAGE_SIZE;
+       return (pgd_t *)normal;
+}
+#endif /* CONFIG_KAISER */
+
  static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
  {
+#ifdef CONFIG_KAISER
+       // A pgd page is page aligned, so the offset of pgdp within its page tells
+       // which half of the table it is in. Entries in the lower half (the user-space
+       // part) are also written, unmodified, to the corresponding shadow pgd slot.
+       if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) {
+               native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+       }
+
+       // The entry stored at pgdp itself always has _PAGE_USER cleared.
+       // NOTE(review): if pgdp already points into a shadow pgd page,
+       // native_get_shadow_pgd(pgdp) is pgdp itself and this store overwrites the
+       // one above - confirm no caller passes a lower-half shadow pointer.
+       pgdp->pgd = pgd.pgd & ~_PAGE_USER;
+#else /* CONFIG_KAISER */
         *pgdp = pgd;
+#endif
  }
  
  static inline void native_pgd_clear(pgd_t *pgd)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h

index 79c9185..9846704 100644 (file)
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -39,7 +39,11 @@
  #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
  #define _PAGE_DIRTY    (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
  #define _PAGE_PSE      (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
-#define _PAGE_GLOBAL   (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#ifdef CONFIG_KAISER
+#define _PAGE_GLOBAL   (_AT(pteval_t, 0))
+#else
+#define _PAGE_GLOBAL  (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#endif
  #define _PAGE_SOFTW1   (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
  #define _PAGE_SOFTW2   (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
  #define _PAGE_PAT      (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
@@ -89,7 +93,11 @@
  #define _PAGE_NX       (_AT(pteval_t, 0))
  #endif
  
-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#ifdef CONFIG_KAISER
+#define _PAGE_PROTNONE (_AT(pteval_t, 0))
+#else
+#define _PAGE_PROTNONE  (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#endif
  
  #define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |        \
                          _PAGE_ACCESSED | _PAGE_DIRTY)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h

index 2d5a50c..6a2e0a0 100644 (file)
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -305,7 +305,7 @@ struct tss_struct {
  
  } ____cacheline_aligned;
  
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
  
  #ifdef CONFIG_X86_32
  DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
@@ -332,6 +332,11 @@ union irq_stack_union {
                 char gs_base[40];
                 unsigned long stack_canary;
         };
+
+       struct {
+               char irq_stack_pointer[64];
+               char unused[IRQ_STACK_SIZE - 64];
+       };
  };
  
  DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c

index aa1e724..e5ba970 100644 (file)
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -92,7 +92,7 @@ static const struct cpu_dev default_cpu = {
  
  static const struct cpu_dev *this_cpu = &default_cpu;
  
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
  #ifdef CONFIG_X86_64
         /*
          * We need valid kernel segments for data and code in long mode too
@@ -1229,7 +1229,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
           [DEBUG_STACK - 1]                     = DEBUG_STKSZ
  };
  
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
         [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
  
  /* May not be marked __init: used by software suspend */
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c

index 4d38416..bd1358d 100644 (file)
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -41,6 +41,7 @@
  #include <asm/pgalloc.h>
  #include <asm/setup.h>
  #include <asm/espfix.h>
+#include <asm/kaiser.h>
  
  /*
   * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
@@ -126,6 +127,11 @@ void __init init_espfix_bsp(void)
         /* Install the espfix pud into the kernel page directory */
         pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
         pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+#ifdef CONFIG_KAISER
+       // add the esp stack pud to the shadow mapping here.
+       // This can be done directly, because the fixup stack has its own pud
+       set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page)));
+#endif
  
         /* Randomize the locations */
         init_espfix_random();
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S

index ffdc0e8..0a8d18a 100644 (file)
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -441,6 +441,14 @@ early_idt_ripmsg:
         .balign PAGE_SIZE; \
  GLOBAL(name)
  
+/*
+ * NEXT_PGD_PAGE(name): like NEXT_PAGE(name), but with CONFIG_KAISER the label
+ * is aligned to 2 * PAGE_SIZE instead of PAGE_SIZE. The macro only aligns and
+ * defines the label; the user is responsible for the size of what follows.
+ */
+#ifdef CONFIG_KAISER
+#define NEXT_PGD_PAGE(name) \
+       .balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#endif
+
  /* Automate the creation of 1 to 1 mapping pmd entries */
  #define PMDS(START, PERM, COUNT)                       \
         i = 0 ;                                         \
@@ -450,7 +458,7 @@ GLOBAL(name)
         .endr
  
         __INITDATA
-NEXT_PAGE(early_level4_pgt)
+NEXT_PGD_PAGE(early_level4_pgt)
         .fill   511,8,0
         .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
  
@@ -460,10 +468,10 @@ NEXT_PAGE(early_dynamic_pgts)
         .data
  
  #ifndef CONFIG_XEN
-NEXT_PAGE(init_level4_pgt)
-       .fill   512,8,0
+NEXT_PGD_PAGE(init_level4_pgt)
+       .fill   2*512,8,0
  #else
-NEXT_PAGE(init_level4_pgt)
+NEXT_PGD_PAGE(init_level4_pgt)
         .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
         .org    init_level4_pgt + L4_PAGE_OFFSET*8, 0
         .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c

index 1423ab1..f480b38 100644 (file)
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -51,7 +51,7 @@ static struct irqaction irq2 = {
         .flags = IRQF_NO_THREAD,
  };
  
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
         [0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
  };
  
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c

index 9f7c21c..7c5c5dc 100644 (file)
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -39,7 +39,7 @@
   * section. Since TSS's are completely CPU-local, we want them
   * on exact cacheline boundaries, to eliminate cacheline ping-pong.
   */
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
         .x86_tss = {
                 .sp0 = TOP_OF_INIT_STACK,
  #ifdef CONFIG_X86_32
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile

index 1ae7c14..9781560 100644 (file)
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -32,3 +32,4 @@ obj-$(CONFIG_ACPI_NUMA)               += srat.o
  obj-$(CONFIG_NUMA_EMU)         += numa_emulation.o
  
  obj-$(CONFIG_X86_INTEL_MPX)    += mpx.o
+obj-$(CONFIG_KAISER)           += kaiser.o
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c

new file mode 100644 (file)

index 0000000..cf1bb92
--- /dev/null
+++ b/arch/x86/mm/kaiser.c
@@ -0,0 +1,160 @@
+
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+
+#include <linux/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/desc.h>
+#ifdef CONFIG_KAISER
+
+__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+/*
+ * get_pa_from_mapping - translate a kernel virtual address by walking the
+ * kernel page tables
+ * @address: the virtual address to look up
+ *
+ * Walks pgd -> pud -> pmd -> pte starting from pgd_offset_k() and stops at the
+ * first large mapping. Hits BUG() when an entry on the way is not populated
+ * or when the pgd entry itself is large.
+ *
+ * Returns the physical address that @address is mapped to.
+ */
+static inline unsigned long get_pa_from_mapping(unsigned long address)
+{
+       pgd_t *l4 = pgd_offset_k(address);
+       pud_t *l3;
+       pmd_t *l2;
+       pte_t *l1;
+
+       BUG_ON(pgd_none(*l4) || pgd_large(*l4));
+
+       l3 = pud_offset(l4, address);
+       BUG_ON(pud_none(*l3));
+       if (pud_large(*l3))
+               return (pud_pfn(*l3) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK);
+
+       l2 = pmd_offset(l3, address);
+       BUG_ON(pmd_none(*l2));
+       if (pmd_large(*l2))
+               return (pmd_pfn(*l2) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK);
+
+       l1 = pte_offset_kernel(l2, address);
+       BUG_ON(pte_none(*l1));
+
+       return (pte_pfn(*l1) << PAGE_SHIFT) | (address & ~PAGE_MASK);
+}
+
+/*
+ * _kaiser_copy - mirror a kernel virtual range into the shadow page tables
+ * @start_addr: first virtual address of the range (need not be page aligned)
+ * @size: length of the range in bytes
+ * @flags: pte flags used for the shadow mappings
+ *
+ * Every page touched by [start_addr, start_addr + size) is looked up in the
+ * kernel page tables and mapped at the same virtual address, with @flags, in
+ * the shadow page tables. Missing shadow pmd/pte pages are allocated here; the
+ * shadow pgd entry must already be populated. A page that is already present
+ * in the shadow tables must map the same physical page, otherwise BUG().
+ *
+ * The callers in this file hold shadow_table_lock around this function.
+ */
+void _kaiser_copy (unsigned long start_addr, unsigned long size,
+                                       unsigned long flags)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+       pmd_t *new_pmd;
+       pte_t *new_pte;
+       unsigned long address;
+       unsigned long end_addr = PAGE_ALIGN(start_addr + size);
+       unsigned long target_address;
+
+       /* Start at the page that contains start_addr. */
+       for (address = start_addr & PAGE_MASK; address < end_addr;
+                       address += PAGE_SIZE) {
+               target_address = get_pa_from_mapping(address);
+
+               pgd = native_get_shadow_pgd(pgd_offset_k(address));
+
+               BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n");
+               BUG_ON(pgd_large(*pgd));
+
+               pud = pud_offset(pgd, address);
+               if (pud_none(*pud)) {
+                       new_pmd = pmd_alloc_one(0, address);
+                       /* __pa(NULL) must never end up in a page-table entry */
+                       BUG_ON(!new_pmd);
+                       set_pud(pud, __pud(_PAGE_TABLE | __pa(new_pmd)));
+               }
+               BUG_ON(pud_large(*pud));
+
+               pmd = pmd_offset(pud, address);
+               if (pmd_none(*pmd)) {
+                       new_pte = pte_alloc_one_kernel(0, address);
+                       BUG_ON(!new_pte);
+                       set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(new_pte)));
+               }
+               BUG_ON(pmd_large(*pmd));
+
+               pte = pte_offset_kernel(pmd, address);
+               if (pte_none(*pte)) {
+                       set_pte(pte, __pte(flags | target_address));
+               } else {
+                       /*
+                        * Compare the physical page the existing pte maps with
+                        * the one we would map. pte_page() yields a struct page
+                        * pointer, so __pa() of it is not the mapped address.
+                        */
+                       BUG_ON((pte_pfn(*pte) << PAGE_SHIFT) !=
+                              (target_address & PAGE_MASK));
+               }
+       }
+}
+
+// First, allocate a pud page for every pgd entry in the upper half
+// (indices PTRS_PER_PGD / 2 .. PTRS_PER_PGD - 1) of the shadow pgd that belongs
+// to the kernel pgd. _kaiser_copy() relies on these entries being populated.
+static inline void __init _kaiser_init(void)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       int i;
+
+       pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
+       for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
+               pud = pud_alloc_one(0, 0);
+               // a failed allocation must not be installed as __pa(NULL)
+               BUG_ON(!pud);
+               set_pgd(pgd + i, __pgd(_PAGE_TABLE | __pa(pud)));
+       }
+}
+
+extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+spinlock_t shadow_table_lock;
+// Build the boot-time part of the shadow mapping: populate the shadow pgd via
+// _kaiser_init(), then mirror the user-mapped per-CPU data of every possible
+// CPU, the entry text and the IDT into it. All of this runs with
+// shadow_table_lock held.
+void __init kaiser_init(void)
+{
+       int cpu;
+       spin_lock_init(&shadow_table_lock);
+
+       spin_lock(&shadow_table_lock);
+
+       _kaiser_init();
+
+       for_each_possible_cpu(cpu) {
+               // map this CPU's copy of the range between __per_cpu_user_mapped_start
+               // and __per_cpu_user_mapped_end
+               _kaiser_copy(
+                               (unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)),
+                               (unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start,
+                               __PAGE_KERNEL);
+       }
+
+       // map the entry/exit text section, which contains the code that switches between user and kernel mode
+       _kaiser_copy(
+                       (unsigned long) __entry_text_start,
+                       (unsigned long) __entry_text_end - (unsigned long) __entry_text_start,
+                       __PAGE_KERNEL_RX);
+
+       // map NR_VECTORS gate descriptors starting at the address in idt_descr, read-only
+       _kaiser_copy(
+                       (unsigned long) idt_descr.address,
+                       sizeof(gate_desc) * NR_VECTORS,
+                       __PAGE_KERNEL_RO);
+
+       spin_unlock(&shadow_table_lock);
+}
+
+// Add [addr, addr + size) to the shadow mapping with the given pte flags.
+// Serialized against other shadow-table updates by shadow_table_lock.
+// NOTE(review): _kaiser_copy() may allocate page-table pages while the
+// spinlock is held - confirm those allocations cannot sleep.
+void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+{
+       spin_lock(&shadow_table_lock);
+       _kaiser_copy(addr, size, flags);
+       spin_unlock(&shadow_table_lock);
+}
+
+extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end);
+/*
+ * kaiser_remove_mapping - remove [start, start + size) from the shadow mapping
+ * @start: first virtual address of the range
+ * @size: length of the range in bytes
+ *
+ * The range is processed one pgd slot at a time: each shadow pgd entry is
+ * handed only the part of the range it covers, and a range that ends exactly
+ * on a pgd boundary does not touch the following slot. An empty range is a
+ * no-op. Serialized by shadow_table_lock.
+ */
+void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+       unsigned long end = start + size;
+       unsigned long addr;
+       unsigned long next;
+       pgd_t *pgd;
+
+       spin_lock(&shadow_table_lock);
+       for (addr = start; addr < end; addr = next) {
+               pgd = native_get_shadow_pgd(pgd_offset_k(addr));
+               /* clamp to the end of this pgd slot or of the range */
+               next = pgd_addr_end(addr, end);
+               unmap_pud_range(pgd, addr, next);
+       }
+       spin_unlock(&shadow_table_lock);
+}
+#endif /* CONFIG_KAISER */
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c

index b599a78..4fe616b 100644 (file)
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -829,7 +829,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
                         pud_clear(pud);
  }
  
-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
  {
         pud_t *pud = pud_offset(pgd, start);
  
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c

index fb0a9dd..087d3e1 100644 (file)
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -342,12 +342,38 @@ static inline void _pgd_free(pgd_t *pgd)
  #else
  static inline pgd_t *_pgd_alloc(void)
  {
+#ifdef CONFIG_KAISER
+       // Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory
+       // block. Therefore, we have to allocate at least 3 pages. However, the
+       // __get_free_pages returns us 4 pages. Hence, we store the base pointer at
+       // the beginning of the page of our 8kb-aligned memory block in order to
+       // correctly free it afterwars.
+
+       unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE));
+
+       if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages)
+       {
+               *((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages;
+               return (pgd_t *) pages;
+       }
+       else
+       {
+               *((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages;
+               return (pgd_t *) (pages + PAGE_SIZE);
+       }
+#else
         return (pgd_t *)__get_free_page(PGALLOC_GFP);
+#endif
  }
  
  static inline void _pgd_free(pgd_t *pgd)
  {
+#ifdef CONFIG_KAISER
+  unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE));
+       free_pages(pages, get_order(4*PAGE_SIZE));
+#else
         free_page((unsigned long)pgd);
+#endif
  }
  #endif /* CONFIG_X86_PAE */
  
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h

index ef2e8c9..cc1c662 100644 (file)
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -725,7 +725,16 @@
   */
  #define PERCPU_INPUT(cacheline)                                                \
         VMLINUX_SYMBOL(__per_cpu_start) = .;                            \
-       *(.data..percpu..first)                                         \
+       \
+       VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .;        \
+       *(.data..percpu..first)           \
+       . = ALIGN(cacheline);           \
+       *(.data..percpu..user_mapped)            \
+       *(.data..percpu..user_mapped..shared_aligned)        \
+       . = ALIGN(PAGE_SIZE);           \
+       *(.data..percpu..user_mapped..page_aligned)          \
+       VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .;        \
+       \
         . = ALIGN(PAGE_SIZE);                                           \
         *(.data..percpu..page_aligned)                                  \
         . = ALIGN(cacheline);                                           \
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h

index 8f16299..8ea945f 100644 (file)
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -35,6 +35,12 @@
  
  #endif
  
+/*
+ * Section-name infix used by the *_USER_MAPPED per-CPU macros. With
+ * CONFIG_KAISER it is "..user_mapped"; without it the infix is empty, so the
+ * macros then use the same section names as their regular counterparts.
+ */
+#ifdef CONFIG_KAISER
+#define USER_MAPPED_SECTION "..user_mapped"
+#else
+#define USER_MAPPED_SECTION ""
+#endif
+
  /*
   * Base implementations of per-CPU variable declarations and definitions, where
   * the section in which the variable is to be placed is provided by the
@@ -115,6 +121,12 @@
  #define DEFINE_PER_CPU(type, name)                                     \
         DEFINE_PER_CPU_SECTION(type, name, "")
  
+/*
+ * Declaration/definition used for per-CPU variables that are placed in the
+ * USER_MAPPED_SECTION variant of the per-CPU section.
+ */
+#define DECLARE_PER_CPU_USER_MAPPED(type, name)         \
+       DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
+#define DEFINE_PER_CPU_USER_MAPPED(type, name)          \
+       DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
  /*
   * Declaration/definition used for per-CPU variables that must come first in
   * the set of variables.
@@ -144,6 +156,14 @@
         DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
         ____cacheline_aligned_in_smp
  
+/*
+ * Cacheline-aligned (on SMP) per-CPU variables whose section name is
+ * USER_MAPPED_SECTION followed by PER_CPU_SHARED_ALIGNED_SECTION.
+ */
+#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)                 \
+       DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+       ____cacheline_aligned_in_smp
+
+#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)                  \
+       DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+       ____cacheline_aligned_in_smp
+
  #define DECLARE_PER_CPU_ALIGNED(type, name)                            \
         DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION)    \
         ____cacheline_aligned
@@ -162,6 +182,16 @@
  #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name)                                \
         DEFINE_PER_CPU_SECTION(type, name, "..page_aligned")            \
         __aligned(PAGE_SIZE)
+/*
+ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
+ */
+#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)      \
+  DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned")   \
+  __aligned(PAGE_SIZE)
+
+#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)       \
+  DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned")    \
+  __aligned(PAGE_SIZE)
  
  /*
   * Declaration/definition used for per-CPU variables that must be read mostly.
diff --git a/init/main.c b/init/main.c

index 9e64d70..0cc5c4b 100644 (file)
--- a/init/main.c
+++ b/init/main.c
@@ -87,6 +87,9 @@
  #include <asm/setup.h>
  #include <asm/sections.h>
  #include <asm/cacheflush.h>
+#ifdef CONFIG_KAISER
+#include <asm/kaiser.h>
+#endif
  
  static int kernel_init(void *);
  
@@ -492,6 +495,9 @@ static void __init mm_init(void)
         pgtable_init();
         vmalloc_init();
         ioremap_huge_init();
+#ifdef CONFIG_KAISER
+       kaiser_init();
+#endif
  }
  
  asmlinkage __visible void __init start_kernel(void)
diff --git a/kernel/fork.c b/kernel/fork.c

index 68cfda1..8f1931f 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -167,8 +167,12 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
         return page ? page_address(page) : NULL;
  }
  
+extern void kaiser_remove_mapping(unsigned long start_addr, unsigned long size);
  static inline void free_thread_info(struct thread_info *ti)
  {
+#ifdef CONFIG_KAISER
+       /* Remove the THREAD_SIZE bytes at ti from the shadow mapping before the pages are freed. */
+       kaiser_remove_mapping((unsigned long)ti, THREAD_SIZE);
+#endif
         free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
  }
  # else
@@ -331,6 +335,7 @@ void set_task_stack_end_magic(struct task_struct *tsk)
         *stackend = STACK_END_MAGIC;    /* for overflow detection */
  }
  
+extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
  static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
  {
         struct task_struct *tsk;
@@ -352,6 +357,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
                 goto free_ti;
  
         tsk->stack = ti;
+#ifdef CONFIG_KAISER
+       kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
+#endif
  #ifdef CONFIG_SECCOMP
         /*
          * We must handle setting up seccomp filters once we're under
diff --git a/security/Kconfig b/security/Kconfig

index e452378..cb2a9bc 100644 (file)
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -30,6 +30,13 @@ config SECURITY
           model will be used.
  
           If you are unsure how to answer this question, answer N.
+config KAISER
+       bool "Remove the kernel mapping in user mode"
+       depends on X86_64
+       depends on !PARAVIRT
+       help
+         This enforces a strict kernel and user space isolation in order to close
+         hardware side channels on kernel address information.
  
  config SECURITYFS
         bool "Enable the securityfs filesystem"
author	Richard Fellner <richard.fellner@student.tugraz.at>
	Thu, 4 May 2017 12:26:50 +0000 (14:26 +0200)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Fri, 5 Jan 2018 14:44:23 +0000 (15:44 +0100)
arch/x86/entry/entry_64.S		patch \| blob \| history
arch/x86/entry/entry_64_compat.S		patch \| blob \| history
arch/x86/include/asm/hw_irq.h		patch \| blob \| history
arch/x86/include/asm/kaiser.h	[new file with mode: 0644]	patch \| blob
arch/x86/include/asm/pgtable.h		patch \| blob \| history
arch/x86/include/asm/pgtable_64.h		patch \| blob \| history
arch/x86/include/asm/pgtable_types.h		patch \| blob \| history
arch/x86/include/asm/processor.h		patch \| blob \| history
arch/x86/kernel/cpu/common.c		patch \| blob \| history
arch/x86/kernel/espfix_64.c		patch \| blob \| history
arch/x86/kernel/head_64.S		patch \| blob \| history
arch/x86/kernel/irqinit.c		patch \| blob \| history
arch/x86/kernel/process.c		patch \| blob \| history
arch/x86/mm/Makefile		patch \| blob \| history
arch/x86/mm/kaiser.c	[new file with mode: 0644]	patch \| blob
arch/x86/mm/pageattr.c		patch \| blob \| history
arch/x86/mm/pgtable.c		patch \| blob \| history
include/asm-generic/vmlinux.lds.h		patch \| blob \| history
include/linux/percpu-defs.h		patch \| blob \| history
init/main.c		patch \| blob \| history
kernel/fork.c		patch \| blob \| history
security/Kconfig		patch \| blob \| history