kaiser: merged update

author Dave Hansen <dave.hansen@linux.intel.com>

Wed, 30 Aug 2017 23:23:00 +0000 (16:23 -0700)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Fri, 5 Jan 2018 14:46:32 +0000 (15:46 +0100)
author Dave Hansen <dave.hansen@linux.intel.com>
Wed, 30 Aug 2017 23:23:00 +0000 (16:23 -0700)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 5 Jan 2018 14:46:32 +0000 (15:46 +0100)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S

index 9467a2c..2869fc1 100644 (file)
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -230,6 +230,13 @@ entry_SYSCALL_64_fastpath:
         movq    RIP(%rsp), %rcx
         movq    EFLAGS(%rsp), %r11
         RESTORE_C_REGS_EXCEPT_RCX_R11
+       /*
+        * This opens a window where we have a user CR3, but are
+        * running in the kernel.  This makes using the CS
+        * register useless for telling whether or not we need to
+        * switch CR3 in NMIs.  Normal interrupts are OK because
+        * they are off here.
+        */
         SWITCH_USER_CR3
         movq    RSP(%rsp), %rsp
         USERGS_SYSRET64
@@ -326,11 +333,25 @@ return_from_SYSCALL_64:
  syscall_return_via_sysret:
         /* rcx and r11 are already restored (see code above) */
         RESTORE_C_REGS_EXCEPT_RCX_R11
+       /*
+        * This opens a window where we have a user CR3, but are
+        * running in the kernel.  This makes using the CS
+        * register useless for telling whether or not we need to
+        * switch CR3 in NMIs.  Normal interrupts are OK because
+        * they are off here.
+        */
         SWITCH_USER_CR3
         movq    RSP(%rsp), %rsp
         USERGS_SYSRET64
  
  opportunistic_sysret_failed:
+       /*
+        * This opens a window where we have a user CR3, but are
+        * running in the kernel.  This makes using the CS
+        * register useless for telling whether or not we need to
+        * switch CR3 in NMIs.  Normal interrupts are OK because
+        * they are off here.
+        */
         SWITCH_USER_CR3
         SWAPGS
         jmp     restore_c_regs_and_iret
@@ -1087,6 +1108,13 @@ ENTRY(error_entry)
         cld
         SAVE_C_REGS 8
         SAVE_EXTRA_REGS 8
+       /*
+        * error_entry() always returns with a kernel gsbase and
+        * CR3.  We must also have a kernel CR3/gsbase before
+        * calling TRACE_IRQS_*.  Just unconditionally switch to
+        * the kernel CR3 here.
+        */
+       SWITCH_KERNEL_CR3
         xorl    %ebx, %ebx
         testb   $3, CS+8(%rsp)
         jz      .Lerror_kernelspace
@@ -1096,7 +1124,6 @@ ENTRY(error_entry)
          * from user mode due to an IRET fault.
          */
         SWAPGS
-       SWITCH_KERNEL_CR3
  
  .Lerror_entry_from_usermode_after_swapgs:
         /*
@@ -1148,7 +1175,6 @@ ENTRY(error_entry)
          * Switch to kernel gsbase:
          */
         SWAPGS
-       SWITCH_KERNEL_CR3
  
         /*
          * Pretend that the exception came from user mode: set up pt_regs
@@ -1249,7 +1275,10 @@ ENTRY(nmi)
          */
  
         SWAPGS_UNSAFE_STACK
-       SWITCH_KERNEL_CR3_NO_STACK
+       /*
+        * percpu variables are mapped with user CR3, so no need
+        * to switch CR3 here.
+        */
         cld
         movq    %rsp, %rdx
         movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1283,14 +1312,33 @@ ENTRY(nmi)
  
         movq    %rsp, %rdi
         movq    $-1, %rsi
+#ifdef CONFIG_KAISER
+       /* Unconditionally use kernel CR3 for do_nmi() */
+       /* %rax is saved above, so OK to clobber here */
+       movq    %cr3, %rax
+       pushq   %rax
+#ifdef CONFIG_KAISER_REAL_SWITCH
+       andq    $(~0x1000), %rax
+#endif
+       movq    %rax, %cr3
+#endif
         call    do_nmi
+       /*
+        * Unconditionally restore CR3.  I know we return to
+        * kernel code that needs user CR3, but do we ever return
+        * to "user mode" where we need the kernel CR3?
+        */
+#ifdef CONFIG_KAISER
+       popq    %rax
+       mov     %rax, %cr3
+#endif
  
         /*
          * Return back to user mode.  We must *not* do the normal exit
-        * work, because we don't want to enable interrupts.  Fortunately,
-        * do_nmi doesn't modify pt_regs.
+        * work, because we don't want to enable interrupts.  Do not
+        * switch to user CR3: we might be going back to kernel code
+        * that had a user CR3 set.
          */
-       SWITCH_USER_CR3
         SWAPGS
         jmp     restore_c_regs_and_iret
  
@@ -1486,23 +1534,54 @@ end_repeat_nmi:
         ALLOC_PT_GPREGS_ON_STACK
  
         /*
-        * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
-        * as we should not be calling schedule in NMI context.
-        * Even with normal interrupts enabled. An NMI should not be
-        * setting NEED_RESCHED or anything that normal interrupts and
-        * exceptions might do.
+        * Use the same approach as paranoid_entry to handle SWAPGS, but
+        * without CR3 handling since we do that differently in NMIs.  No
+        * need to use paranoid_exit as we should not be calling schedule
+        * in NMI context.  Even with normal interrupts enabled. An NMI
+        * should not be setting NEED_RESCHED or anything that normal
+        * interrupts and exceptions might do.
          */
-       call    paranoid_entry
+       cld
+       SAVE_C_REGS
+       SAVE_EXTRA_REGS
+       movl    $1, %ebx
+       movl    $MSR_GS_BASE, %ecx
+       rdmsr
+       testl   %edx, %edx
+       js      1f                              /* negative -> in kernel */
+       SWAPGS
+       xorl    %ebx, %ebx
+1:
+#ifdef CONFIG_KAISER
+       /* Unconditionally use kernel CR3 for do_nmi() */
+       /* %rax is saved above, so OK to clobber here */
+       movq    %cr3, %rax
+       pushq   %rax
+#ifdef CONFIG_KAISER_REAL_SWITCH
+       andq    $(~0x1000), %rax
+#endif
+       movq    %rax, %cr3
+#endif
  
         /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
         movq    %rsp, %rdi
+       addq    $8, %rdi /* point %rdi at ptregs, fixed up for CR3 */
         movq    $-1, %rsi
         call    do_nmi
+       /*
+        * Unconditionally restore CR3.  We might be returning to
+        * kernel code that needs user CR3, like just just before
+        * a sysret.
+        */
+#ifdef CONFIG_KAISER
+       popq    %rax
+       mov     %rax, %cr3
+#endif
  
         testl   %ebx, %ebx                      /* swapgs needed? */
         jnz     nmi_restore
  nmi_swapgs:
-       SWITCH_USER_CR3_NO_STACK
+       /* We fixed up CR3 above, so no need to switch it here */
         SWAPGS_UNSAFE_STACK
  nmi_restore:
         RESTORE_EXTRA_REGS
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h

index 63ee830..0703f48 100644 (file)
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -16,13 +16,17 @@
  
  .macro _SWITCH_TO_KERNEL_CR3 reg
  movq %cr3, \reg
+#ifdef CONFIG_KAISER_REAL_SWITCH
  andq $(~0x1000), \reg
+#endif
  movq \reg, %cr3
  .endm
  
  .macro _SWITCH_TO_USER_CR3 reg
  movq %cr3, \reg
+#ifdef CONFIG_KAISER_REAL_SWITCH
  orq $(0x1000), \reg
+#endif
  movq \reg, %cr3
  .endm
  
@@ -65,48 +69,53 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
  .endm
  
  #endif /* CONFIG_KAISER */
+
  #else /* __ASSEMBLY__ */
  
  
  #ifdef CONFIG_KAISER
-// Upon kernel/user mode switch, it may happen that
-// the address space has to be switched before the registers have been stored.
-// To change the address space, another register is needed.
-// A register therefore has to be stored/restored.
-//
-DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+/*
+ * Upon kernel/user mode switch, it may happen that the address
+ * space has to be switched before the registers have been
+ * stored.  To change the address space, another register is
+ * needed.  A register therefore has to be stored/restored.
+*/
  
-#endif /* CONFIG_KAISER */
+DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
  
  /**
- *  shadowmem_add_mapping - map a virtual memory part to the shadow mapping
+ *  kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
   *  @addr: the start address of the range
   *  @size: the size of the range
   *  @flags: The mapping flags of the pages
   *
- *  the mapping is done on a global scope, so no bigger synchronization has to be done.
- *  the pages have to be manually unmapped again when they are not needed any longer.
+ *  The mapping is done on a global scope, so no bigger
+ *  synchronization has to be done.  the pages have to be
+ *  manually unmapped again when they are not needed any longer.
   */
-extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
+extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
  
  
  /**
- *  shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping
+ *  kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
   *  @addr: the start address of the range
   *  @size: the size of the range
   */
  extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
  
  /**
- *  shadowmem_initialize_mapping - Initalize the shadow mapping
+ *  kaiser_initialize_mapping - Initalize the shadow mapping
   *
- *  most parts of the shadow mapping can be mapped upon boot time.
- *  only the thread stacks have to be mapped on runtime.
- *  the mapped regions are not unmapped at all.
+ *  Most parts of the shadow mapping can be mapped upon boot
+ *  time.  Only per-process things like the thread stacks
+ *  or a new LDT have to be mapped at runtime.  These boot-
+ *  time mappings are permanent and nevertunmapped.
   */
  extern void kaiser_init(void);
  
-#endif
+#endif /* CONFIG_KAISER */
+
+#endif /* __ASSEMBLY */
  
  
  
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h

index 4b479c9..1cee98e 100644 (file)
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -690,7 +690,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
  
  static inline int pgd_bad(pgd_t pgd)
  {
-       return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+       pgdval_t ignore_flags = _PAGE_USER;
+       /*
+        * We set NX on KAISER pgds that map userspace memory so
+        * that userspace can not meaningfully use the kernel
+        * page table by accident; it will fault on the first
+        * instruction it tries to run.  See native_set_pgd().
+        */
+       if (IS_ENABLED(CONFIG_KAISER))
+               ignore_flags |= _PAGE_NX;
+
+       return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
  }
  
  static inline int pgd_none(pgd_t pgd)
@@ -905,8 +915,10 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
  {
         memcpy(dst, src, count * sizeof(pgd_t));
  #ifdef CONFIG_KAISER
-       // clone the shadow pgd part as well
-       memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t));
+       /* Clone the shadow pgd part as well */
+       memcpy(native_get_shadow_pgd(dst),
+              native_get_shadow_pgd(src),
+              count * sizeof(pgd_t));
  #endif
  }
  
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h

index e6ea39f..000265c 100644 (file)
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -107,26 +107,58 @@ static inline void native_pud_clear(pud_t *pud)
  }
  
  #ifdef CONFIG_KAISER
-static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) {
+static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
+{
         return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE);
  }
  
-static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) {
+static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
+{
         return (pgd_t *)(void*)((unsigned long)(void*)pgdp &  ~(unsigned long)PAGE_SIZE);
  }
+#else
+static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
+{
+       BUILD_BUG_ON(1);
+       return NULL;
+}
+static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
+{
+       return pgdp;
+}
  #endif /* CONFIG_KAISER */
  
+/*
+ * Page table pages are page-aligned.  The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ * This returns true for user pages that need to get copied into
+ * both the user and kernel copies of the page tables, and false
+ * for kernel pages that should only be in the kernel copy.
+ */
+static inline bool is_userspace_pgd(void *__ptr)
+{
+       unsigned long ptr = (unsigned long)__ptr;
+
+       return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2));
+}
+
  static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
  {
  #ifdef CONFIG_KAISER
-       // We know that a pgd is page aligned.
-       // Therefore the lower indices have to be mapped to user space.
-       // These pages are mapped to the shadow mapping.
-       if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) {
+       pteval_t extra_kern_pgd_flags = 0;
+       /* Do we need to also populate the shadow pgd? */
+       if (is_userspace_pgd(pgdp)) {
                 native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+               /*
+                * Even if the entry is *mapping* userspace, ensure
+                * that userspace can not use it.  This way, if we
+                * get out to userspace running on the kernel CR3,
+                * userspace will crash instead of running.
+                */
+               extra_kern_pgd_flags = _PAGE_NX;
         }
-
-       pgdp->pgd = pgd.pgd & ~_PAGE_USER;
+       pgdp->pgd = pgd.pgd;
+       pgdp->pgd |= extra_kern_pgd_flags;
  #else /* CONFIG_KAISER */
         *pgdp = pgd;
  #endif
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h

index 00fecbb..8bc8d02 100644 (file)
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -48,7 +48,7 @@
  #ifdef CONFIG_KAISER
  #define _PAGE_GLOBAL   (_AT(pteval_t, 0))
  #else
-#define _PAGE_GLOBAL  (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#define _PAGE_GLOBAL   (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
  #endif
  #define _PAGE_SOFTW1   (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
  #define _PAGE_SOFTW2   (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
@@ -123,11 +123,7 @@
  #define _PAGE_DEVMAP   (_AT(pteval_t, 0))
  #endif
  
-#ifdef CONFIG_KAISER
-#define _PAGE_PROTNONE (_AT(pteval_t, 0))
-#else
  #define _PAGE_PROTNONE  (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
-#endif
  
  #define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |        \
                          _PAGE_ACCESSED | _PAGE_DIRTY)
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c

index 9ff875a..560c2fd 100644 (file)
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -127,11 +127,14 @@ void __init init_espfix_bsp(void)
         /* Install the espfix pud into the kernel page directory */
         pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
         pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
-#ifdef CONFIG_KAISER
-       // add the esp stack pud to the shadow mapping here.
-       // This can be done directly, because the fixup stack has its own pud
-       set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page)));
-#endif
+       /*
+        * Just copy the top-level PGD that is mapping the espfix
+        * area to ensure it is mapped into the shadow user page
+        * tables.
+        */
+       if (IS_ENABLED(CONFIG_KAISER))
+               set_pgd(native_get_shadow_pgd(pgd_p),
+                       __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
  
         /* Randomize the locations */
         init_espfix_random();
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S

index 9e849b5..5775379 100644 (file)
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -406,11 +406,24 @@ GLOBAL(early_recursion_flag)
  GLOBAL(name)
  
  #ifdef CONFIG_KAISER
+/*
+ * Each PGD needs to be 8k long and 8k aligned.  We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define KAISER_USER_PGD_FILL   512
+/* This ensures they are 8k-aligned: */
  #define NEXT_PGD_PAGE(name) \
         .balign 2 * PAGE_SIZE; \
  GLOBAL(name)
  #else
  #define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define KAISER_USER_PGD_FILL   0
  #endif
  
  /* Automate the creation of 1 to 1 mapping pmd entries */
@@ -425,6 +438,7 @@ GLOBAL(name)
  NEXT_PGD_PAGE(early_level4_pgt)
         .fill   511,8,0
         .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+       .fill   KAISER_USER_PGD_FILL,8,0
  
  NEXT_PAGE(early_dynamic_pgts)
         .fill   512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -433,7 +447,8 @@ NEXT_PAGE(early_dynamic_pgts)
  
  #ifndef CONFIG_XEN
  NEXT_PGD_PAGE(init_level4_pgt)
-       .fill   2*512,8,0
+       .fill   512,8,0
+       .fill   KAISER_USER_PGD_FILL,8,0
  #else
  NEXT_PGD_PAGE(init_level4_pgt)
         .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
@@ -442,6 +457,7 @@ NEXT_PGD_PAGE(init_level4_pgt)
         .org    init_level4_pgt + L4_START_KERNEL*8, 0
         /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
         .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+       .fill   KAISER_USER_PGD_FILL,8,0
  
  NEXT_PAGE(level3_ident_pgt)
         .quad   level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
@@ -452,6 +468,7 @@ NEXT_PAGE(level2_ident_pgt)
          */
         PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
  #endif
+       .fill   KAISER_USER_PGD_FILL,8,0
  
  NEXT_PAGE(level3_kernel_pgt)
         .fill   L3_START_KERNEL,8,0
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c

index 5f70014..dd31f9f 100644 (file)
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -18,6 +18,7 @@
  #include <linux/uaccess.h>
  
  #include <asm/ldt.h>
+#include <asm/kaiser.h>
  #include <asm/desc.h>
  #include <asm/mmu_context.h>
  #include <asm/syscalls.h>
@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
         set_ldt(pc->ldt->entries, pc->ldt->size);
  }
  
+static void __free_ldt_struct(struct ldt_struct *ldt)
+{
+       if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+               vfree(ldt->entries);
+       else
+               free_page((unsigned long)ldt->entries);
+       kfree(ldt);
+}
+
  /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
  static struct ldt_struct *alloc_ldt_struct(int size)
  {
         struct ldt_struct *new_ldt;
         int alloc_size;
+       int ret = 0;
  
         if (size > LDT_ENTRIES)
                 return NULL;
@@ -66,6 +77,14 @@ static struct ldt_struct *alloc_ldt_struct(int size)
                 return NULL;
         }
  
+       // FIXME: make kaiser_add_mapping() return an error code
+       // when it fails
+       kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
+                          __PAGE_KERNEL);
+       if (ret) {
+               __free_ldt_struct(new_ldt);
+               return NULL;
+       }
         new_ldt->size = size;
         return new_ldt;
  }
@@ -92,12 +111,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
         if (likely(!ldt))
                 return;
  
+       kaiser_remove_mapping((unsigned long)ldt->entries,
+                             ldt->size * LDT_ENTRY_SIZE);
         paravirt_free_ldt(ldt->entries, ldt->size);
-       if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
-               vfree(ldt->entries);
-       else
-               free_page((unsigned long)ldt->entries);
-       kfree(ldt);
+       __free_ldt_struct(ldt);
  }
  
  /*
diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c

index 1c113db..2bb5ee4 100644 (file)
--- a/arch/x86/kernel/tracepoint.c
+++ b/arch/x86/kernel/tracepoint.c
@@ -9,10 +9,12 @@
  #include <linux/atomic.h>
  
  atomic_t trace_idt_ctr = ATOMIC_INIT(0);
+__aligned(PAGE_SIZE)
  struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
                                 (unsigned long) trace_idt_table };
  
  /* No need to be aligned, but done to keep all IDTs defined the same way. */
+__aligned(PAGE_SIZE)
  gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
  
  static int trace_irq_vector_refcount;
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c

index cf1bb92..7270a29 100644 (file)
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -1,160 +1,305 @@
-
-
+#include <linux/bug.h>
  #include <linux/kernel.h>
  #include <linux/errno.h>
  #include <linux/string.h>
  #include <linux/types.h>
  #include <linux/bug.h>
  #include <linux/init.h>
+#include <linux/interrupt.h>
  #include <linux/spinlock.h>
  #include <linux/mm.h>
-
  #include <linux/uaccess.h>
+
+#include <asm/kaiser.h>
  #include <asm/pgtable.h>
  #include <asm/pgalloc.h>
  #include <asm/desc.h>
  #ifdef CONFIG_KAISER
  
  __visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+/*
+ * At runtime, the only things we map are some things for CPU
+ * hotplug, and stacks for new processes.  No two CPUs will ever
+ * be populating the same addresses, so we only need to ensure
+ * that we protect between two CPUs trying to allocate and
+ * populate the same page table page.
+ *
+ * Only take this lock when doing a set_p[4um]d(), but it is not
+ * needed for doing a set_pte().  We assume that only the *owner*
+ * of a given allocation will be doing this for _their_
+ * allocation.
+ *
+ * This ensures that once a system has been running for a while
+ * and there have been stacks all over and these page tables
+ * are fully populated, there will be no further acquisitions of
+ * this lock.
+ */
+static DEFINE_SPINLOCK(shadow_table_allocation_lock);
  
-/**
- * Get the real ppn from a address in kernel mapping.
- * @param address The virtual adrress
- * @return the physical address
+/*
+ * Returns -1 on error.
   */
-static inline unsigned long get_pa_from_mapping (unsigned long address)
+static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
  {
         pgd_t *pgd;
         pud_t *pud;
         pmd_t *pmd;
         pte_t *pte;
  
-       pgd = pgd_offset_k(address);
-       BUG_ON(pgd_none(*pgd) || pgd_large(*pgd));
-
-       pud = pud_offset(pgd, address);
-       BUG_ON(pud_none(*pud));
+       pgd = pgd_offset_k(vaddr);
+       /*
+        * We made all the kernel PGDs present in kaiser_init().
+        * We expect them to stay that way.
+        */
+       BUG_ON(pgd_none(*pgd));
+       /*
+        * PGDs are either 512GB or 128TB on all x86_64
+        * configurations.  We don't handle these.
+        */
+       BUG_ON(pgd_large(*pgd));
  
-       if (pud_large(*pud)) {
-               return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK);
+       pud = pud_offset(pgd, vaddr);
+       if (pud_none(*pud)) {
+               WARN_ON_ONCE(1);
+               return -1;
         }
  
-       pmd = pmd_offset(pud, address);
-       BUG_ON(pmd_none(*pmd));
+       if (pud_large(*pud))
+               return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
  
-       if (pmd_large(*pmd)) {
-               return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK);
+       pmd = pmd_offset(pud, vaddr);
+       if (pmd_none(*pmd)) {
+               WARN_ON_ONCE(1);
+               return -1;
         }
  
-       pte = pte_offset_kernel(pmd, address);
-       BUG_ON(pte_none(*pte));
+       if (pmd_large(*pmd))
+               return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
  
-       return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK);
+       pte = pte_offset_kernel(pmd, vaddr);
+       if (pte_none(*pte)) {
+               WARN_ON_ONCE(1);
+               return -1;
+       }
+
+       return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
  }
  
-void _kaiser_copy (unsigned long start_addr, unsigned long size,
-                                       unsigned long flags)
+/*
+ * This is a relatively normal page table walk, except that it
+ * also tries to allocate page tables pages along the way.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic)
  {
-       pgd_t *pgd;
-       pud_t *pud;
         pmd_t *pmd;
-       pte_t *pte;
-       unsigned long address;
-       unsigned long end_addr = start_addr + size;
-       unsigned long target_address;
+       pud_t *pud;
+       pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
+       gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
  
-       for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1));
-                       address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) {
-               target_address = get_pa_from_mapping(address);
+       might_sleep();
+       if (is_atomic) {
+               gfp &= ~GFP_KERNEL;
+               gfp |= __GFP_HIGH | __GFP_ATOMIC;
+       }
  
-               pgd = native_get_shadow_pgd(pgd_offset_k(address));
+       if (pgd_none(*pgd)) {
+               WARN_ONCE(1, "All shadow pgds should have been populated");
+               return NULL;
+       }
+       BUILD_BUG_ON(pgd_large(*pgd) != 0);
  
-               BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n");
-               BUG_ON(pgd_large(*pgd));
+       pud = pud_offset(pgd, address);
+       /* The shadow page tables do not use large mappings: */
+       if (pud_large(*pud)) {
+               WARN_ON(1);
+               return NULL;
+       }
+       if (pud_none(*pud)) {
+               unsigned long new_pmd_page = __get_free_page(gfp);
+               if (!new_pmd_page)
+                       return NULL;
+               spin_lock(&shadow_table_allocation_lock);
+               if (pud_none(*pud))
+                       set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+               else
+                       free_page(new_pmd_page);
+               spin_unlock(&shadow_table_allocation_lock);
+       }
  
-               pud = pud_offset(pgd, address);
-               if (pud_none(*pud)) {
-                       set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address))));
-               }
-               BUG_ON(pud_large(*pud));
+       pmd = pmd_offset(pud, address);
+       /* The shadow page tables do not use large mappings: */
+       if (pmd_large(*pmd)) {
+               WARN_ON(1);
+               return NULL;
+       }
+       if (pmd_none(*pmd)) {
+               unsigned long new_pte_page = __get_free_page(gfp);
+               if (!new_pte_page)
+                       return NULL;
+               spin_lock(&shadow_table_allocation_lock);
+               if (pmd_none(*pmd))
+                       set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+               else
+                       free_page(new_pte_page);
+               spin_unlock(&shadow_table_allocation_lock);
+       }
  
-               pmd = pmd_offset(pud, address);
-               if (pmd_none(*pmd)) {
-                       set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address))));
-               }
-               BUG_ON(pmd_large(*pmd));
+       return pte_offset_kernel(pmd, address);
+}
  
-               pte = pte_offset_kernel(pmd, address);
+int kaiser_add_user_map(const void *__start_addr, unsigned long size,
+                       unsigned long flags)
+{
+       int ret = 0;
+       pte_t *pte;
+       unsigned long start_addr = (unsigned long )__start_addr;
+       unsigned long address = start_addr & PAGE_MASK;
+       unsigned long end_addr = PAGE_ALIGN(start_addr + size);
+       unsigned long target_address;
+
+       for (;address < end_addr; address += PAGE_SIZE) {
+               target_address = get_pa_from_mapping(address);
+               if (target_address == -1) {
+                       ret = -EIO;
+                       break;
+               }
+               pte = kaiser_pagetable_walk(address, false);
                 if (pte_none(*pte)) {
                         set_pte(pte, __pte(flags | target_address));
                 } else {
-                       BUG_ON(__pa(pte_page(*pte)) != target_address);
+                       pte_t tmp;
+                       set_pte(&tmp, __pte(flags | target_address));
+                       WARN_ON_ONCE(!pte_same(*pte, tmp));
                 }
         }
+       return ret;
+}
+
+static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
+{
+       unsigned long size = end - start;
+
+       return kaiser_add_user_map(start, size, flags);
  }
  
-// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping
-static inline void __init _kaiser_init(void)
+/*
+ * Ensure that the top level of the (shadow) page tables are
+ * entirely populated.  This ensures that all processes that get
+ * forked have the same entries.  This way, we do not have to
+ * ever go set up new entries in older processes.
+ *
+ * Note: we never free these, so there are no updates to them
+ * after this.
+ */
+static void __init kaiser_init_all_pgds(void)
  {
         pgd_t *pgd;
         int i = 0;
  
         pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
         for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
-               set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0))));
+               pgd_t new_pgd;
+               pud_t *pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE);
+               if (!pud) {
+                       WARN_ON(1);
+                       break;
+               }
+               new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
+               /*
+                * Make sure not to stomp on some other pgd entry.
+                */
+               if (!pgd_none(pgd[i])) {
+                       WARN_ON(1);
+                       continue;
+               }
+               set_pgd(pgd + i, new_pgd);
         }
  }
  
+#define kaiser_add_user_map_early(start, size, flags) do {     \
+       int __ret = kaiser_add_user_map(start, size, flags);    \
+       WARN_ON(__ret);                                         \
+} while (0)
+
+#define kaiser_add_user_map_ptrs_early(start, end, flags) do {         \
+       int __ret = kaiser_add_user_map_ptrs(start, end, flags);        \
+       WARN_ON(__ret);                                                 \
+} while (0)
+
  extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
-spinlock_t shadow_table_lock;
+/*
+ * If anything in here fails, we will likely die on one of the
+ * first kernel->user transitions and init will die.  But, we
+ * will have most of the kernel up by then and should be able to
+ * get a clean warning out of it.  If we BUG_ON() here, we run
+ * the risk of being before we have good console output.
+ */
  void __init kaiser_init(void)
  {
         int cpu;
-       spin_lock_init(&shadow_table_lock);
-
-       spin_lock(&shadow_table_lock);
  
-       _kaiser_init();
+       kaiser_init_all_pgds();
  
         for_each_possible_cpu(cpu) {
-               // map the per cpu user variables
-               _kaiser_copy(
-                               (unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)),
-                               (unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start,
-                               __PAGE_KERNEL);
+               void *percpu_vaddr = __per_cpu_user_mapped_start +
+                                    per_cpu_offset(cpu);
+               unsigned long percpu_sz = __per_cpu_user_mapped_end -
+                                         __per_cpu_user_mapped_start;
+               kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
+                                         __PAGE_KERNEL);
         }
  
-       // map the entry/exit text section, which is responsible to switch between user- and kernel mode
-       _kaiser_copy(
-                       (unsigned long) __entry_text_start,
-                       (unsigned long) __entry_text_end - (unsigned long) __entry_text_start,
-                       __PAGE_KERNEL_RX);
+       /*
+        * Map the entry/exit text section, which is needed at
+        * switches from user to and from kernel.
+        */
+       kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
+                                      __PAGE_KERNEL_RX);
  
-       // the fixed map address of the idt_table
-       _kaiser_copy(
-                       (unsigned long) idt_descr.address,
-                       sizeof(gate_desc) * NR_VECTORS,
-                       __PAGE_KERNEL_RO);
-
-       spin_unlock(&shadow_table_lock);
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
+       kaiser_add_user_map_ptrs_early(__irqentry_text_start,
+                                      __irqentry_text_end,
+                                      __PAGE_KERNEL_RX);
+#endif
+       kaiser_add_user_map_early((void *)idt_descr.address,
+                                 sizeof(gate_desc) * NR_VECTORS,
+                                 __PAGE_KERNEL_RO);
+#ifdef CONFIG_TRACING
+       kaiser_add_user_map_early(&trace_idt_descr,
+                                 sizeof(trace_idt_descr),
+                                 __PAGE_KERNEL);
+       kaiser_add_user_map_early(&trace_idt_table,
+                                 sizeof(gate_desc) * NR_VECTORS,
+                                 __PAGE_KERNEL);
+#endif
+       kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
+                                 __PAGE_KERNEL);
+       kaiser_add_user_map_early(&debug_idt_table,
+                                 sizeof(gate_desc) * NR_VECTORS,
+                                 __PAGE_KERNEL);
  }
  
+extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end);
  // add a mapping to the shadow-mapping, and synchronize the mappings
-void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
  {
-       spin_lock(&shadow_table_lock);
-       _kaiser_copy(addr, size, flags);
-       spin_unlock(&shadow_table_lock);
+       return kaiser_add_user_map((const void *)addr, size, flags);
  }
  
-extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end);
  void kaiser_remove_mapping(unsigned long start, unsigned long size)
  {
-       pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start));
-       spin_lock(&shadow_table_lock);
-       do {
-               unmap_pud_range(pgd, start, start + size);
-       } while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size)));
-       spin_unlock(&shadow_table_lock);
+       unsigned long end = start + size;
+       unsigned long addr;
+
+       for (addr = start; addr < end; addr += PGDIR_SIZE) {
+               pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr));
+               /*
+                * unmap_p4d_range() handles > P4D_SIZE unmaps,
+                * so no need to trim 'end'.
+                */
+               unmap_pud_range_nofree(pgd, addr, end);
+       }
  }
  #endif /* CONFIG_KAISER */
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c

index c17412f..73dcb0e 100644 (file)
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock);
  #define CPA_FLUSHTLB 1
  #define CPA_ARRAY 2
  #define CPA_PAGES_ARRAY 4
+#define CPA_FREE_PAGETABLES 8
  
  #ifdef CONFIG_PROC_FS
  static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -729,10 +730,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
         return 0;
  }
  
-static bool try_to_free_pte_page(pte_t *pte)
+static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
  {
         int i;
  
+       if (!(cpa->flags & CPA_FREE_PAGETABLES))
+               return false;
+
         for (i = 0; i < PTRS_PER_PTE; i++)
                 if (!pte_none(pte[i]))
                         return false;
@@ -741,10 +745,13 @@ static bool try_to_free_pte_page(pte_t *pte)
         return true;
  }
  
-static bool try_to_free_pmd_page(pmd_t *pmd)
+static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
  {
         int i;
  
+       if (!(cpa->flags & CPA_FREE_PAGETABLES))
+               return false;
+
         for (i = 0; i < PTRS_PER_PMD; i++)
                 if (!pmd_none(pmd[i]))
                         return false;
@@ -753,7 +760,9 @@ static bool try_to_free_pmd_page(pmd_t *pmd)
         return true;
  }
  
-static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
+static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
+                           unsigned long start,
+                           unsigned long end)
  {
         pte_t *pte = pte_offset_kernel(pmd, start);
  
@@ -764,22 +773,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
                 pte++;
         }
  
-       if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
+       if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
                 pmd_clear(pmd);
                 return true;
         }
         return false;
  }
  
-static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
+static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
                               unsigned long start, unsigned long end)
  {
-       if (unmap_pte_range(pmd, start, end))
-               if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+       if (unmap_pte_range(cpa, pmd, start, end))
+               if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
                         pud_clear(pud);
  }
  
-static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
+                           unsigned long start, unsigned long end)
  {
         pmd_t *pmd = pmd_offset(pud, start);
  
@@ -790,7 +800,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
                 unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
                 unsigned long pre_end = min_t(unsigned long, end, next_page);
  
-               __unmap_pmd_range(pud, pmd, start, pre_end);
+               __unmap_pmd_range(cpa, pud, pmd, start, pre_end);
  
                 start = pre_end;
                 pmd++;
@@ -803,7 +813,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
                 if (pmd_large(*pmd))
                         pmd_clear(pmd);
                 else
-                       __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
+                       __unmap_pmd_range(cpa, pud, pmd,
+                                         start, start + PMD_SIZE);
  
                 start += PMD_SIZE;
                 pmd++;
@@ -813,17 +824,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
          * 4K leftovers?
          */
         if (start < end)
-               return __unmap_pmd_range(pud, pmd, start, end);
+               return __unmap_pmd_range(cpa, pud, pmd, start, end);
  
         /*
          * Try again to free the PMD page if haven't succeeded above.
          */
         if (!pud_none(*pud))
-               if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+               if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
                         pud_clear(pud);
  }
  
-void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd,
+                             unsigned long start,
+                             unsigned long end)
  {
         pud_t *pud = pud_offset(pgd, start);
  
@@ -834,7 +847,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
                 unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
                 unsigned long pre_end   = min_t(unsigned long, end, next_page);
  
-               unmap_pmd_range(pud, start, pre_end);
+               unmap_pmd_range(cpa, pud, start, pre_end);
  
                 start = pre_end;
                 pud++;
@@ -848,7 +861,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
                 if (pud_large(*pud))
                         pud_clear(pud);
                 else
-                       unmap_pmd_range(pud, start, start + PUD_SIZE);
+                       unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
  
                 start += PUD_SIZE;
                 pud++;
@@ -858,7 +871,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
          * 2M leftovers?
          */
         if (start < end)
-               unmap_pmd_range(pud, start, end);
+               unmap_pmd_range(cpa, pud, start, end);
  
         /*
          * No need to try to free the PUD page because we'll free it in
@@ -866,6 +879,24 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
          */
  }
  
+static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+{
+       struct cpa_data cpa = {
+               .flags = CPA_FREE_PAGETABLES,
+       };
+
+       __unmap_pud_range(&cpa, pgd, start, end);
+}
+
+void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end)
+{
+       struct cpa_data cpa = {
+               .flags = 0,
+       };
+
+       __unmap_pud_range(&cpa, pgd, start, end);
+}
+
  static int alloc_pte_page(pmd_t *pmd)
  {
         pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c

index 27d218b..352fd01 100644 (file)
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -344,40 +344,26 @@ static inline void _pgd_free(pgd_t *pgd)
                 kmem_cache_free(pgd_cache, pgd);
  }
  #else
-static inline pgd_t *_pgd_alloc(void)
-{
+
  #ifdef CONFIG_KAISER
-       // Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory
-       // block. Therefore, we have to allocate at least 3 pages. However, the
-       // __get_free_pages returns us 4 pages. Hence, we store the base pointer at
-       // the beginning of the page of our 8kb-aligned memory block in order to
-       // correctly free it afterwars.
-
-       unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE));
-
-       if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages)
-       {
-               *((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages;
-               return (pgd_t *) pages;
-       }
-       else
-       {
-               *((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages;
-               return (pgd_t *) (pages + PAGE_SIZE);
-       }
+/*
+ * Instead of one pmd, we aquire two pmds.  Being order-1, it is
+ * both 8k in size and 8k-aligned.  That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER 1
  #else
-       return (pgd_t *)__get_free_page(PGALLOC_GFP);
+#define PGD_ALLOCATION_ORDER 0
  #endif
+
+static inline pgd_t *_pgd_alloc(void)
+{
+       return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
  }
  
  static inline void _pgd_free(pgd_t *pgd)
  {
-#ifdef CONFIG_KAISER
-  unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE));
-       free_pages(pages, get_order(4*PAGE_SIZE));
-#else
-       free_page((unsigned long)pgd);
-#endif
+       free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
  }
  #endif /* CONFIG_X86_PAE */
  
diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h

new file mode 100644 (file)

index 0000000..9db5433
--- /dev/null
+++ b/include/linux/kaiser.h
@@ -0,0 +1,26 @@
+#ifndef _INCLUDE_KAISER_H
+#define _INCLUDE_KAISER_H
+
+#ifdef CONFIG_KAISER
+#include <asm/kaiser.h>
+#else
+
+/*
+ * These stubs are used whenever CONFIG_KAISER is off, which
+ * includes architectures that support KAISER, but have it
+ * disabled.
+ */
+
+static inline void kaiser_init(void)
+{
+}
+static inline void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+}
+static inline int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+{
+       return 0;
+}
+
+#endif /* !CONFIG_KAISER */
+#endif /* _INCLUDE_KAISER_H */
diff --git a/kernel/fork.c b/kernel/fork.c

index 4014be1..0b9688d 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -58,6 +58,7 @@
  #include <linux/tsacct_kern.h>
  #include <linux/cn_proc.h>
  #include <linux/freezer.h>
+#include <linux/kaiser.h>
  #include <linux/delayacct.h>
  #include <linux/taskstats_kern.h>
  #include <linux/random.h>
@@ -472,7 +473,6 @@ void set_task_stack_end_magic(struct task_struct *tsk)
         *stackend = STACK_END_MAGIC;    /* for overflow detection */
  }
  
-extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
  static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
  {
         struct task_struct *tsk;
@@ -500,9 +500,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
          * functions again.
          */
         tsk->stack = stack;
-#ifdef CONFIG_KAISER
-       kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
-#endif
+
+       err= kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
+       if (err)
+               goto free_stack;
  #ifdef CONFIG_VMAP_STACK
         tsk->stack_vm_area = stack_vm_area;
  #endif
diff --git a/security/Kconfig b/security/Kconfig

index f515ac3..334d2e8 100644 (file)
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -32,12 +32,17 @@ config SECURITY
           If you are unsure how to answer this question, answer N.
  config KAISER
         bool "Remove the kernel mapping in user mode"
+       default y
         depends on X86_64
         depends on !PARAVIRT
         help
           This enforces a strict kernel and user space isolation in order to close
           hardware side channels on kernel address information.
  
+config KAISER_REAL_SWITCH
+       bool "KAISER: actually switch page tables"
+       default y
+
  config SECURITYFS
         bool "Enable the securityfs filesystem"
         help
author	Dave Hansen <dave.hansen@linux.intel.com>
	Wed, 30 Aug 2017 23:23:00 +0000 (16:23 -0700)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Fri, 5 Jan 2018 14:46:32 +0000 (15:46 +0100)
arch/x86/entry/entry_64.S		patch \| blob \| history
arch/x86/include/asm/kaiser.h		patch \| blob \| history
arch/x86/include/asm/pgtable.h		patch \| blob \| history
arch/x86/include/asm/pgtable_64.h		patch \| blob \| history
arch/x86/include/asm/pgtable_types.h		patch \| blob \| history
arch/x86/kernel/espfix_64.c		patch \| blob \| history
arch/x86/kernel/head_64.S		patch \| blob \| history
arch/x86/kernel/ldt.c		patch \| blob \| history
arch/x86/kernel/tracepoint.c		patch \| blob \| history
arch/x86/mm/kaiser.c		patch \| blob \| history
arch/x86/mm/pageattr.c		patch \| blob \| history
arch/x86/mm/pgtable.c		patch \| blob \| history
include/linux/kaiser.h	[new file with mode: 0644]	patch \| blob
kernel/fork.c		patch \| blob \| history
security/Kconfig		patch \| blob \| history