Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

[tomoyo/tomoyo-test1.git] / arch / x86 / mm / fault.c
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c

index a51df51..c5437f2 100644 (file)
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -30,6 +30,7 @@
  #include <asm/desc.h>                  /* store_idt(), ...             */
  #include <asm/cpu_entry_area.h>                /* exception stack              */
  #include <asm/pgtable_areas.h>         /* VMALLOC_START, ...           */
+#include <asm/kvm_para.h>              /* kvm_handle_async_pf          */
  
  #define CREATE_TRACE_POINTS
  #include <asm/trace/exceptions.h>
@@ -190,16 +191,13 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
         return pmd_k;
  }
  
-static void vmalloc_sync(void)
+void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
  {
-       unsigned long address;
-
-       if (SHARED_KERNEL_PMD)
-               return;
+       unsigned long addr;
  
-       for (address = VMALLOC_START & PMD_MASK;
-            address >= TASK_SIZE_MAX && address < VMALLOC_END;
-            address += PMD_SIZE) {
+       for (addr = start & PMD_MASK;
+            addr >= TASK_SIZE_MAX && addr < VMALLOC_END;
+            addr += PMD_SIZE) {
                 struct page *page;
  
                 spin_lock(&pgd_lock);
@@ -210,61 +208,13 @@ static void vmalloc_sync(void)
                         pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
  
                         spin_lock(pgt_lock);
-                       vmalloc_sync_one(page_address(page), address);
+                       vmalloc_sync_one(page_address(page), addr);
                         spin_unlock(pgt_lock);
                 }
                 spin_unlock(&pgd_lock);
         }
  }
  
-void vmalloc_sync_mappings(void)
-{
-       vmalloc_sync();
-}
-
-void vmalloc_sync_unmappings(void)
-{
-       vmalloc_sync();
-}
-
-/*
- * 32-bit:
- *
- *   Handle a fault on the vmalloc or module mapping area
- */
-static noinline int vmalloc_fault(unsigned long address)
-{
-       unsigned long pgd_paddr;
-       pmd_t *pmd_k;
-       pte_t *pte_k;
-
-       /* Make sure we are in vmalloc area: */
-       if (!(address >= VMALLOC_START && address < VMALLOC_END))
-               return -1;
-
-       /*
-        * Synchronize this task's top level page-table
-        * with the 'reference' page table.
-        *
-        * Do _not_ use "current" here. We might be inside
-        * an interrupt in the middle of a task switch..
-        */
-       pgd_paddr = read_cr3_pa();
-       pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
-       if (!pmd_k)
-               return -1;
-
-       if (pmd_large(*pmd_k))
-               return 0;
-
-       pte_k = pte_offset_kernel(pmd_k, address);
-       if (!pte_present(*pte_k))
-               return -1;
-
-       return 0;
-}
-NOKPROBE_SYMBOL(vmalloc_fault);
-
  /*
   * Did it hit the DOS screen memory VA from vm86 mode?
   */
@@ -329,96 +279,6 @@ out:
  
  #else /* CONFIG_X86_64: */
  
-void vmalloc_sync_mappings(void)
-{
-       /*
-        * 64-bit mappings might allocate new p4d/pud pages
-        * that need to be propagated to all tasks' PGDs.
-        */
-       sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
-}
-
-void vmalloc_sync_unmappings(void)
-{
-       /*
-        * Unmappings never allocate or free p4d/pud pages.
-        * No work is required here.
-        */
-}
-
-/*
- * 64-bit:
- *
- *   Handle a fault on the vmalloc area
- */
-static noinline int vmalloc_fault(unsigned long address)
-{
-       pgd_t *pgd, *pgd_k;
-       p4d_t *p4d, *p4d_k;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-
-       /* Make sure we are in vmalloc area: */
-       if (!(address >= VMALLOC_START && address < VMALLOC_END))
-               return -1;
-
-       /*
-        * Copy kernel mappings over when needed. This can also
-        * happen within a race in page table update. In the later
-        * case just flush:
-        */
-       pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
-       pgd_k = pgd_offset_k(address);
-       if (pgd_none(*pgd_k))
-               return -1;
-
-       if (pgtable_l5_enabled()) {
-               if (pgd_none(*pgd)) {
-                       set_pgd(pgd, *pgd_k);
-                       arch_flush_lazy_mmu_mode();
-               } else {
-                       BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k));
-               }
-       }
-
-       /* With 4-level paging, copying happens on the p4d level. */
-       p4d = p4d_offset(pgd, address);
-       p4d_k = p4d_offset(pgd_k, address);
-       if (p4d_none(*p4d_k))
-               return -1;
-
-       if (p4d_none(*p4d) && !pgtable_l5_enabled()) {
-               set_p4d(p4d, *p4d_k);
-               arch_flush_lazy_mmu_mode();
-       } else {
-               BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k));
-       }
-
-       BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4);
-
-       pud = pud_offset(p4d, address);
-       if (pud_none(*pud))
-               return -1;
-
-       if (pud_large(*pud))
-               return 0;
-
-       pmd = pmd_offset(pud, address);
-       if (pmd_none(*pmd))
-               return -1;
-
-       if (pmd_large(*pmd))
-               return 0;
-
-       pte = pte_offset_kernel(pmd, address);
-       if (!pte_present(*pte))
-               return -1;
-
-       return 0;
-}
-NOKPROBE_SYMBOL(vmalloc_fault);
-
  #ifdef CONFIG_CPU_SUP_AMD
  static const char errata93_warning[] =
  KERN_ERR 
@@ -1257,29 +1117,6 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
          */
         WARN_ON_ONCE(hw_error_code & X86_PF_PK);
  
-       /*
-        * We can fault-in kernel-space virtual memory on-demand. The
-        * 'reference' page table is init_mm.pgd.
-        *
-        * NOTE! We MUST NOT take any locks for this case. We may
-        * be in an interrupt or a critical region, and should
-        * only copy the information from the master page table,
-        * nothing more.
-        *
-        * Before doing this on-demand faulting, ensure that the
-        * fault is not any of the following:
-        * 1. A fault on a PTE with a reserved bit set.
-        * 2. A fault caused by a user-mode access.  (Do not demand-
-        *    fault kernel memory due to user-mode accesses).
-        * 3. A fault caused by a page-level protection violation.
-        *    (A demand fault would be on a non-present page which
-        *     would have X86_PF_PROT==0).
-        */
-       if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
-               if (vmalloc_fault(address) >= 0)
-                       return;
-       }
-
         /* Was the fault spurious, caused by lazy TLB invalidation? */
         if (spurious_kernel_fault(hw_error_code, address))
                 return;
@@ -1523,6 +1360,24 @@ do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
                 unsigned long address)
  {
         prefetchw(&current->mm->mmap_sem);
+       /*
+        * KVM has two types of events that are, logically, interrupts, but
+        * are unfortunately delivered using the #PF vector.  These events are
+        * "you just accessed valid memory, but the host doesn't have it right
+        * now, so I'll put you to sleep if you continue" and "that memory
+        * you tried to access earlier is available now."
+        *
+        * We are relying on the interrupted context being sane (valid RSP,
+        * relevant locks not held, etc.), which is fine as long as the
+        * interrupted context had IF=1.  We are also relying on the KVM
+        * async pf type field and CR2 being read consistently instead of
+        * getting values from real and async page faults mixed up.
+        *
+        * Fingers crossed.
+        */
+       if (kvm_handle_async_pf(regs, (u32)address))
+               return;
+
         trace_page_fault_entries(regs, hw_error_code, address);
  
         if (unlikely(kmmio_fault(regs, address)))