Merge tag 'kvm-s390-next-4.20-2' of git://git.kernel.org/pub/scm/linux/kernel/git...
author    Paolo Bonzini <pbonzini@redhat.com>
          Sat, 13 Oct 2018 10:00:26 +0000 (12:00 +0200)
committer Paolo Bonzini <pbonzini@redhat.com>
          Sat, 13 Oct 2018 10:00:26 +0000 (12:00 +0200)
KVM: s390/vfio-ap: Fixes and enhancements for vfio-ap

- add tracing
- fix a locking bug
- make local functions and data static

238 files changed:
Documentation/devicetree/bindings/net/macb.txt
Documentation/virtual/kvm/api.txt
MAINTAINERS
Makefile
arch/arm/boot/dts/sama5d3_emac.dtsi
arch/powerpc/include/asm/asm-prototypes.h
arch/powerpc/include/asm/book3s/64/mmu-hash.h
arch/powerpc/include/asm/book3s/64/pgtable.h
arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
arch/powerpc/include/asm/hvcall.h
arch/powerpc/include/asm/iommu.h
arch/powerpc/include/asm/kvm_asm.h
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_book3s_64.h
arch/powerpc/include/asm/kvm_book3s_asm.h
arch/powerpc/include/asm/kvm_booke.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/include/asm/mmu_context.h
arch/powerpc/include/asm/ppc-opcode.h
arch/powerpc/include/asm/reg.h
arch/powerpc/include/uapi/asm/kvm.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/cpu_setup_power.S
arch/powerpc/kernel/iommu.c
arch/powerpc/kvm/Makefile
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_64_mmu_radix.c
arch/powerpc/kvm/book3s_64_vio.c
arch/powerpc/kvm/book3s_64_vio_hv.c
arch/powerpc/kvm/book3s_emulate.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_builtin.c
arch/powerpc/kvm/book3s_hv_interrupts.S
arch/powerpc/kvm/book3s_hv_nested.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_hv_ras.c
arch/powerpc/kvm/book3s_hv_rm_xics.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/book3s_hv_tm.c
arch/powerpc/kvm/book3s_hv_tm_builtin.c
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/kvm/book3s_xics.c
arch/powerpc/kvm/book3s_xive.c
arch/powerpc/kvm/book3s_xive_template.c
arch/powerpc/kvm/bookehv_interrupts.S
arch/powerpc/kvm/emulate_loadstore.c
arch/powerpc/kvm/powerpc.c
arch/powerpc/kvm/tm.S
arch/powerpc/kvm/trace_book3s.h
arch/powerpc/mm/init_64.c
arch/powerpc/mm/mmu_context_iommu.c
arch/powerpc/mm/tlb-radix.c
arch/s390/kvm/kvm-s390.c
arch/s390/mm/gmap.c
arch/x86/crypto/aegis128-aesni-glue.c
arch/x86/crypto/aegis128l-aesni-glue.c
arch/x86/crypto/aegis256-aesni-glue.c
arch/x86/crypto/morus1280-sse2-glue.c
arch/x86/crypto/morus640-sse2-glue.c
arch/x86/hyperv/hv_apic.c
arch/x86/include/asm/hyperv-tlfs.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/uapi/asm/kvm.h
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.c
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
drivers/ata/libata-core.c
drivers/block/floppy.c
drivers/bluetooth/hci_ldisc.c
drivers/clk/x86/clk-pmc-atom.c
drivers/crypto/ccp/psp-dev.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
drivers/gpu/drm/amd/amdgpu/amdgpu_cgs.c
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
drivers/gpu/drm/amd/amdkfd/kfd_device.c
drivers/gpu/drm/amd/amdkfd/kfd_iommu.c
drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
drivers/gpu/drm/amd/amdkfd/kfd_topology.c
drivers/gpu/drm/amd/include/kgd_kfd_interface.h
drivers/gpu/drm/drm_atomic.c
drivers/gpu/drm/drm_debugfs.c
drivers/gpu/drm/drm_fb_helper.c
drivers/gpu/drm/i915/gvt/handlers.c
drivers/gpu/drm/i915/gvt/kvmgt.c
drivers/gpu/drm/i915/gvt/mmio.c
drivers/gpu/drm/i915/gvt/vgpu.c
drivers/gpu/drm/pl111/pl111_vexpress.c
drivers/gpu/drm/sun4i/sun4i_drv.c
drivers/gpu/drm/sun4i/sun8i_hdmi_phy.c
drivers/gpu/drm/sun4i/sun8i_mixer.c
drivers/gpu/drm/sun4i/sun8i_tcon_top.c
drivers/gpu/drm/udl/udl_fb.c
drivers/gpu/drm/vc4/vc4_plane.c
drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
drivers/gpu/drm/vmwgfx/vmwgfx_stdu.c
drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
drivers/gpu/vga/vga_switcheroo.c
drivers/hwmon/nct6775.c
drivers/mtd/devices/m25p80.c
drivers/mtd/mtdpart.c
drivers/mtd/nand/raw/denali.c
drivers/mtd/nand/raw/marvell_nand.c
drivers/net/appletalk/ipddp.c
drivers/net/dsa/mv88e6xxx/global1.h
drivers/net/dsa/mv88e6xxx/global1_atu.c
drivers/net/ethernet/broadcom/bnxt/bnxt.c
drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h
drivers/net/ethernet/cadence/macb_main.c
drivers/net/ethernet/hp/hp100.c
drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
drivers/net/ethernet/microchip/lan743x_main.c
drivers/net/ethernet/realtek/r8169.c
drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
drivers/net/ethernet/ti/Kconfig
drivers/net/hyperv/netvsc.c
drivers/net/hyperv/netvsc_drv.c
drivers/net/ppp/pppoe.c
drivers/net/usb/qmi_wwan.c
drivers/net/veth.c
drivers/net/xen-netfront.c
drivers/nvme/target/admin-cmd.c
drivers/pci/controller/pci-hyperv.c
drivers/platform/x86/alienware-wmi.c
drivers/platform/x86/dell-smbios-wmi.c
drivers/scsi/qla2xxx/qla_target.h
drivers/spi/spi-fsl-dspi.c
drivers/spi/spi.c
drivers/target/iscsi/iscsi_target.c
fs/ext4/dir.c
fs/ext4/ext4.h
fs/ext4/inline.c
fs/ext4/inode.c
fs/ext4/mmp.c
fs/ext4/namei.c
fs/ext4/resize.c
fs/ext4/super.c
fs/ocfs2/buffer_head_io.c
fs/proc/kcore.c
fs/ubifs/super.c
fs/ubifs/xattr.c
include/drm/drm_drv.h
include/linux/compiler-gcc.h
include/linux/compiler_types.h
include/linux/kvm_host.h
include/linux/vga_switcheroo.h
include/net/tls.h
include/sound/hdaudio.h
include/sound/soc-dapm.h
include/uapi/linux/kvm.h
include/uapi/sound/skl-tplg-interface.h
kernel/bpf/btf.c
kernel/bpf/verifier.c
kernel/pid.c
kernel/sys.c
kernel/trace/ring_buffer.c
mm/Kconfig
mm/shmem.c
mm/vmscan.c
net/bluetooth/smp.c
net/core/filter.c
net/core/neighbour.c
net/core/rtnetlink.c
net/ipv4/af_inet.c
net/ipv4/udp.c
net/ipv6/ip6_offload.c
net/ipv6/ip6_output.c
net/ipv6/route.c
net/ipv6/udp.c
net/sched/act_sample.c
net/sched/cls_api.c
net/socket.c
net/tls/tls_device.c
net/tls/tls_device_fallback.c
net/tls/tls_main.c
net/tls/tls_sw.c
scripts/subarch.include [new file with mode: 0644]
sound/firewire/bebob/bebob.c
sound/firewire/bebob/bebob_maudio.c
sound/firewire/digi00x/digi00x.c
sound/firewire/fireface/ff-protocol-ff400.c
sound/firewire/fireworks/fireworks.c
sound/firewire/oxfw/oxfw.c
sound/firewire/tascam/tascam.c
sound/hda/hdac_controller.c
sound/pci/emu10k1/emufx.c
sound/pci/hda/hda_intel.c
sound/pci/hda/hda_intel.h
sound/soc/amd/acp-pcm-dma.c
sound/soc/codecs/cs4265.c
sound/soc/codecs/max98373.c
sound/soc/codecs/rt5514.c
sound/soc/codecs/rt5682.c
sound/soc/codecs/sigmadsp.c
sound/soc/codecs/tas6424.c
sound/soc/codecs/wm8804-i2c.c
sound/soc/codecs/wm9712.c
sound/soc/intel/boards/bytcr_rt5640.c
sound/soc/intel/skylake/skl.c
sound/soc/qcom/qdsp6/q6routing.c
sound/soc/sh/rcar/adg.c
sound/soc/sh/rcar/core.c
sound/soc/sh/rcar/dma.c
sound/soc/sh/rcar/rsnd.h
sound/soc/sh/rcar/ssi.c
sound/soc/soc-core.c
sound/soc/soc-dapm.c
tools/kvm/kvm_stat/kvm_stat
tools/perf/arch/powerpc/util/book3s_hv_exits.h
tools/testing/selftests/android/Makefile
tools/testing/selftests/android/config [moved from tools/testing/selftests/android/ion/config with 100% similarity]
tools/testing/selftests/android/ion/Makefile
tools/testing/selftests/cgroup/.gitignore
tools/testing/selftests/cgroup/cgroup_util.c
tools/testing/selftests/cgroup/cgroup_util.h
tools/testing/selftests/cgroup/test_memcontrol.c
tools/testing/selftests/efivarfs/config [new file with mode: 0644]
tools/testing/selftests/futex/functional/Makefile
tools/testing/selftests/gpio/Makefile
tools/testing/selftests/kselftest.h
tools/testing/selftests/kvm/.gitignore
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/include/kvm_util.h
tools/testing/selftests/kvm/lib/kvm_util.c
tools/testing/selftests/kvm/platform_info_test.c [new file with mode: 0644]
tools/testing/selftests/lib.mk
tools/testing/selftests/memory-hotplug/config
tools/testing/selftests/net/Makefile
tools/testing/selftests/net/tls.c
tools/testing/selftests/networking/timestamping/Makefile
tools/testing/selftests/vm/Makefile

index 457d5ae..3e17ac1 100644
--- a/Documentation/devicetree/bindings/net/macb.txt
+++ b/Documentation/devicetree/bindings/net/macb.txt
@@ -10,6 +10,7 @@ Required properties:
   Use "cdns,pc302-gem" for Picochip picoXcell pc302 and later devices based on
   the Cadence GEM, or the generic form: "cdns,gem".
   Use "atmel,sama5d2-gem" for the GEM IP (10/100) available on Atmel sama5d2 SoCs.
+  Use "atmel,sama5d3-macb" for the 10/100Mbit IP available on Atmel sama5d3 SoCs.
   Use "atmel,sama5d3-gem" for the Gigabit IP available on Atmel sama5d3 SoCs.
   Use "atmel,sama5d4-gem" for the GEM IP (10/100) available on Atmel sama5d4 SoCs.
   Use "cdns,zynq-gem" Xilinx Zynq-7xxx SoC.
index c664064..df98b63 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1922,6 +1922,7 @@ registers, find a list below:
   PPC   | KVM_REG_PPC_TIDR              | 64
   PPC   | KVM_REG_PPC_PSSCR             | 64
   PPC   | KVM_REG_PPC_DEC_EXPIRY        | 64
+  PPC   | KVM_REG_PPC_PTCR              | 64
   PPC   | KVM_REG_PPC_TM_GPR0           | 64
           ...
   PPC   | KVM_REG_PPC_TM_GPR31          | 64
@@ -2269,6 +2270,10 @@ The supported flags are:
         The emulated MMU supports 1T segments in addition to the
         standard 256M ones.
 
+    - KVM_PPC_NO_HASH
+       This flag indicates that HPT guests are not supported by KVM,
+       thus all guests must use radix MMU mode.
+
 The "slb_size" field indicates how many SLB entries are supported
 
 The "sps" array contains 8 entries indicating the supported base
@@ -4510,7 +4515,8 @@ Do not enable KVM_FEATURE_PV_UNHALT if you disable HLT exits.
 Architectures: s390
 Parameters: none
 Returns: 0 on success, -EINVAL if hpage module parameter was not set
-        or cmma is enabled
+        or cmma is enabled, or the VM has the KVM_VM_S390_UCONTROL
+        flag set
 
 With this capability the KVM support for memory backing with 1m pages
 through hugetlbfs can be enabled for a VM. After the capability is
@@ -4521,6 +4527,29 @@ hpage module parameter is not set to 1, -EINVAL is returned.
 While it is generally possible to create a huge page backed VM without
 this capability, the VM will not be able to run.
 
+7.14 KVM_CAP_MSR_PLATFORM_INFO
+
+Architectures: x86
+Parameters: args[0] whether feature should be enabled or not
+
+With this capability, a guest may read the MSR_PLATFORM_INFO MSR. Otherwise,
+a #GP would be raised when the guest tries to access. Currently, this
+capability does not enable write permissions of this MSR for the guest.
+
+7.16 KVM_CAP_PPC_NESTED_HV
+
+Architectures: ppc
+Parameters: none
+Returns: 0 on success, -EINVAL when the implementation doesn't support
+        nested-HV virtualization.
+
+HV-KVM on POWER9 and later systems allows for "nested-HV"
+virtualization, which provides a way for a guest VM to run guests that
+can run using the CPU's supervisor mode (privileged non-hypervisor
+state).  Enabling this capability on a VM depends on the CPU having
+the necessary functionality and on the facility being enabled with a
+kvm-hv module parameter.
+
 8. Other capabilities.
 ----------------------
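
For illustration, here is a minimal userspace sketch of enabling one of the
capabilities documented above via the VM-scoped KVM_ENABLE_CAP ioctl (the
vm_fd variable and helper name are assumptions for the example, not part of
the patch):

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Let the guest read MSR_PLATFORM_INFO: args[0] = 1 enables the
     * behaviour described in section 7.14 above. */
    static int enable_platform_info(int vm_fd)
    {
            struct kvm_enable_cap cap = {
                    .cap = KVM_CAP_MSR_PLATFORM_INFO,
                    .args[0] = 1,
            };

            return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);  /* 0 on success */
    }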
 
index a2e401f..1610fb2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13461,9 +13461,8 @@ F:      drivers/i2c/busses/i2c-synquacer.c
 F:     Documentation/devicetree/bindings/i2c/i2c-synquacer.txt
 
 SOCIONEXT UNIPHIER SOUND DRIVER
-M:     Katsuhiro Suzuki <suzuki.katsuhiro@socionext.com>
 L:     alsa-devel@alsa-project.org (moderated for non-subscribers)
-S:     Maintained
+S:     Orphan
 F:     sound/soc/uniphier/
 
 SOEKRIS NET48XX LED SUPPORT
index 83a03fa..f03a1e0 100644
--- a/Makefile
+++ b/Makefile
@@ -299,19 +299,7 @@ KERNELRELEASE = $(shell cat include/config/kernel.release 2> /dev/null)
 KERNELVERSION = $(VERSION)$(if $(PATCHLEVEL),.$(PATCHLEVEL)$(if $(SUBLEVEL),.$(SUBLEVEL)))$(EXTRAVERSION)
 export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION
 
-# SUBARCH tells the usermode build what the underlying arch is.  That is set
-# first, and if a usermode build is happening, the "ARCH=um" on the command
-# line overrides the setting of ARCH below.  If a native build is happening,
-# then ARCH is assigned, getting whatever value it gets normally, and
-# SUBARCH is subsequently ignored.
-
-SUBARCH := $(shell uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/ \
-                                 -e s/sun4u/sparc64/ \
-                                 -e s/arm.*/arm/ -e s/sa110/arm/ \
-                                 -e s/s390x/s390/ -e s/parisc64/parisc/ \
-                                 -e s/ppc.*/powerpc/ -e s/mips.*/mips/ \
-                                 -e s/sh[234].*/sh/ -e s/aarch64.*/arm64/ \
-                                 -e s/riscv.*/riscv/)
+include scripts/subarch.include
 
 # Cross compiling and selecting different set of gcc/bin-utils
 # ---------------------------------------------------------------------------
index 7cb235e..6e9e1c2 100644
--- a/arch/arm/boot/dts/sama5d3_emac.dtsi
+++ b/arch/arm/boot/dts/sama5d3_emac.dtsi
@@ -41,7 +41,7 @@
                        };
 
                        macb1: ethernet@f802c000 {
-                               compatible = "cdns,at91sam9260-macb", "cdns,macb";
+                               compatible = "atmel,sama5d3-macb", "cdns,at91sam9260-macb", "cdns,macb";
                                reg = <0xf802c000 0x100>;
                                interrupts = <35 IRQ_TYPE_LEVEL_HIGH 3>;
                                pinctrl-names = "default";
index 1f4691c..c55ba3b 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -150,4 +150,25 @@ extern s32 patch__memset_nocache, patch__memcpy_nocache;
 
 extern long flush_count_cache;
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv);
+void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv);
+#else
+static inline void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr,
+                                    bool preserve_nv) { }
+static inline void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr,
+                                       bool preserve_nv) { }
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
+void kvmhv_save_host_pmu(void);
+void kvmhv_load_host_pmu(void);
+void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use);
+void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu);
+
+int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu);
+
+long kvmppc_h_set_dabr(struct kvm_vcpu *vcpu, unsigned long dabr);
+long kvmppc_h_set_xdabr(struct kvm_vcpu *vcpu, unsigned long dabr,
+                       unsigned long dabrx);
+
 #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
index b3520b5..66db23e 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -203,6 +203,18 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
        BUG();
 }
 
+static inline unsigned int ap_to_shift(unsigned long ap)
+{
+       int psize;
+
+       for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
+               if (mmu_psize_defs[psize].ap == ap)
+                       return mmu_psize_defs[psize].shift;
+       }
+
+       return -1;
+}
+
 static inline unsigned long get_sllp_encoding(int psize)
 {
        unsigned long sllp;
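
As a usage sketch for the new ap_to_shift() helper above: it inverts the
mmu_psize_defs[].ap encoding, so given the 3-bit AP field of a TLBIE RB
value (placed at PPC_BITLSHIFT(58), matching the encoding used by
kvmppc_radix_tlbie_page() elsewhere in this merge) one can recover the
page-size shift. The helper name and the field extraction are illustrative
assumptions, not kernel code:

    /* Hypothetical: recover the page-size shift from a TLBIE RB value. */
    static inline unsigned int rb_to_shift(unsigned long rb)
    {
            unsigned long ap = (rb >> PPC_BITLSHIFT(58)) & 0x7;

            return ap_to_shift(ap);         /* e.g. 16 for a 64k page */
    }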
index 13a688f..2fdc865 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1051,7 +1051,6 @@ static inline void vmemmap_remove_mapping(unsigned long start,
        return hash__vmemmap_remove_mapping(start, page_size);
 }
 #endif
-struct page *realmode_pfn_to_page(unsigned long pfn);
 
 static inline pte_t pmd_pte(pmd_t pmd)
 {
index 1154a6d..671316f 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -53,6 +53,7 @@ extern void radix__flush_tlb_lpid_page(unsigned int lpid,
                                        unsigned long addr,
                                        unsigned long page_size);
 extern void radix__flush_pwc_lpid(unsigned int lpid);
+extern void radix__flush_tlb_lpid(unsigned int lpid);
 extern void radix__local_flush_tlb_lpid(unsigned int lpid);
 extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid);
 
index a0b17f9..45e8789 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
 #define H_GET_24X7_DATA                0xF07C
 #define H_GET_PERF_COUNTER_INFO        0xF080
 
+/* Platform-specific hcalls used for nested HV KVM */
+#define H_SET_PARTITION_TABLE  0xF800
+#define H_ENTER_NESTED         0xF804
+#define H_TLB_INVALIDATE       0xF808
+
 /* Values for 2nd argument to H_SET_MODE */
 #define H_SET_MODE_RESOURCE_SET_CIABR          1
 #define H_SET_MODE_RESOURCE_SET_DAWR           2
@@ -461,6 +466,42 @@ struct h_cpu_char_result {
        u64 behaviour;
 };
 
+/* Register state for entering a nested guest with H_ENTER_NESTED */
+struct hv_guest_state {
+       u64 version;            /* version of this structure layout */
+       u32 lpid;
+       u32 vcpu_token;
+       /* These registers are hypervisor privileged (at least for writing) */
+       u64 lpcr;
+       u64 pcr;
+       u64 amor;
+       u64 dpdes;
+       u64 hfscr;
+       s64 tb_offset;
+       u64 dawr0;
+       u64 dawrx0;
+       u64 ciabr;
+       u64 hdec_expiry;
+       u64 purr;
+       u64 spurr;
+       u64 ic;
+       u64 vtb;
+       u64 hdar;
+       u64 hdsisr;
+       u64 heir;
+       u64 asdr;
+       /* These are OS privileged but need to be set late in guest entry */
+       u64 srr0;
+       u64 srr1;
+       u64 sprg[4];
+       u64 pidr;
+       u64 cfar;
+       u64 ppr;
+};
+
+/* Latest version of hv_guest_state structure */
+#define HV_GUEST_STATE_VERSION 1
+
 #endif /* __ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_HVCALL_H */
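
To make the H_ENTER_NESTED convention concrete, here is a sketch of the L1
(guest hypervisor) side, closely following the kvm-hv entry path added
elsewhere in this merge: the hcall takes the guest-real addresses of a
struct hv_guest_state and of a register block, and returns the trap number
that caused the exit back to L1. Treat the exact sequence as illustrative:

    struct hv_guest_state hvregs;
    long trap;

    kvmhv_save_hv_regs(vcpu, &hvregs);      /* fill HV-privileged fields */
    hvregs.version = HV_GUEST_STATE_VERSION;
    hvregs.lpid = vcpu->kvm->arch.lpid;     /* lpid L1 assigned to this guest */
    hvregs.vcpu_token = vcpu->vcpu_id;

    trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
                              __pa(&vcpu->arch.regs));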
index ab3a4fb..3d4b88c 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -220,8 +220,6 @@ extern void iommu_del_device(struct device *dev);
 extern int __init tce_iommu_bus_notifier_init(void);
 extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
                unsigned long *hpa, enum dma_data_direction *direction);
-extern long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry,
-               unsigned long *hpa, enum dma_data_direction *direction);
 #else
 static inline void iommu_register_group(struct iommu_table_group *table_group,
                                        int pci_domain_number,
index a790d5c..1f32191 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -84,7 +84,6 @@
 #define BOOK3S_INTERRUPT_INST_STORAGE  0x400
 #define BOOK3S_INTERRUPT_INST_SEGMENT  0x480
 #define BOOK3S_INTERRUPT_EXTERNAL      0x500
-#define BOOK3S_INTERRUPT_EXTERNAL_LEVEL        0x501
 #define BOOK3S_INTERRUPT_EXTERNAL_HV   0x502
 #define BOOK3S_INTERRUPT_ALIGNMENT     0x600
 #define BOOK3S_INTERRUPT_PROGRAM       0x700
 #define BOOK3S_IRQPRIO_EXTERNAL                        14
 #define BOOK3S_IRQPRIO_DECREMENTER             15
 #define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR     16
-#define BOOK3S_IRQPRIO_EXTERNAL_LEVEL          17
-#define BOOK3S_IRQPRIO_MAX                     18
+#define BOOK3S_IRQPRIO_MAX                     17
 
 #define BOOK3S_HFLAG_DCBZ32                    0x1
 #define BOOK3S_HFLAG_SLB                       0x2
index 83a9aa3..09f8e9b 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -188,14 +188,37 @@ extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc);
 extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run,
                        struct kvm_vcpu *vcpu,
                        unsigned long ea, unsigned long dsisr);
+extern int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
+                                     struct kvmppc_pte *gpte, u64 root,
+                                     u64 *pte_ret_p);
+extern int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
+                       struct kvmppc_pte *gpte, u64 table,
+                       int table_index, u64 *pte_ret_p);
 extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
                        struct kvmppc_pte *gpte, bool data, bool iswrite);
+extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
+                       unsigned int shift, struct kvm_memory_slot *memslot,
+                       unsigned int lpid);
+extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
+                                   bool writing, unsigned long gpa,
+                                   unsigned int lpid);
+extern int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
+                               unsigned long gpa,
+                               struct kvm_memory_slot *memslot,
+                               bool writing, bool kvm_ro,
+                               pte_t *inserted_pte, unsigned int *levelp);
 extern int kvmppc_init_vm_radix(struct kvm *kvm);
 extern void kvmppc_free_radix(struct kvm *kvm);
+extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd,
+                                     unsigned int lpid);
 extern int kvmppc_radix_init(void);
 extern void kvmppc_radix_exit(void);
 extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
                        unsigned long gfn);
+extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
+                            unsigned long gpa, unsigned int shift,
+                            struct kvm_memory_slot *memslot,
+                            unsigned int lpid);
 extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
                        unsigned long gfn);
 extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
@@ -271,6 +294,21 @@ static inline void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu) {}
 static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) {}
 #endif
 
+long kvmhv_nested_init(void);
+void kvmhv_nested_exit(void);
+void kvmhv_vm_nested_init(struct kvm *kvm);
+long kvmhv_set_partition_table(struct kvm_vcpu *vcpu);
+void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
+void kvmhv_release_all_nested(struct kvm *kvm);
+long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
+long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu);
+int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu,
+                         u64 time_limit, unsigned long lpcr);
+void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr);
+void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
+                                  struct hv_guest_state *hr);
+long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu);
+
 void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
 
 extern int kvm_irq_bypass;
@@ -301,12 +339,12 @@ static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
 
 static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
 {
-       vcpu->arch.cr = val;
+       vcpu->arch.regs.ccr = val;
 }
 
 static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
 {
-       return vcpu->arch.cr;
+       return vcpu->arch.regs.ccr;
 }
 
 static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
@@ -384,9 +422,6 @@ extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu);
 /* TO = 31 for unconditional trap */
 #define INS_TW                         0x7fe00008
 
-/* LPIDs we support with this build -- runtime limit may be lower */
-#define KVMPPC_NR_LPIDS                        (LPID_RSVD + 1)
-
 #define SPLIT_HACK_MASK                        0xff000000
 #define SPLIT_HACK_OFFS                        0xfb000000
 
index dc435a5..6d29814 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
 #include <linux/string.h>
 #include <asm/bitops.h>
 #include <asm/book3s/64/mmu-hash.h>
+#include <asm/cpu_has_feature.h>
+#include <asm/ppc-opcode.h>
+
+#ifdef CONFIG_PPC_PSERIES
+static inline bool kvmhv_on_pseries(void)
+{
+       return !cpu_has_feature(CPU_FTR_HVMODE);
+}
+#else
+static inline bool kvmhv_on_pseries(void)
+{
+       return false;
+}
+#endif
+
+/*
+ * Structure for a nested guest, that is, for a guest that is managed by
+ * one of our guests.
+ */
+struct kvm_nested_guest {
+       struct kvm *l1_host;            /* L1 VM that owns this nested guest */
+       int l1_lpid;                    /* lpid L1 guest thinks this guest is */
+       int shadow_lpid;                /* real lpid of this nested guest */
+       pgd_t *shadow_pgtable;          /* our page table for this guest */
+       u64 l1_gr_to_hr;                /* L1's addr of part'n-scoped table */
+       u64 process_table;              /* process table entry for this guest */
+       long refcnt;                    /* number of pointers to this struct */
+       struct mutex tlb_lock;          /* serialize page faults and tlbies */
+       struct kvm_nested_guest *next;
+       cpumask_t need_tlb_flush;
+       cpumask_t cpu_in_guest;
+       short prev_cpu[NR_CPUS];
+};
+
+/*
+ * We define a nested rmap entry as a single 64-bit quantity
+ * 0xFFF0000000000000  12-bit lpid field
+ * 0x000FFFFFFFFFF000  40-bit guest 4k page frame number
+ * 0x0000000000000001  1-bit  single entry flag
+ */
+#define RMAP_NESTED_LPID_MASK          0xFFF0000000000000UL
+#define RMAP_NESTED_LPID_SHIFT         (52)
+#define RMAP_NESTED_GPA_MASK           0x000FFFFFFFFFF000UL
+#define RMAP_NESTED_IS_SINGLE_ENTRY    0x0000000000000001UL
+
+/* Structure for a nested guest rmap entry */
+struct rmap_nested {
+       struct llist_node list;
+       u64 rmap;
+};
+
+/*
+ * for_each_nest_rmap_safe - iterate over the list of nested rmap entries
+ *                          safe against removal of the list entry or NULL list
+ * @pos:       a (struct rmap_nested *) to use as a loop cursor
+ * @node:      pointer to the first entry
+ *             NOTE: this can be NULL
+ * @rmapp:     an (unsigned long *) in which to return the rmap entries on each
+ *             iteration
+ *             NOTE: this must point to already allocated memory
+ *
+ * The nested_rmap is a llist of (struct rmap_nested) entries pointed to by the
+ * rmap entry in the memslot. The list is always terminated by a "single entry"
+ * stored in the list element of the final entry of the llist. If there is ONLY
+ * a single entry then this is itself in the rmap entry of the memslot, not a
+ * llist head pointer.
+ *
+ * Note that the iterator below assumes that a nested rmap entry is always
+ * non-zero.  This is true for our usage because the LPID field is always
+ * non-zero (zero is reserved for the host).
+ *
+ * This should be used to iterate over the list of rmap_nested entries with
+ * processing done on the u64 rmap value given by each iteration. This is safe
+ * against removal of list entries and it is always safe to call free on (pos).
+ *
+ * e.g.
+ * struct rmap_nested *cursor;
+ * struct llist_node *first;
+ * unsigned long rmap;
+ * for_each_nest_rmap_safe(cursor, first, &rmap) {
+ *     do_something(rmap);
+ *     free(cursor);
+ * }
+ */
+#define for_each_nest_rmap_safe(pos, node, rmapp)                             \
+       for ((pos) = llist_entry((node), typeof(*(pos)), list);                \
+            (node) &&                                                         \
+            (*(rmapp) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ?     \
+                         ((u64) (node)) : ((pos)->rmap))) &&                  \
+            (((node) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ?      \
+                        ((struct llist_node *) ((pos) = NULL)) :              \
+                        (pos)->list.next)), true);                            \
+            (pos) = llist_entry((node), typeof(*(pos)), list))
+
+struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
+                                         bool create);
+void kvmhv_put_nested(struct kvm_nested_guest *gp);
+int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid);
+
+/* Encoding of first parameter for H_TLB_INVALIDATE */
+#define H_TLBIE_P1_ENC(ric, prs, r)    (___PPC_RIC(ric) | ___PPC_PRS(prs) | \
+                                        ___PPC_R(r))
 
 /* Power architecture requires HPT is at least 256kiB, at most 64TiB */
 #define PPC_MIN_HPT_ORDER      18
@@ -435,6 +537,7 @@ static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
 }
 
 extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
+extern void kvmhv_radix_debugfs_init(struct kvm *kvm);
 
 extern void kvmhv_rm_send_ipi(int cpu);
 
@@ -482,7 +585,7 @@ static inline u64 sanitize_msr(u64 msr)
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 static inline void copy_from_checkpoint(struct kvm_vcpu *vcpu)
 {
-       vcpu->arch.cr  = vcpu->arch.cr_tm;
+       vcpu->arch.regs.ccr  = vcpu->arch.cr_tm;
        vcpu->arch.regs.xer = vcpu->arch.xer_tm;
        vcpu->arch.regs.link  = vcpu->arch.lr_tm;
        vcpu->arch.regs.ctr = vcpu->arch.ctr_tm;
@@ -499,7 +602,7 @@ static inline void copy_from_checkpoint(struct kvm_vcpu *vcpu)
 
 static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
 {
-       vcpu->arch.cr_tm  = vcpu->arch.cr;
+       vcpu->arch.cr_tm  = vcpu->arch.regs.ccr;
        vcpu->arch.xer_tm = vcpu->arch.regs.xer;
        vcpu->arch.lr_tm  = vcpu->arch.regs.link;
        vcpu->arch.ctr_tm = vcpu->arch.regs.ctr;
@@ -515,6 +618,17 @@ static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
 }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
+extern int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
+                            unsigned long gpa, unsigned int level,
+                            unsigned long mmu_seq, unsigned int lpid,
+                            unsigned long *rmapp, struct rmap_nested **n_rmap);
+extern void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
+                                  struct rmap_nested **n_rmap);
+extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
+                               struct kvm_memory_slot *memslot,
+                               unsigned long gpa, unsigned long hpa,
+                               unsigned long nbytes);
+
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
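
To make the nested rmap bit layout above concrete, a pair of trivial
helpers (hypothetical names, shown only for illustration; the kernel
open-codes this packing where needed):

    /* Pack an (lpid, guest real address) pair into a nested rmap entry. */
    static inline u64 n_rmap_encode(unsigned int lpid, u64 gpa)
    {
            return ((u64)lpid << RMAP_NESTED_LPID_SHIFT) |
                   (gpa & RMAP_NESTED_GPA_MASK);
    }

    static inline unsigned int n_rmap_lpid(u64 rmap)
    {
            return (rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
    }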
index d978fdf..eb3ba63 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -25,6 +25,9 @@
 #define XICS_MFRR              0xc
 #define XICS_IPI               2       /* interrupt source # for IPIs */
 
+/* LPIDs we support with this build -- runtime limit may be lower */
+#define KVMPPC_NR_LPIDS                        (LPID_RSVD + 1)
+
 /* Maximum number of threads per physical core */
 #define MAX_SMT_THREADS                8
 
index d513e3e..f0cef62 100644
--- a/arch/powerpc/include/asm/kvm_booke.h
+++ b/arch/powerpc/include/asm/kvm_booke.h
@@ -46,12 +46,12 @@ static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
 
 static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
 {
-       vcpu->arch.cr = val;
+       vcpu->arch.regs.ccr = val;
 }
 
 static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
 {
-       return vcpu->arch.cr;
+       return vcpu->arch.regs.ccr;
 }
 
 static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
index 906bcbd..fac6f63 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -46,6 +46,7 @@
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 #include <asm/kvm_book3s_asm.h>                /* for MAX_SMT_THREADS */
 #define KVM_MAX_VCPU_ID                (MAX_SMT_THREADS * KVM_MAX_VCORES)
+#define KVM_MAX_NESTED_GUESTS  KVMPPC_NR_LPIDS
 
 #else
 #define KVM_MAX_VCPU_ID                KVM_MAX_VCPUS
@@ -94,6 +95,7 @@ struct dtl_entry;
 
 struct kvmppc_vcpu_book3s;
 struct kvmppc_book3s_shadow_vcpu;
+struct kvm_nested_guest;
 
 struct kvm_vm_stat {
        ulong remote_tlb_flush;
@@ -287,10 +289,12 @@ struct kvm_arch {
        u8 radix;
        u8 fwnmi_enabled;
        bool threads_indep;
+       bool nested_enable;
        pgd_t *pgtable;
        u64 process_table;
        struct dentry *debugfs_dir;
        struct dentry *htab_dentry;
+       struct dentry *radix_dentry;
        struct kvm_resize_hpt *resize_hpt; /* protected by kvm->lock */
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
@@ -311,6 +315,9 @@ struct kvm_arch {
 #endif
        struct kvmppc_ops *kvm_ops;
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+       u64 l1_ptcr;
+       int max_nested_lpid;
+       struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS];
        /* This array can grow quite large, keep it at the end */
        struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
 #endif
@@ -360,7 +367,9 @@ struct kvmppc_pte {
        bool may_write          : 1;
        bool may_execute        : 1;
        unsigned long wimg;
+       unsigned long rc;
        u8 page_size;           /* MMU_PAGE_xxx */
+       u8 page_shift;
 };
 
 struct kvmppc_mmu {
@@ -537,8 +546,6 @@ struct kvm_vcpu_arch {
        ulong tar;
 #endif
 
-       u32 cr;
-
 #ifdef CONFIG_PPC_BOOK3S
        ulong hflags;
        ulong guest_owned_ext;
@@ -707,6 +714,7 @@ struct kvm_vcpu_arch {
        u8 hcall_needed;
        u8 epr_flags; /* KVMPPC_EPR_xxx */
        u8 epr_needed;
+       u8 external_oneshot;    /* clear external irq after delivery */
 
        u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
 
@@ -781,6 +789,10 @@ struct kvm_vcpu_arch {
        u32 emul_inst;
 
        u32 online;
+
+       /* For support of nested guests */
+       struct kvm_nested_guest *nested;
+       u32 nested_vcpu_id;
 #endif
 
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
index e991821..9b89b19 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -194,9 +194,7 @@ extern struct kvmppc_spapr_tce_table *kvmppc_find_table(
                (iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \
                                (stt)->size, (ioba), (npages)) ?        \
                                H_PARAMETER : H_SUCCESS)
-extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt,
-               unsigned long tce);
-extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
+extern long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
                unsigned long *ua, unsigned long **prmap);
 extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt,
                unsigned long idx, unsigned long tce);
@@ -327,6 +325,7 @@ struct kvmppc_ops {
        int (*set_smt_mode)(struct kvm *kvm, unsigned long mode,
                            unsigned long flags);
        void (*giveup_ext)(struct kvm_vcpu *vcpu, ulong msr);
+       int (*enable_nested)(struct kvm *kvm);
 };
 
 extern struct kvmppc_ops *kvmppc_hv_ops;
@@ -585,6 +584,7 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
 
 extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
                               int level, bool line_status);
+extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu);
 #else
 static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
                                       u32 priority) { return -1; }
@@ -607,6 +607,7 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { retur
 
 static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
                                      int level, bool line_status) { return -ENODEV; }
+static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { }
 #endif /* CONFIG_KVM_XIVE */
 
 /*
@@ -652,6 +653,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
                     unsigned long mfrr);
 int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
 int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr);
+void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu);
 
 /*
  * Host-side operations we want to set up while running in real
index b2f89b6..b694d6a 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -38,6 +38,7 @@ extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
                unsigned long ua, unsigned int pageshift, unsigned long *hpa);
 extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
                unsigned long ua, unsigned int pageshift, unsigned long *hpa);
+extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua);
 extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
 extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
 #endif
index 665af14..6093bc8 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
 #define OP_31_XOP_LHZUX     311
 #define OP_31_XOP_MSGSNDP   142
 #define OP_31_XOP_MSGCLRP   174
+#define OP_31_XOP_TLBIE     306
 #define OP_31_XOP_MFSPR     339
 #define OP_31_XOP_LWAX      341
 #define OP_31_XOP_LHAX      343
index e5b314e..c906989 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
 #define   HFSCR_DSCR   __MASK(FSCR_DSCR_LG)
 #define   HFSCR_VECVSX __MASK(FSCR_VECVSX_LG)
 #define   HFSCR_FP     __MASK(FSCR_FP_LG)
+#define   HFSCR_INTR_CAUSE (ASM_CONST(0xFF) << 56)     /* interrupt cause */
 #define SPRN_TAR       0x32f   /* Target Address Register */
 #define SPRN_LPCR      0x13E   /* LPAR Control Register */
 #define   LPCR_VPM0            ASM_CONST(0x8000000000000000)
 #define SPRN_HSRR0     0x13A   /* Save/Restore Register 0 */
 #define SPRN_HSRR1     0x13B   /* Save/Restore Register 1 */
 #define   HSRR1_DENORM         0x00100000 /* Denorm exception */
+#define   HSRR1_HISI_WRITE     0x00010000 /* HISI bcs couldn't update mem */
 
 #define SPRN_TBCTL     0x35f   /* PA6T Timebase control register */
 #define   TBCTL_FREEZE         0x0000000000000000ull /* Freeze all tbs */
index 1b32b56..8c876c1 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -634,6 +634,7 @@ struct kvm_ppc_cpu_char {
 
 #define KVM_REG_PPC_DEC_EXPIRY (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe)
 #define KVM_REG_PPC_ONLINE     (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbf)
+#define KVM_REG_PPC_PTCR       (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc0)
 
 /* Transactional Memory checkpointed state:
  * This is all GPRs, all VSX regs and a subset of SPRs
index 89cf155..d0abcbb 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -438,7 +438,7 @@ int main(void)
 #ifdef CONFIG_PPC_BOOK3S
        OFFSET(VCPU_TAR, kvm_vcpu, arch.tar);
 #endif
-       OFFSET(VCPU_CR, kvm_vcpu, arch.cr);
+       OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr);
        OFFSET(VCPU_PC, kvm_vcpu, arch.regs.nip);
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
        OFFSET(VCPU_MSR, kvm_vcpu, arch.shregs.msr);
@@ -503,6 +503,7 @@ int main(void)
        OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr);
        OFFSET(VCPU_VPA_DIRTY, kvm_vcpu, arch.vpa.dirty);
        OFFSET(VCPU_HEIR, kvm_vcpu, arch.emul_inst);
+       OFFSET(VCPU_NESTED, kvm_vcpu, arch.nested);
        OFFSET(VCPU_CPU, kvm_vcpu, cpu);
        OFFSET(VCPU_THREAD_CPU, kvm_vcpu, arch.thread_cpu);
 #endif
@@ -695,7 +696,7 @@ int main(void)
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
 #else /* CONFIG_PPC_BOOK3S */
-       OFFSET(VCPU_CR, kvm_vcpu, arch.cr);
+       OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr);
        OFFSET(VCPU_XER, kvm_vcpu, arch.regs.xer);
        OFFSET(VCPU_LR, kvm_vcpu, arch.regs.link);
        OFFSET(VCPU_CTR, kvm_vcpu, arch.regs.ctr);
index 458b928..c317080 100644
--- a/arch/powerpc/kernel/cpu_setup_power.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -147,8 +147,8 @@ __init_hvmode_206:
        rldicl. r0,r3,4,63
        bnelr
        ld      r5,CPU_SPEC_FEATURES(r4)
-       LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE)
-       xor     r5,r5,r6
+       LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE | CPU_FTR_P9_TM_HV_ASSIST)
+       andc    r5,r5,r6
        std     r5,CPU_SPEC_FEATURES(r4)
        blr
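
The switch from xor to andc above is significant: xor only clears a bit
that is currently set and would set one that is clear, whereas andc clears
the bits unconditionally, which matters now that two feature bits are being
cleared and CPU_FTR_P9_TM_HV_ASSIST may not have been set. The equivalent
in C (illustrative only):

    features ^= CPU_FTR_HVMODE;             /* old: toggles the bit */
    features &= ~(CPU_FTR_HVMODE |
                  CPU_FTR_P9_TM_HV_ASSIST); /* new: always clears both */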
 
index af7a20d..19b4c62 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1013,31 +1013,6 @@ long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
 }
 EXPORT_SYMBOL_GPL(iommu_tce_xchg);
 
-#ifdef CONFIG_PPC_BOOK3S_64
-long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry,
-               unsigned long *hpa, enum dma_data_direction *direction)
-{
-       long ret;
-
-       ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
-
-       if (!ret && ((*direction == DMA_FROM_DEVICE) ||
-                       (*direction == DMA_BIDIRECTIONAL))) {
-               struct page *pg = realmode_pfn_to_page(*hpa >> PAGE_SHIFT);
-
-               if (likely(pg)) {
-                       SetPageDirty(pg);
-               } else {
-                       tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
-                       ret = -EFAULT;
-               }
-       }
-
-       return ret;
-}
-EXPORT_SYMBOL_GPL(iommu_tce_xchg_rm);
-#endif
-
 int iommu_take_ownership(struct iommu_table *tbl)
 {
        unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
index f872c04..e814f40 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -75,7 +75,8 @@ kvm-hv-y += \
        book3s_hv.o \
        book3s_hv_interrupts.o \
        book3s_64_mmu_hv.o \
-       book3s_64_mmu_radix.o
+       book3s_64_mmu_radix.o \
+       book3s_hv_nested.o
 
 kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \
        book3s_hv_tm.o
index 87348e4..fd9893b 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -78,8 +78,11 @@ void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu)
 {
        if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) {
                ulong pc = kvmppc_get_pc(vcpu);
+               ulong lr = kvmppc_get_lr(vcpu);
                if ((pc & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)
                        kvmppc_set_pc(vcpu, pc & ~SPLIT_HACK_MASK);
+               if ((lr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)
+                       kvmppc_set_lr(vcpu, lr & ~SPLIT_HACK_MASK);
                vcpu->arch.hflags &= ~BOOK3S_HFLAG_SPLIT_HACK;
        }
 }
@@ -150,7 +153,6 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
        case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE;         break;
        case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT;         break;
        case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL;             break;
-       case 0x501: prio = BOOK3S_IRQPRIO_EXTERNAL_LEVEL;       break;
        case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT;            break;
        case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM;              break;
        case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL;           break;
@@ -236,18 +238,35 @@ EXPORT_SYMBOL_GPL(kvmppc_core_dequeue_dec);
 void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
                                 struct kvm_interrupt *irq)
 {
-       unsigned int vec = BOOK3S_INTERRUPT_EXTERNAL;
-
-       if (irq->irq == KVM_INTERRUPT_SET_LEVEL)
-               vec = BOOK3S_INTERRUPT_EXTERNAL_LEVEL;
+       /*
+        * This case (KVM_INTERRUPT_SET) should never actually arise for
+        * a pseries guest (because pseries guests expect their interrupt
+        * controllers to continue asserting an external interrupt request
+        * until it is acknowledged at the interrupt controller), but is
+        * included to avoid ABI breakage and potentially for other
+        * sorts of guest.
+        *
+        * There is a subtlety here: HV KVM does not test the
+        * external_oneshot flag in the code that synthesizes
+        * external interrupts for the guest just before entering
+        * the guest.  That is OK even if userspace did do a
+        * KVM_INTERRUPT_SET on a pseries guest vcpu, because the
+        * caller (kvm_vcpu_ioctl_interrupt) does a kvm_vcpu_kick()
+        * which ends up doing a smp_send_reschedule(), which will
+        * pull the guest all the way out to the host, meaning that
+        * we will call kvmppc_core_prepare_to_enter() before entering
+        * the guest again, and that will handle the external_oneshot
+        * flag correctly.
+        */
+       if (irq->irq == KVM_INTERRUPT_SET)
+               vcpu->arch.external_oneshot = 1;
 
-       kvmppc_book3s_queue_irqprio(vcpu, vec);
+       kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
 }
 
 void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu)
 {
        kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
-       kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
 }
 
 void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar,
@@ -278,7 +297,6 @@ static int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu,
                vec = BOOK3S_INTERRUPT_DECREMENTER;
                break;
        case BOOK3S_IRQPRIO_EXTERNAL:
-       case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
                deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit;
                vec = BOOK3S_INTERRUPT_EXTERNAL;
                break;
@@ -352,8 +370,16 @@ static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority)
                case BOOK3S_IRQPRIO_DECREMENTER:
                        /* DEC interrupts get cleared by mtdec */
                        return false;
-               case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
-                       /* External interrupts get cleared by userspace */
+               case BOOK3S_IRQPRIO_EXTERNAL:
+                       /*
+                        * External interrupts get cleared by userspace
+                        * except when set by the KVM_INTERRUPT ioctl with
+                        * KVM_INTERRUPT_SET (not KVM_INTERRUPT_SET_LEVEL).
+                        */
+                       if (vcpu->arch.external_oneshot) {
+                               vcpu->arch.external_oneshot = 0;
+                               return true;
+                       }
                        return false;
        }
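
For reference, the userspace path that reaches this code is the
KVM_INTERRUPT vcpu ioctl. A minimal sketch (vcpu_fd and the helper name
are assumptions for the example):

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Assert an external interrupt; with the change above, a
     * KVM_INTERRUPT_SET injection is delivered exactly once. */
    static void inject_external_irq(int vcpu_fd)
    {
            struct kvm_interrupt irq = { .irq = KVM_INTERRUPT_SET };

            if (ioctl(vcpu_fd, KVM_INTERRUPT, &irq) < 0)
                    perror("KVM_INTERRUPT");
    }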
 
index 68e14af..c615617 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -268,14 +268,13 @@ int kvmppc_mmu_hv_init(void)
 {
        unsigned long host_lpid, rsvd_lpid;
 
-       if (!cpu_has_feature(CPU_FTR_HVMODE))
-               return -EINVAL;
-
        if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE))
                return -EINVAL;
 
        /* POWER7 has 10-bit LPIDs (12-bit in POWER8) */
-       host_lpid = mfspr(SPRN_LPID);
+       host_lpid = 0;
+       if (cpu_has_feature(CPU_FTR_HVMODE))
+               host_lpid = mfspr(SPRN_LPID);
        rsvd_lpid = LPID_RSVD;
 
        kvmppc_init_lpid(rsvd_lpid + 1);
index fd6e8c1..43b21e8 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -10,6 +10,9 @@
 #include <linux/string.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/debugfs.h>
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
  */
 static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
 
-int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
-                          struct kvmppc_pte *gpte, bool data, bool iswrite)
+int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
+                              struct kvmppc_pte *gpte, u64 root,
+                              u64 *pte_ret_p)
 {
        struct kvm *kvm = vcpu->kvm;
-       u32 pid;
        int ret, level, ps;
-       __be64 prte, rpte;
-       unsigned long ptbl;
-       unsigned long root, pte, index;
-       unsigned long rts, bits, offset;
-       unsigned long gpa;
-       unsigned long proc_tbl_size;
-
-       /* Work out effective PID */
-       switch (eaddr >> 62) {
-       case 0:
-               pid = vcpu->arch.pid;
-               break;
-       case 3:
-               pid = 0;
-               break;
-       default:
-               return -EINVAL;
-       }
-       proc_tbl_size = 1 << ((kvm->arch.process_table & PRTS_MASK) + 12);
-       if (pid * 16 >= proc_tbl_size)
-               return -EINVAL;
+       unsigned long rts, bits, offset, index;
+       u64 pte, base, gpa;
+       __be64 rpte;
 
-       /* Read partition table to find root of tree for effective PID */
-       ptbl = (kvm->arch.process_table & PRTB_MASK) + (pid * 16);
-       ret = kvm_read_guest(kvm, ptbl, &prte, sizeof(prte));
-       if (ret)
-               return ret;
-
-       root = be64_to_cpu(prte);
        rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
                ((root & RTS2_MASK) >> RTS2_SHIFT);
        bits = root & RPDS_MASK;
-       root = root & RPDB_MASK;
+       base = root & RPDB_MASK;
 
        offset = rts + 31;
 
-       /* current implementations only support 52-bit space */
+       /* Current implementations only support 52-bit space */
        if (offset != 52)
                return -EINVAL;
 
+       /* Walk each level of the radix tree */
        for (level = 3; level >= 0; --level) {
+               u64 addr;
+               /* Check a valid size */
                if (level && bits != p9_supported_radix_bits[level])
                        return -EINVAL;
                if (level == 0 && !(bits == 5 || bits == 9))
                        return -EINVAL;
                offset -= bits;
                index = (eaddr >> offset) & ((1UL << bits) - 1);
-               /* check that low bits of page table base are zero */
-               if (root & ((1UL << (bits + 3)) - 1))
+               /* Check that low bits of page table base are zero */
+               if (base & ((1UL << (bits + 3)) - 1))
                        return -EINVAL;
-               ret = kvm_read_guest(kvm, root + index * 8,
-                                    &rpte, sizeof(rpte));
-               if (ret)
+               /* Read the entry from guest memory */
+               addr = base + (index * sizeof(rpte));
+               ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
+               if (ret) {
+                       if (pte_ret_p)
+                               *pte_ret_p = addr;
                        return ret;
+               }
                pte = __be64_to_cpu(rpte);
                if (!(pte & _PAGE_PRESENT))
                        return -ENOENT;
+               /* Check if a leaf entry */
                if (pte & _PAGE_PTE)
                        break;
-               bits = pte & 0x1f;
-               root = pte & 0x0fffffffffffff00ul;
+               /* Get ready to walk the next level */
+               base = pte & RPDB_MASK;
+               bits = pte & RPDS_MASK;
        }
-       /* need a leaf at lowest level; 512GB pages not supported */
+
+       /* Need a leaf at lowest level; 512GB pages not supported */
        if (level < 0 || level == 3)
                return -EINVAL;
 
-       /* offset is now log base 2 of the page size */
+       /* We found a valid leaf PTE */
+       /* Offset is now log base 2 of the page size */
        gpa = pte & 0x01fffffffffff000ul;
        if (gpa & ((1ul << offset) - 1))
                return -EINVAL;
-       gpa += eaddr & ((1ul << offset) - 1);
+       gpa |= eaddr & ((1ul << offset) - 1);
        for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
                if (offset == mmu_psize_defs[ps].shift)
                        break;
        gpte->page_size = ps;
+       gpte->page_shift = offset;
 
        gpte->eaddr = eaddr;
        gpte->raddr = gpa;
@@ -115,6 +105,77 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
        gpte->may_read = !!(pte & _PAGE_READ);
        gpte->may_write = !!(pte & _PAGE_WRITE);
        gpte->may_execute = !!(pte & _PAGE_EXEC);
+
+       gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);
+
+       if (pte_ret_p)
+               *pte_ret_p = pte;
+
+       return 0;
+}
+
+/*
+ * Used to walk a partition or process table radix tree in guest memory
+ * Note: We exploit the fact that a partition table and a process
+ * table have the same layout, a partition-scoped page table and a
+ * process-scoped page table have the same layout, and the 2nd
+ * doubleword of a partition table entry has the same layout as
+ * the PTCR register.
+ */
+int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
+                                    struct kvmppc_pte *gpte, u64 table,
+                                    int table_index, u64 *pte_ret_p)
+{
+       struct kvm *kvm = vcpu->kvm;
+       int ret;
+       unsigned long size, ptbl, root;
+       struct prtb_entry entry;
+
+       if ((table & PRTS_MASK) > 24)
+               return -EINVAL;
+       size = 1ul << ((table & PRTS_MASK) + 12);
+
+       /* Is the table big enough to contain this entry? */
+       if ((table_index * sizeof(entry)) >= size)
+               return -EINVAL;
+
+       /* Read the table to find the root of the radix tree */
+       ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
+       ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
+       if (ret)
+               return ret;
+
+       /* Root is stored in the first double word */
+       root = be64_to_cpu(entry.prtb0);
+
+       return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
+}
+
+int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+                          struct kvmppc_pte *gpte, bool data, bool iswrite)
+{
+       u32 pid;
+       u64 pte;
+       int ret;
+
+       /* Work out effective PID */
+       switch (eaddr >> 62) {
+       case 0:
+               pid = vcpu->arch.pid;
+               break;
+       case 3:
+               pid = 0;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
+                               vcpu->kvm->arch.process_table, pid, &pte);
+       if (ret)
+               return ret;
+
+       /* Check privilege (applies only to process scoped translations) */
        if (kvmppc_get_msr(vcpu) & MSR_PR) {
                if (pte & _PAGE_PRIVILEGED) {
                        gpte->may_read = 0;
@@ -137,20 +198,46 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 }
 
 static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
-                                   unsigned int pshift)
+                                   unsigned int pshift, unsigned int lpid)
 {
        unsigned long psize = PAGE_SIZE;
+       int psi;
+       long rc;
+       unsigned long rb;
 
        if (pshift)
                psize = 1UL << pshift;
+       else
+               pshift = PAGE_SHIFT;
 
        addr &= ~(psize - 1);
-       radix__flush_tlb_lpid_page(kvm->arch.lpid, addr, psize);
+
+       if (!kvmhv_on_pseries()) {
+               radix__flush_tlb_lpid_page(lpid, addr, psize);
+               return;
+       }
+
+       psi = shift_to_mmu_psize(pshift);
+       rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
+       rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
+                               lpid, rb);
+       if (rc)
+               pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
 }
 
-static void kvmppc_radix_flush_pwc(struct kvm *kvm)
+static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
 {
-       radix__flush_pwc_lpid(kvm->arch.lpid);
+       long rc;
+
+       if (!kvmhv_on_pseries()) {
+               radix__flush_pwc_lpid(lpid);
+               return;
+       }
+
+       rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
+                               lpid, TLBIEL_INVAL_SET_LPID);
+       if (rc)
+               pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
 }
 
 static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
@@ -195,23 +282,38 @@ static void kvmppc_pmd_free(pmd_t *pmdp)
        kmem_cache_free(kvm_pmd_cache, pmdp);
 }
 
-static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
-                            unsigned long gpa, unsigned int shift)
+/* Called with kvm->mmu_lock held */
+void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
+                     unsigned int shift, struct kvm_memory_slot *memslot,
+                     unsigned int lpid)
 
 {
-       unsigned long page_size = 1ul << shift;
        unsigned long old;
+       unsigned long gfn = gpa >> PAGE_SHIFT;
+       unsigned long page_size = PAGE_SIZE;
+       unsigned long hpa;
 
        old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
-       kvmppc_radix_tlbie_page(kvm, gpa, shift);
-       if (old & _PAGE_DIRTY) {
-               unsigned long gfn = gpa >> PAGE_SHIFT;
-               struct kvm_memory_slot *memslot;
+       kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
 
+       /* The following only applies to L1 entries */
+       if (lpid != kvm->arch.lpid)
+               return;
+
+       if (!memslot) {
                memslot = gfn_to_memslot(kvm, gfn);
-               if (memslot && memslot->dirty_bitmap)
-                       kvmppc_update_dirty_map(memslot, gfn, page_size);
+               if (!memslot)
+                       return;
        }
+       if (shift)
+               page_size = 1ul << shift;
+
+       gpa &= ~(page_size - 1);
+       hpa = old & PTE_RPN_MASK;
+       kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
+
+       if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
+               kvmppc_update_dirty_map(memslot, gfn, page_size);
 }
 
 /*
@@ -224,7 +326,8 @@ static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
  * and emit a warning if encountered, but there may already be data
  * corruption due to the unexpected mappings.
  */
-static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full)
+static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
+                                 unsigned int lpid)
 {
        if (full) {
                memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
@@ -238,14 +341,15 @@ static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full)
                        WARN_ON_ONCE(1);
                        kvmppc_unmap_pte(kvm, p,
                                         pte_pfn(*p) << PAGE_SHIFT,
-                                        PAGE_SHIFT);
+                                        PAGE_SHIFT, NULL, lpid);
                }
        }
 
        kvmppc_pte_free(pte);
 }
 
-static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full)
+static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
+                                 unsigned int lpid)
 {
        unsigned long im;
        pmd_t *p = pmd;
@@ -260,20 +364,21 @@ static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full)
                                WARN_ON_ONCE(1);
                                kvmppc_unmap_pte(kvm, (pte_t *)p,
                                         pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
-                                        PMD_SHIFT);
+                                        PMD_SHIFT, NULL, lpid);
                        }
                } else {
                        pte_t *pte;
 
                        pte = pte_offset_map(p, 0);
-                       kvmppc_unmap_free_pte(kvm, pte, full);
+                       kvmppc_unmap_free_pte(kvm, pte, full, lpid);
                        pmd_clear(p);
                }
        }
        kvmppc_pmd_free(pmd);
 }
 
-static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud)
+static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
+                                 unsigned int lpid)
 {
        unsigned long iu;
        pud_t *p = pud;
@@ -287,36 +392,40 @@ static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud)
                        pmd_t *pmd;
 
                        pmd = pmd_offset(p, 0);
-                       kvmppc_unmap_free_pmd(kvm, pmd, true);
+                       kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
                        pud_clear(p);
                }
        }
        pud_free(kvm->mm, pud);
 }
 
-void kvmppc_free_radix(struct kvm *kvm)
+void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
 {
        unsigned long ig;
-       pgd_t *pgd;
 
-       if (!kvm->arch.pgtable)
-               return;
-       pgd = kvm->arch.pgtable;
        for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
                pud_t *pud;
 
                if (!pgd_present(*pgd))
                        continue;
                pud = pud_offset(pgd, 0);
-               kvmppc_unmap_free_pud(kvm, pud);
+               kvmppc_unmap_free_pud(kvm, pud, lpid);
                pgd_clear(pgd);
        }
-       pgd_free(kvm->mm, kvm->arch.pgtable);
-       kvm->arch.pgtable = NULL;
+}
+
+void kvmppc_free_radix(struct kvm *kvm)
+{
+       if (kvm->arch.pgtable) {
+               kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
+                                         kvm->arch.lpid);
+               pgd_free(kvm->mm, kvm->arch.pgtable);
+               kvm->arch.pgtable = NULL;
+       }
 }
 
 static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
-                                             unsigned long gpa)
+                                       unsigned long gpa, unsigned int lpid)
 {
        pte_t *pte = pte_offset_kernel(pmd, 0);
 
@@ -326,13 +435,13 @@ static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
         * flushing the PWC again.
         */
        pmd_clear(pmd);
-       kvmppc_radix_flush_pwc(kvm);
+       kvmppc_radix_flush_pwc(kvm, lpid);
 
-       kvmppc_unmap_free_pte(kvm, pte, false);
+       kvmppc_unmap_free_pte(kvm, pte, false, lpid);
 }
 
 static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
-                                       unsigned long gpa)
+                                       unsigned long gpa, unsigned int lpid)
 {
        pmd_t *pmd = pmd_offset(pud, 0);
 
@@ -342,9 +451,9 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
         * so can be freed without flushing the PWC again.
         */
        pud_clear(pud);
-       kvmppc_radix_flush_pwc(kvm);
+       kvmppc_radix_flush_pwc(kvm, lpid);
 
-       kvmppc_unmap_free_pmd(kvm, pmd, false);
+       kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
 }
 
 /*
@@ -356,8 +465,10 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
  */
 #define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
 
-static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
-                            unsigned int level, unsigned long mmu_seq)
+int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
+                     unsigned long gpa, unsigned int level,
+                     unsigned long mmu_seq, unsigned int lpid,
+                     unsigned long *rmapp, struct rmap_nested **n_rmap)
 {
        pgd_t *pgd;
        pud_t *pud, *new_pud = NULL;
@@ -366,7 +477,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
        int ret;
 
        /* Traverse the guest's 2nd-level tree, allocate new levels needed */
-       pgd = kvm->arch.pgtable + pgd_index(gpa);
+       pgd = pgtable + pgd_index(gpa);
        pud = NULL;
        if (pgd_present(*pgd))
                pud = pud_offset(pgd, gpa);
@@ -423,7 +534,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
                        goto out_unlock;
                }
                /* Valid 1GB page here already, remove it */
-               kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT);
+               kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
+                                lpid);
        }
        if (level == 2) {
                if (!pud_none(*pud)) {
@@ -432,9 +544,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
                         * install a large page, so remove and free the page
                         * table page.
                         */
-                       kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa);
+                       kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
                }
                kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
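+               /*
+                * When building a nested guest's shadow table, record a
+                * reverse mapping so this shadow PTE can be found and torn
+                * down when the underlying L1 mapping changes.
+                */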
+               if (rmapp && n_rmap)
+                       kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
                ret = 0;
                goto out_unlock;
        }
@@ -458,7 +572,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
                        WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
                                                        PTE_BITS_MUST_MATCH);
                        kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
-                                             0, pte_val(pte), lgpa, PMD_SHIFT);
+                                       0, pte_val(pte), lgpa, PMD_SHIFT);
                        ret = 0;
                        goto out_unlock;
                }
@@ -472,7 +586,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
                        goto out_unlock;
                }
                /* Valid 2MB page here already, remove it */
-               kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT);
+               kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
+                                lpid);
        }
        if (level == 1) {
                if (!pmd_none(*pmd)) {
@@ -481,9 +596,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
                         * install a large page, so remove and free the page
                         * table page.
                         */
-                       kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa);
+                       kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
                }
                kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
+               if (rmapp && n_rmap)
+                       kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
                ret = 0;
                goto out_unlock;
        }
@@ -508,6 +625,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
                goto out_unlock;
        }
        kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
+       if (rmapp && n_rmap)
+               kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
        ret = 0;
 
  out_unlock:
@@ -521,21 +640,144 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
        return ret;
 }
 
-int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
-                                  unsigned long ea, unsigned long dsisr)
+bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing,
+                            unsigned long gpa, unsigned int lpid)
+{
+       unsigned long pgflags;
+       unsigned int shift;
+       pte_t *ptep;
+
+       /*
+        * Need to set an R or C bit in the 2nd-level tables;
+        * since we are just helping out the hardware here,
+        * it is sufficient to do what the hardware does.
+        */
+       pgflags = _PAGE_ACCESSED;
+       if (writing)
+               pgflags |= _PAGE_DIRTY;
+       /*
+        * We are walking the secondary (partition-scoped) page table here.
+        * We can do this without disabling irq because the Linux MM
+        * subsystem doesn't do THP splits and collapses on this tree.
+        */
+       ptep = __find_linux_pte(pgtable, gpa, NULL, &shift);
+       if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
+               kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
+               return true;
+       }
+       return false;
+}
+
+int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
+                                  unsigned long gpa,
+                                  struct kvm_memory_slot *memslot,
+                                  bool writing, bool kvm_ro,
+                                  pte_t *inserted_pte, unsigned int *levelp)
 {
        struct kvm *kvm = vcpu->kvm;
-       unsigned long mmu_seq, pte_size;
-       unsigned long gpa, gfn, hva, pfn;
-       struct kvm_memory_slot *memslot;
        struct page *page = NULL;
-       long ret;
-       bool writing;
+       unsigned long mmu_seq;
+       unsigned long hva, gfn = gpa >> PAGE_SHIFT;
        bool upgrade_write = false;
        bool *upgrade_p = &upgrade_write;
        pte_t pte, *ptep;
-       unsigned long pgflags;
        unsigned int shift, level;
+       int ret;
+
+       /* used to check for invalidations in progress */
+       mmu_seq = kvm->mmu_notifier_seq;
+       smp_rmb();
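+       /*
+        * kvmppc_create_pte() re-checks this sequence number under the
+        * mmu_lock and fails with -EAGAIN if an invalidation ran in the
+        * meantime, so a stale translation is never inserted.
+        */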
+
+       /*
+        * Do a fast check first, since __gfn_to_pfn_memslot doesn't
+        * do it with !atomic && !async, which is how we call it.
+        * We always ask for write permission since the common case
+        * is that the page is writable.
+        */
+       hva = gfn_to_hva_memslot(memslot, gfn);
+       if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
+               upgrade_write = true;
+       } else {
+               unsigned long pfn;
+
+               /* Call KVM generic code to do the slow-path check */
+               pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
+                                          writing, upgrade_p);
+               if (is_error_noslot_pfn(pfn))
+                       return -EFAULT;
+               page = NULL;
+               if (pfn_valid(pfn)) {
+                       page = pfn_to_page(pfn);
+                       if (PageReserved(page))
+                               page = NULL;
+               }
+       }
+
+       /*
+        * Read the PTE from the process' radix tree and use that
+        * so we get the shift and attribute bits.
+        */
+       local_irq_disable();
+       ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
+       pte = *ptep;
+       local_irq_enable();
+
+       /* Get pte level from shift/size */
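+       /*
+        * A 1GB or 2MB mapping is usable only if gpa and hva are
+        * congruent modulo the large-page size, i.e. the offset within
+        * the large page is identical on the guest and host sides.
+        */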
+       if (shift == PUD_SHIFT &&
+           (gpa & (PUD_SIZE - PAGE_SIZE)) ==
+           (hva & (PUD_SIZE - PAGE_SIZE))) {
+               level = 2;
+       } else if (shift == PMD_SHIFT &&
+                  (gpa & (PMD_SIZE - PAGE_SIZE)) ==
+                  (hva & (PMD_SIZE - PAGE_SIZE))) {
+               level = 1;
+       } else {
+               level = 0;
+               if (shift > PAGE_SHIFT) {
+                       /*
+                        * If the pte maps more than one page, bring over
+                        * bits from the virtual address to get the real
+                        * address of the specific single page we want.
+                        */
+                       unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
+                       pte = __pte(pte_val(pte) | (hva & rpnmask));
+               }
+       }
+
+       pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
+       if (writing || upgrade_write) {
+               if (pte_val(pte) & _PAGE_WRITE)
+                       pte = __pte(pte_val(pte) | _PAGE_DIRTY);
+       } else {
+               pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
+       }
+
+       /* Allocate space in the tree and write the PTE */
+       ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
+                               mmu_seq, kvm->arch.lpid, NULL, NULL);
+       if (inserted_pte)
+               *inserted_pte = pte;
+       if (levelp)
+               *levelp = level;
+
+       if (page) {
+               if (!ret && (pte_val(pte) & _PAGE_WRITE))
+                       set_page_dirty_lock(page);
+               put_page(page);
+       }
+
+       return ret;
+}
+
+int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                                  unsigned long ea, unsigned long dsisr)
+{
+       struct kvm *kvm = vcpu->kvm;
+       unsigned long gpa, gfn;
+       struct kvm_memory_slot *memslot;
+       long ret;
+       bool writing = !!(dsisr & DSISR_ISSTORE);
+       bool kvm_ro = false;
 
        /* Check for unusual errors */
        if (dsisr & DSISR_UNSUPP_MMU) {
@@ -549,12 +791,14 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                return RESUME_GUEST;
        }
 
-       /* Translate the logical address and get the page */
+       /* Translate the logical address */
        gpa = vcpu->arch.fault_gpa & ~0xfffUL;
        gpa &= ~0xF000000000000000ul;
        gfn = gpa >> PAGE_SHIFT;
        if (!(dsisr & DSISR_PRTABLE_FAULT))
                gpa |= ea & 0xfff;
+
+       /* Get the corresponding memslot */
        memslot = gfn_to_memslot(kvm, gfn);
 
        /* No memslot means it's an emulated MMIO region */
@@ -568,142 +812,35 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                        kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
                        return RESUME_GUEST;
                }
-               return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
-                                             dsisr & DSISR_ISSTORE);
+               return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing);
        }
 
-       writing = (dsisr & DSISR_ISSTORE) != 0;
        if (memslot->flags & KVM_MEM_READONLY) {
                if (writing) {
                        /* give the guest a DSI */
-                       dsisr = DSISR_ISSTORE | DSISR_PROTFAULT;
-                       kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+                       kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
+                                                      DSISR_PROTFAULT);
                        return RESUME_GUEST;
                }
-               upgrade_p = NULL;
+               kvm_ro = true;
        }
 
+       /* Failed to set the reference/change bits */
        if (dsisr & DSISR_SET_RC) {
-               /*
-                * Need to set an R or C bit in the 2nd-level tables;
-                * since we are just helping out the hardware here,
-                * it is sufficient to do what the hardware does.
-                */
-               pgflags = _PAGE_ACCESSED;
-               if (writing)
-                       pgflags |= _PAGE_DIRTY;
-               /*
-                * We are walking the secondary page table here. We can do this
-                * without disabling irq.
-                */
                spin_lock(&kvm->mmu_lock);
-               ptep = __find_linux_pte(kvm->arch.pgtable,
-                                       gpa, NULL, &shift);
-               if (ptep && pte_present(*ptep) &&
-                   (!writing || pte_write(*ptep))) {
-                       kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
-                                               gpa, shift);
+               if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable,
+                                           writing, gpa, kvm->arch.lpid))
                        dsisr &= ~DSISR_SET_RC;
-               }
                spin_unlock(&kvm->mmu_lock);
+
                if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
                               DSISR_PROTFAULT | DSISR_SET_RC)))
                        return RESUME_GUEST;
        }
 
-       /* used to check for invalidations in progress */
-       mmu_seq = kvm->mmu_notifier_seq;
-       smp_rmb();
-
-       /*
-        * Do a fast check first, since __gfn_to_pfn_memslot doesn't
-        * do it with !atomic && !async, which is how we call it.
-        * We always ask for write permission since the common case
-        * is that the page is writable.
-        */
-       hva = gfn_to_hva_memslot(memslot, gfn);
-       if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
-               pfn = page_to_pfn(page);
-               upgrade_write = true;
-       } else {
-               /* Call KVM generic code to do the slow-path check */
-               pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
-                                          writing, upgrade_p);
-               if (is_error_noslot_pfn(pfn))
-                       return -EFAULT;
-               page = NULL;
-               if (pfn_valid(pfn)) {
-                       page = pfn_to_page(pfn);
-                       if (PageReserved(page))
-                               page = NULL;
-               }
-       }
-
-       /* See if we can insert a 1GB or 2MB large PTE here */
-       level = 0;
-       if (page && PageCompound(page)) {
-               pte_size = PAGE_SIZE << compound_order(compound_head(page));
-               if (pte_size >= PUD_SIZE &&
-                   (gpa & (PUD_SIZE - PAGE_SIZE)) ==
-                   (hva & (PUD_SIZE - PAGE_SIZE))) {
-                       level = 2;
-                       pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1);
-               } else if (pte_size >= PMD_SIZE &&
-                          (gpa & (PMD_SIZE - PAGE_SIZE)) ==
-                          (hva & (PMD_SIZE - PAGE_SIZE))) {
-                       level = 1;
-                       pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
-               }
-       }
-
-       /*
-        * Compute the PTE value that we need to insert.
-        */
-       if (page) {
-               pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE |
-                       _PAGE_ACCESSED;
-               if (writing || upgrade_write)
-                       pgflags |= _PAGE_WRITE | _PAGE_DIRTY;
-               pte = pfn_pte(pfn, __pgprot(pgflags));
-       } else {
-               /*
-                * Read the PTE from the process' radix tree and use that
-                * so we get the attribute bits.
-                */
-               local_irq_disable();
-               ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
-               pte = *ptep;
-               local_irq_enable();
-               if (shift == PUD_SHIFT &&
-                   (gpa & (PUD_SIZE - PAGE_SIZE)) ==
-                   (hva & (PUD_SIZE - PAGE_SIZE))) {
-                       level = 2;
-               } else if (shift == PMD_SHIFT &&
-                          (gpa & (PMD_SIZE - PAGE_SIZE)) ==
-                          (hva & (PMD_SIZE - PAGE_SIZE))) {
-                       level = 1;
-               } else if (shift && shift != PAGE_SHIFT) {
-                       /* Adjust PFN */
-                       unsigned long mask = (1ul << shift) - PAGE_SIZE;
-                       pte = __pte(pte_val(pte) | (hva & mask));
-               }
-               pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
-               if (writing || upgrade_write) {
-                       if (pte_val(pte) & _PAGE_WRITE)
-                               pte = __pte(pte_val(pte) | _PAGE_DIRTY);
-               } else {
-                       pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
-               }
-       }
-
-       /* Allocate space in the tree and write the PTE */
-       ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
-
-       if (page) {
-               if (!ret && (pte_val(pte) & _PAGE_WRITE))
-                       set_page_dirty_lock(page);
-               put_page(page);
-       }
+       /* Try to insert a pte */
+       ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
+                                            kvm_ro, NULL, NULL);
 
        if (ret == 0 || ret == -EAGAIN)
                ret = RESUME_GUEST;
@@ -717,20 +854,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
        unsigned int shift;
-       unsigned long old;
 
        ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
-       if (ptep && pte_present(*ptep)) {
-               old = kvmppc_radix_update_pte(kvm, ptep, ~0UL, 0,
-                                             gpa, shift);
-               kvmppc_radix_tlbie_page(kvm, gpa, shift);
-               if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) {
-                       unsigned long psize = PAGE_SIZE;
-                       if (shift)
-                               psize = 1ul << shift;
-                       kvmppc_update_dirty_map(memslot, gfn, psize);
-               }
-       }
+       if (ptep && pte_present(*ptep))
+               kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
+                                kvm->arch.lpid);
        return 0;
 }
 
@@ -785,7 +913,7 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm,
                        ret = 1 << (shift - PAGE_SHIFT);
                kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
                                        gpa, shift);
-               kvmppc_radix_tlbie_page(kvm, gpa, shift);
+               kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
        }
        return ret;
 }
@@ -870,6 +998,215 @@ static void pmd_ctor(void *addr)
        memset(addr, 0, RADIX_PMD_TABLE_SIZE);
 }
 
+struct debugfs_radix_state {
+       struct kvm      *kvm;
+       struct mutex    mutex;
+       unsigned long   gpa;
+       int             lpid;
+       int             chars_left;
+       int             buf_index;
+       char            buf[128];
+       u8              hdr;
+};
+
+static int debugfs_radix_open(struct inode *inode, struct file *file)
+{
+       struct kvm *kvm = inode->i_private;
+       struct debugfs_radix_state *p;
+
+       p = kzalloc(sizeof(*p), GFP_KERNEL);
+       if (!p)
+               return -ENOMEM;
+
+       kvm_get_kvm(kvm);
+       p->kvm = kvm;
+       mutex_init(&p->mutex);
+       file->private_data = p;
+
+       return nonseekable_open(inode, file);
+}
+
+static int debugfs_radix_release(struct inode *inode, struct file *file)
+{
+       struct debugfs_radix_state *p = file->private_data;
+
+       kvm_put_kvm(p->kvm);
+       kfree(p);
+       return 0;
+}
+
+static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
+                                size_t len, loff_t *ppos)
+{
+       struct debugfs_radix_state *p = file->private_data;
+       ssize_t ret, r;
+       unsigned long n;
+       struct kvm *kvm;
+       unsigned long gpa;
+       pgd_t *pgt;
+       struct kvm_nested_guest *nested;
+       pgd_t pgd, *pgdp;
+       pud_t pud, *pudp;
+       pmd_t pmd, *pmdp;
+       pte_t *ptep;
+       int shift;
+       unsigned long pte;
+
+       kvm = p->kvm;
+       if (!kvm_is_radix(kvm))
+               return 0;
+
+       ret = mutex_lock_interruptible(&p->mutex);
+       if (ret)
+               return ret;
+
+       if (p->chars_left) {
+               n = p->chars_left;
+               if (n > len)
+                       n = len;
+               r = copy_to_user(buf, p->buf + p->buf_index, n);
+               n -= r;
+               p->chars_left -= n;
+               p->buf_index += n;
+               buf += n;
+               len -= n;
+               ret = n;
+               if (r) {
+                       if (!n)
+                               ret = -EFAULT;
+                       goto out;
+               }
+       }
+
+       gpa = p->gpa;
+       nested = NULL;
+       pgt = NULL;
+       while (len != 0 && p->lpid >= 0) {
+               if (gpa >= RADIX_PGTABLE_RANGE) {
+                       gpa = 0;
+                       pgt = NULL;
+                       if (nested) {
+                               kvmhv_put_nested(nested);
+                               nested = NULL;
+                       }
+                       p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
+                       p->hdr = 0;
+                       if (p->lpid < 0)
+                               break;
+               }
+               if (!pgt) {
+                       if (p->lpid == 0) {
+                               pgt = kvm->arch.pgtable;
+                       } else {
+                               nested = kvmhv_get_nested(kvm, p->lpid, false);
+                               if (!nested) {
+                                       gpa = RADIX_PGTABLE_RANGE;
+                                       continue;
+                               }
+                               pgt = nested->shadow_pgtable;
+                       }
+               }
+               n = 0;
+               if (!p->hdr) {
+                       if (p->lpid > 0)
+                               n = scnprintf(p->buf, sizeof(p->buf),
+                                             "\nNested LPID %d: ", p->lpid);
+                       n += scnprintf(p->buf + n, sizeof(p->buf) - n,
+                                     "pgdir: %lx\n", (unsigned long)pgt);
+                       p->hdr = 1;
+                       goto copy;
+               }
+
+               pgdp = pgt + pgd_index(gpa);
+               pgd = READ_ONCE(*pgdp);
+               if (!(pgd_val(pgd) & _PAGE_PRESENT)) {
+                       gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE;
+                       continue;
+               }
+
+               pudp = pud_offset(&pgd, gpa);
+               pud = READ_ONCE(*pudp);
+               if (!(pud_val(pud) & _PAGE_PRESENT)) {
+                       gpa = (gpa & PUD_MASK) + PUD_SIZE;
+                       continue;
+               }
+               if (pud_val(pud) & _PAGE_PTE) {
+                       pte = pud_val(pud);
+                       shift = PUD_SHIFT;
+                       goto leaf;
+               }
+
+               pmdp = pmd_offset(&pud, gpa);
+               pmd = READ_ONCE(*pmdp);
+               if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
+                       gpa = (gpa & PMD_MASK) + PMD_SIZE;
+                       continue;
+               }
+               if (pmd_val(pmd) & _PAGE_PTE) {
+                       pte = pmd_val(pmd);
+                       shift = PMD_SHIFT;
+                       goto leaf;
+               }
+
+               ptep = pte_offset_kernel(&pmd, gpa);
+               pte = pte_val(READ_ONCE(*ptep));
+               if (!(pte & _PAGE_PRESENT)) {
+                       gpa += PAGE_SIZE;
+                       continue;
+               }
+               shift = PAGE_SHIFT;
+       leaf:
+               n = scnprintf(p->buf, sizeof(p->buf),
+                             " %lx: %lx %d\n", gpa, pte, shift);
+               gpa += 1ul << shift;
+       copy:
+               p->chars_left = n;
+               if (n > len)
+                       n = len;
+               r = copy_to_user(buf, p->buf, n);
+               n -= r;
+               p->chars_left -= n;
+               p->buf_index = n;
+               buf += n;
+               len -= n;
+               ret += n;
+               if (r) {
+                       if (!ret)
+                               ret = -EFAULT;
+                       break;
+               }
+       }
+       p->gpa = gpa;
+       if (nested)
+               kvmhv_put_nested(nested);
+
+ out:
+       mutex_unlock(&p->mutex);
+       return ret;
+}
+
+static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
+                          size_t len, loff_t *ppos)
+{
+       return -EACCES;
+}
+
+static const struct file_operations debugfs_radix_fops = {
+       .owner   = THIS_MODULE,
+       .open    = debugfs_radix_open,
+       .release = debugfs_radix_release,
+       .read    = debugfs_radix_read,
+       .write   = debugfs_radix_write,
+       .llseek  = generic_file_llseek,
+};
+
+void kvmhv_radix_debugfs_init(struct kvm *kvm)
+{
+       kvm->arch.radix_dentry = debugfs_create_file("radix", 0400,
+                                                    kvm->arch.debugfs_dir, kvm,
+                                                    &debugfs_radix_fops);
+}
+
 int kvmppc_radix_init(void)
 {
        unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
index 9a3f264..c0c64d1 100644
@@ -363,6 +363,40 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
        return ret;
 }
 
+static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt,
+               unsigned long tce)
+{
+       unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
+       enum dma_data_direction dir = iommu_tce_direction(tce);
+       struct kvmppc_spapr_tce_iommu_table *stit;
+       unsigned long ua = 0;
+
+       /* Allow userspace to poison TCE table */
+       if (dir == DMA_NONE)
+               return H_SUCCESS;
+
+       if (iommu_tce_check_gpa(stt->page_shift, gpa))
+               return H_TOO_HARD;
+
+       if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL))
+               return H_TOO_HARD;
+
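+       /*
+        * Check that the target is backed by preregistered memory for
+        * every attached IOMMU table, so that the mapping step following
+        * validation is not expected to fail.
+        */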
+       list_for_each_entry_rcu(stit, &stt->iommu_tables, next) {
+               unsigned long hpa = 0;
+               struct mm_iommu_table_group_mem_t *mem;
+               long shift = stit->tbl->it_page_shift;
+
+               mem = mm_iommu_lookup(stt->kvm->mm, ua, 1ULL << shift);
+               if (!mem)
+                       return H_TOO_HARD;
+
+               if (mm_iommu_ua_to_hpa(mem, ua, shift, &hpa))
+                       return H_TOO_HARD;
+       }
+
+       return H_SUCCESS;
+}
+
 static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry)
 {
        unsigned long hpa = 0;
@@ -401,7 +435,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm,
        long ret;
 
        if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir)))
-               return H_HARDWARE;
+               return H_TOO_HARD;
 
        if (dir == DMA_NONE)
                return H_SUCCESS;
@@ -449,15 +483,15 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
                return H_TOO_HARD;
 
        if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, &hpa)))
-               return H_HARDWARE;
+               return H_TOO_HARD;
 
        if (mm_iommu_mapped_inc(mem))
-               return H_CLOSED;
+               return H_TOO_HARD;
 
        ret = iommu_tce_xchg(tbl, entry, &hpa, &dir);
        if (WARN_ON_ONCE(ret)) {
                mm_iommu_mapped_dec(mem);
-               return H_HARDWARE;
+               return H_TOO_HARD;
        }
 
        if (dir != DMA_NONE)
@@ -517,8 +551,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 
        idx = srcu_read_lock(&vcpu->kvm->srcu);
 
-       if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm,
-                       tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) {
+       if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) {
                ret = H_PARAMETER;
                goto unlock_exit;
        }
@@ -533,14 +566,10 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
                        ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl,
                                        entry, ua, dir);
 
-               if (ret == H_SUCCESS)
-                       continue;
-
-               if (ret == H_TOO_HARD)
+               if (ret != H_SUCCESS) {
+                       kvmppc_clear_tce(stit->tbl, entry);
                        goto unlock_exit;
-
-               WARN_ON_ONCE(1);
-               kvmppc_clear_tce(stit->tbl, entry);
+               }
        }
 
        kvmppc_tce_put(stt, entry, tce);
@@ -583,7 +612,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
                return ret;
 
        idx = srcu_read_lock(&vcpu->kvm->srcu);
-       if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) {
+       if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL)) {
                ret = H_TOO_HARD;
                goto unlock_exit;
        }
@@ -599,10 +628,26 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
                ret = kvmppc_tce_validate(stt, tce);
                if (ret != H_SUCCESS)
                        goto unlock_exit;
+       }
+
+       for (i = 0; i < npages; ++i) {
+               /*
+                * This looks unsafe, because we validate, then regrab
+                * the TCE from userspace, which could have been changed by
+                * another thread.
+                *
+                * But it actually is safe, because the relevant checks will be
+                * re-executed in the following code.  If userspace tries to
+                * change this dodgily it will result in a messier failure mode
+                * but won't threaten the host.
+                */
+               if (get_user(tce, tces + i)) {
+                       ret = H_TOO_HARD;
+                       goto unlock_exit;
+               }
+               tce = be64_to_cpu(tce);
 
-               if (kvmppc_gpa_to_ua(vcpu->kvm,
-                               tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
-                               &ua, NULL))
+               if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
                        return H_PARAMETER;
 
                list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
@@ -610,14 +655,10 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
                                        stit->tbl, entry + i, ua,
                                        iommu_tce_direction(tce));
 
-                       if (ret == H_SUCCESS)
-                               continue;
-
-                       if (ret == H_TOO_HARD)
+                       if (ret != H_SUCCESS) {
+                               kvmppc_clear_tce(stit->tbl, entry);
                                goto unlock_exit;
-
-                       WARN_ON_ONCE(1);
-                       kvmppc_clear_tce(stit->tbl, entry);
+                       }
                }
 
                kvmppc_tce_put(stt, entry + i, tce);
index 506a4d4..ec99363 100644
@@ -87,6 +87,7 @@ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm,
 }
 EXPORT_SYMBOL_GPL(kvmppc_find_table);
 
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 /*
  * Validates TCE address.
  * At the moment flags and page mask are validated.
@@ -94,14 +95,14 @@ EXPORT_SYMBOL_GPL(kvmppc_find_table);
  * to the table and user space is supposed to process them), we can skip
  * checking other things (such as TCE is a guest RAM address or the page
  * was actually allocated).
- *
- * WARNING: This will be called in real-mode on HV KVM and virtual
- *          mode on PR KVM
  */
-long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
+static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt,
+               unsigned long tce)
 {
        unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
        enum dma_data_direction dir = iommu_tce_direction(tce);
+       struct kvmppc_spapr_tce_iommu_table *stit;
+       unsigned long ua = 0;
 
        /* Allow userspace to poison TCE table */
        if (dir == DMA_NONE)
@@ -110,9 +111,25 @@ long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
        if (iommu_tce_check_gpa(stt->page_shift, gpa))
                return H_PARAMETER;
 
+       if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL))
+               return H_TOO_HARD;
+
+       list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+               unsigned long hpa = 0;
+               struct mm_iommu_table_group_mem_t *mem;
+               long shift = stit->tbl->it_page_shift;
+
+               mem = mm_iommu_lookup_rm(stt->kvm->mm, ua, 1ULL << shift);
+               if (!mem)
+                       return H_TOO_HARD;
+
+               if (mm_iommu_ua_to_hpa_rm(mem, ua, shift, &hpa))
+                       return H_TOO_HARD;
+       }
+
        return H_SUCCESS;
 }
-EXPORT_SYMBOL_GPL(kvmppc_tce_validate);
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 /* Note on the use of page_address() in real mode,
  *
@@ -164,10 +181,10 @@ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
 }
 EXPORT_SYMBOL_GPL(kvmppc_tce_put);
 
-long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
+long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
                unsigned long *ua, unsigned long **prmap)
 {
-       unsigned long gfn = gpa >> PAGE_SHIFT;
+       unsigned long gfn = tce >> PAGE_SHIFT;
        struct kvm_memory_slot *memslot;
 
        memslot = search_memslots(kvm_memslots(kvm), gfn);
@@ -175,7 +192,7 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
                return -EINVAL;
 
        *ua = __gfn_to_hva_memslot(memslot, gfn) |
-               (gpa & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
+               (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
        if (prmap)
@@ -184,15 +201,38 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
 
        return 0;
 }
-EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
+EXPORT_SYMBOL_GPL(kvmppc_tce_to_ua);
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-static void kvmppc_rm_clear_tce(struct iommu_table *tbl, unsigned long entry)
+static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl,
+               unsigned long entry, unsigned long *hpa,
+               enum dma_data_direction *direction)
+{
+       long ret;
+
+       ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
+
+       if (!ret && ((*direction == DMA_FROM_DEVICE) ||
+                               (*direction == DMA_BIDIRECTIONAL))) {
+               __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
+               /*
+                * kvmppc_rm_tce_iommu_do_map() updates the UA cache after
+                * calling this so we still get here a valid UA.
+                * calling this, so the UA we read here is still valid.
+               if (pua && *pua)
+                       mm_iommu_ua_mark_dirty_rm(mm, be64_to_cpu(*pua));
+       }
+
+       return ret;
+}
+
+static void kvmppc_rm_clear_tce(struct kvm *kvm, struct iommu_table *tbl,
+               unsigned long entry)
 {
        unsigned long hpa = 0;
        enum dma_data_direction dir = DMA_NONE;
 
-       iommu_tce_xchg_rm(tbl, entry, &hpa, &dir);
+       iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir);
 }
 
 static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
@@ -224,7 +264,7 @@ static long kvmppc_rm_tce_iommu_do_unmap(struct kvm *kvm,
        unsigned long hpa = 0;
        long ret;
 
-       if (iommu_tce_xchg_rm(tbl, entry, &hpa, &dir))
+       if (iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir))
                /*
                 * real mode xchg can fail if struct page crosses
                 * a page boundary
@@ -236,7 +276,7 @@ static long kvmppc_rm_tce_iommu_do_unmap(struct kvm *kvm,
 
        ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry);
        if (ret)
-               iommu_tce_xchg_rm(tbl, entry, &hpa, &dir);
+               iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir);
 
        return ret;
 }
@@ -277,12 +317,12 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
 
        if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, tbl->it_page_shift,
                        &hpa)))
-               return H_HARDWARE;
+               return H_TOO_HARD;
 
        if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem)))
-               return H_CLOSED;
+               return H_TOO_HARD;
 
-       ret = iommu_tce_xchg_rm(tbl, entry, &hpa, &dir);
+       ret = iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir);
        if (ret) {
                mm_iommu_mapped_dec(mem);
                /*
@@ -345,13 +385,12 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
        if (ret != H_SUCCESS)
                return ret;
 
-       ret = kvmppc_tce_validate(stt, tce);
+       ret = kvmppc_rm_tce_validate(stt, tce);
        if (ret != H_SUCCESS)
                return ret;
 
        dir = iommu_tce_direction(tce);
-       if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm,
-                       tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL))
+       if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
                return H_PARAMETER;
 
        entry = ioba >> stt->page_shift;
@@ -364,14 +403,10 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
                        ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt,
                                        stit->tbl, entry, ua, dir);
 
-               if (ret == H_SUCCESS)
-                       continue;
-
-               if (ret == H_TOO_HARD)
+               if (ret != H_SUCCESS) {
+                       kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
                        return ret;
-
-               WARN_ON_ONCE_RM(1);
-               kvmppc_rm_clear_tce(stit->tbl, entry);
+               }
        }
 
        kvmppc_tce_put(stt, entry, tce);
@@ -457,7 +492,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
                 */
                struct mm_iommu_table_group_mem_t *mem;
 
-               if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL))
+               if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL))
                        return H_TOO_HARD;
 
                mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K);
@@ -473,12 +508,12 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
                 * We do not require memory to be preregistered in this case
                 * so lock rmap and do __find_linux_pte_or_hugepte().
                 */
-               if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
+               if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
                        return H_TOO_HARD;
 
                rmap = (void *) vmalloc_to_phys(rmap);
                if (WARN_ON_ONCE_RM(!rmap))
-                       return H_HARDWARE;
+                       return H_TOO_HARD;
 
                /*
                 * Synchronize with the MMU notifier callbacks in
@@ -498,14 +533,16 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
        for (i = 0; i < npages; ++i) {
                unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
 
-               ret = kvmppc_tce_validate(stt, tce);
+               ret = kvmppc_rm_tce_validate(stt, tce);
                if (ret != H_SUCCESS)
                        goto unlock_exit;
+       }
+
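+       /* All TCEs were validated above; now do the actual updates. */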
+       for (i = 0; i < npages; ++i) {
+               unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
 
                ua = 0;
-               if (kvmppc_gpa_to_ua(vcpu->kvm,
-                               tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
-                               &ua, NULL))
+               if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
                        return H_PARAMETER;
 
                list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
@@ -513,14 +550,11 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
                                        stit->tbl, entry + i, ua,
                                        iommu_tce_direction(tce));
 
-                       if (ret == H_SUCCESS)
-                               continue;
-
-                       if (ret == H_TOO_HARD)
+                       if (ret != H_SUCCESS) {
+                               kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl,
+                                               entry);
                                goto unlock_exit;
-
-                       WARN_ON_ONCE_RM(1);
-                       kvmppc_rm_clear_tce(stit->tbl, entry);
+                       }
                }
 
                kvmppc_tce_put(stt, entry + i, tce);
@@ -571,7 +605,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
                                return ret;
 
                        WARN_ON_ONCE_RM(1);
-                       kvmppc_rm_clear_tce(stit->tbl, entry);
+                       kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
                }
        }
 
index 36b11c5..8c7e933 100644
@@ -36,7 +36,6 @@
 #define OP_31_XOP_MTSR         210
 #define OP_31_XOP_MTSRIN       242
 #define OP_31_XOP_TLBIEL       274
-#define OP_31_XOP_TLBIE                306
 /* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */
 #define OP_31_XOP_FAKE_SC1     308
 #define OP_31_XOP_SLBMTE       402
@@ -110,7 +109,7 @@ static inline void kvmppc_copyto_vcpu_tm(struct kvm_vcpu *vcpu)
        vcpu->arch.ctr_tm = vcpu->arch.regs.ctr;
        vcpu->arch.tar_tm = vcpu->arch.tar;
        vcpu->arch.lr_tm = vcpu->arch.regs.link;
-       vcpu->arch.cr_tm = vcpu->arch.cr;
+       vcpu->arch.cr_tm = vcpu->arch.regs.ccr;
        vcpu->arch.xer_tm = vcpu->arch.regs.xer;
        vcpu->arch.vrsave_tm = vcpu->arch.vrsave;
 }
@@ -129,7 +128,7 @@ static inline void kvmppc_copyfrom_vcpu_tm(struct kvm_vcpu *vcpu)
        vcpu->arch.regs.ctr = vcpu->arch.ctr_tm;
        vcpu->arch.tar = vcpu->arch.tar_tm;
        vcpu->arch.regs.link = vcpu->arch.lr_tm;
-       vcpu->arch.cr = vcpu->arch.cr_tm;
+       vcpu->arch.regs.ccr = vcpu->arch.cr_tm;
        vcpu->arch.regs.xer = vcpu->arch.xer_tm;
        vcpu->arch.vrsave = vcpu->arch.vrsave_tm;
 }
@@ -141,7 +140,7 @@ static void kvmppc_emulate_treclaim(struct kvm_vcpu *vcpu, int ra_val)
        uint64_t texasr;
 
        /* CR0 = 0 | MSR[TS] | 0 */
-       vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) |
+       vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) |
                (((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1))
                 << CR0_SHIFT);
 
@@ -220,7 +219,7 @@ void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val)
        tm_abort(ra_val);
 
        /* CR0 = 0 | MSR[TS] | 0 */
-       vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) |
+       vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) |
                (((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1))
                 << CR0_SHIFT);
 
@@ -494,8 +493,8 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
                        if (!(kvmppc_get_msr(vcpu) & MSR_PR)) {
                                preempt_disable();
-                               vcpu->arch.cr = (CR0_TBEGIN_FAILURE |
-                                 (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)));
+                               vcpu->arch.regs.ccr = (CR0_TBEGIN_FAILURE |
+                                 (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)));
 
                                vcpu->arch.texasr = (TEXASR_FS | TEXASR_EXACT |
                                        (((u64)(TM_CAUSE_EMULATE | TM_CAUSE_PERSISTENT))
index 3e3a715..788bc61 100644
@@ -50,6 +50,7 @@
 #include <asm/reg.h>
 #include <asm/ppc-opcode.h>
 #include <asm/asm-prototypes.h>
+#include <asm/archrandom.h>
 #include <asm/debug.h>
 #include <asm/disassemble.h>
 #include <asm/cputable.h>
@@ -104,6 +105,10 @@ static bool indep_threads_mode = true;
 module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)");
 
+static bool one_vm_per_core;
+module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires indep_threads_mode=N)");
+
 #ifdef CONFIG_KVM_XICS
 static struct kernel_param_ops module_param_ops = {
        .set = param_set_int,
@@ -117,6 +122,16 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, 0644);
 MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
 #endif
 
+/* If set, guests are allowed to create and control nested guests */
+static bool nested = true;
+module_param(nested, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(nested, "Enable nested virtualization (only on POWER9)");
+
+static inline bool nesting_enabled(struct kvm *kvm)
+{
+       return kvm->arch.nested_enable && kvm_is_radix(kvm);
+}
+
 /* If set, the threads on each CPU core have to be in the same MMU mode */
 static bool no_mixing_hpt_and_radix;
 
@@ -173,6 +188,10 @@ static bool kvmppc_ipi_thread(int cpu)
 {
        unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
 
+       /* If we're a nested hypervisor, fall back to ordinary IPIs for now */
+       if (kvmhv_on_pseries())
+               return false;
+
        /* On POWER9 we can use msgsnd to IPI any cpu */
        if (cpu_has_feature(CPU_FTR_ARCH_300)) {
                msg |= get_hard_smp_processor_id(cpu);
@@ -410,8 +429,8 @@ static void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
               vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
        pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
               vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
-       pr_err("cr = %.8x  xer = %.16lx  dsisr = %.8x\n",
-              vcpu->arch.cr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
+       pr_err("cr = %.8lx  xer = %.16lx  dsisr = %.8x\n",
+              vcpu->arch.regs.ccr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
        pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
        pr_err("fault dar = %.16lx dsisr = %.8x\n",
               vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
@@ -730,8 +749,7 @@ static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
        /*
         * Ensure that the read of vcore->dpdes comes after the read
         * of vcpu->doorbell_request.  This barrier matches the
-        * lwsync in book3s_hv_rmhandlers.S just before the
-        * fast_guest_return label.
+        * smp_wmb() in kvmppc_guest_entry_inject().
         */
        smp_rmb();
        vc = vcpu->arch.vcore;
@@ -912,6 +930,19 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
                        break;
                }
                return RESUME_HOST;
+       case H_SET_DABR:
+               ret = kvmppc_h_set_dabr(vcpu, kvmppc_get_gpr(vcpu, 4));
+               break;
+       case H_SET_XDABR:
+               ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                               kvmppc_get_gpr(vcpu, 5));
+               break;
+       case H_GET_TCE:
+               ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                               kvmppc_get_gpr(vcpu, 5));
+               if (ret == H_TOO_HARD)
+                       return RESUME_HOST;
+               break;
        case H_PUT_TCE:
                ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
                                                kvmppc_get_gpr(vcpu, 5),
@@ -935,6 +966,32 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
                if (ret == H_TOO_HARD)
                        return RESUME_HOST;
                break;
+       case H_RANDOM:
+               if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4]))
+                       ret = H_HARDWARE;
+               break;
+
+       case H_SET_PARTITION_TABLE:
+               ret = H_FUNCTION;
+               if (nesting_enabled(vcpu->kvm))
+                       ret = kvmhv_set_partition_table(vcpu);
+               break;
+       case H_ENTER_NESTED:
+               ret = H_FUNCTION;
+               if (!nesting_enabled(vcpu->kvm))
+                       break;
+               ret = kvmhv_enter_nested_guest(vcpu);
+               if (ret == H_INTERRUPT) {
+                       kvmppc_set_gpr(vcpu, 3, 0);
+                       return -EINTR;
+               }
+               break;
+       case H_TLB_INVALIDATE:
+               ret = H_FUNCTION;
+               if (nesting_enabled(vcpu->kvm))
+                       ret = kvmhv_do_nested_tlbie(vcpu);
+               break;
+
        default:
                return RESUME_HOST;
        }
@@ -943,6 +1000,24 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
        return RESUME_GUEST;
 }
 
+/*
+ * Handle H_CEDE in the nested virtualization case where we haven't
+ * called the real-mode hcall handlers in book3s_hv_rmhandlers.S.
+ * This has to be done early, not in kvmppc_pseries_do_hcall(), so
+ * that the cede logic in kvmppc_run_single_vcpu() works properly.
+ */
+static void kvmppc_nested_cede(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.shregs.msr |= MSR_EE;
+       vcpu->arch.ceded = 1;
+       smp_mb();
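+       /*
+        * Order the store to ceded before the load of prodded; assumed
+        * to pair with the barrier on the H_PROD side.
+        */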
+       if (vcpu->arch.prodded) {
+               vcpu->arch.prodded = 0;
+               smp_mb();
+               vcpu->arch.ceded = 0;
+       }
+}
+
 static int kvmppc_hcall_impl_hv(unsigned long cmd)
 {
        switch (cmd) {
@@ -1085,7 +1160,6 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
        return RESUME_GUEST;
 }
 
-/* Called with vcpu->arch.vcore->lock held */
 static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                 struct task_struct *tsk)
 {
@@ -1190,7 +1264,10 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
                break;
        case BOOK3S_INTERRUPT_H_INST_STORAGE:
                vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
-               vcpu->arch.fault_dsisr = 0;
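+               /*
+                * For an instruction storage interrupt the DSISR-style
+                * bits are carried in (H)SRR1; synthesize a DSISR value
+                * from them for the common page-fault path.
+                */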
+               vcpu->arch.fault_dsisr = vcpu->arch.shregs.msr &
+                       DSISR_SRR1_MATCH_64S;
+               if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
+                       vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
                r = RESUME_PAGE_FAULT;
                break;
        /*
@@ -1206,10 +1283,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                swab32(vcpu->arch.emul_inst) :
                                vcpu->arch.emul_inst;
                if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
-                       /* Need vcore unlocked to call kvmppc_get_last_inst */
-                       spin_unlock(&vcpu->arch.vcore->lock);
                        r = kvmppc_emulate_debug_inst(run, vcpu);
-                       spin_lock(&vcpu->arch.vcore->lock);
                } else {
                        kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
                        r = RESUME_GUEST;
@@ -1225,12 +1299,8 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
        case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
                r = EMULATE_FAIL;
                if (((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) &&
-                   cpu_has_feature(CPU_FTR_ARCH_300)) {
-                       /* Need vcore unlocked to call kvmppc_get_last_inst */
-                       spin_unlock(&vcpu->arch.vcore->lock);
+                   cpu_has_feature(CPU_FTR_ARCH_300))
                        r = kvmppc_emulate_doorbell_instr(vcpu);
-                       spin_lock(&vcpu->arch.vcore->lock);
-               }
                if (r == EMULATE_FAIL) {
                        kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
                        r = RESUME_GUEST;
@@ -1265,6 +1335,104 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
        return r;
 }
 
+static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
+{
+       int r;
+       int srcu_idx;
+
+       vcpu->stat.sum_exits++;
+
+       /*
+        * This can happen if an interrupt occurs in the last stages
+        * of guest entry or the first stages of guest exit (i.e. after
+        * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
+        * and before setting it to KVM_GUEST_MODE_HOST_HV).
+        * That can happen due to a bug, or due to a machine check
+        * occurring at just the wrong time.
+        */
+       if (vcpu->arch.shregs.msr & MSR_HV) {
+               pr_emerg("KVM trap in HV mode while nested!\n");
+               pr_emerg("trap=0x%x | pc=0x%lx | msr=0x%llx\n",
+                        vcpu->arch.trap, kvmppc_get_pc(vcpu),
+                        vcpu->arch.shregs.msr);
+               kvmppc_dump_regs(vcpu);
+               return RESUME_HOST;
+       }
+       switch (vcpu->arch.trap) {
+       /* We're good on these - the host merely wanted to get our attention */
+       case BOOK3S_INTERRUPT_HV_DECREMENTER:
+               vcpu->stat.dec_exits++;
+               r = RESUME_GUEST;
+               break;
+       case BOOK3S_INTERRUPT_EXTERNAL:
+               vcpu->stat.ext_intr_exits++;
+               r = RESUME_HOST;
+               break;
+       case BOOK3S_INTERRUPT_H_DOORBELL:
+       case BOOK3S_INTERRUPT_H_VIRT:
+               vcpu->stat.ext_intr_exits++;
+               r = RESUME_GUEST;
+               break;
+       /* SR/HMI/PMI are HV interrupts that the host has handled. Resume guest. */
+       case BOOK3S_INTERRUPT_HMI:
+       case BOOK3S_INTERRUPT_PERFMON:
+       case BOOK3S_INTERRUPT_SYSTEM_RESET:
+               r = RESUME_GUEST;
+               break;
+       case BOOK3S_INTERRUPT_MACHINE_CHECK:
+               /* Pass the machine check to the L1 guest */
+               r = RESUME_HOST;
+               /* Print the MCE event to host console. */
+               machine_check_print_event_info(&vcpu->arch.mce_evt, false);
+               break;
+       /*
+        * We get these next two if the guest accesses a page which it thinks
+        * it has mapped but which is not actually present, either because
+        * it is for an emulated I/O device or because the corresponding
+        * host page has been paged out.
+        */
+       case BOOK3S_INTERRUPT_H_DATA_STORAGE:
+               srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+               r = kvmhv_nested_page_fault(vcpu);
+               srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+               break;
+       case BOOK3S_INTERRUPT_H_INST_STORAGE:
+               vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
+               vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) &
+                                        DSISR_SRR1_MATCH_64S;
+               if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
+                       vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
+               srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+               r = kvmhv_nested_page_fault(vcpu);
+               srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+               break;
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       case BOOK3S_INTERRUPT_HV_SOFTPATCH:
+               /*
+                * This occurs for various TM-related instructions that
+                * we need to emulate on POWER9 DD2.2.  We have already
+                * handled the cases where the guest was in real-suspend
+                * mode and was transitioning to transactional state.
+                */
+               r = kvmhv_p9_tm_emulation(vcpu);
+               break;
+#endif
+
+       case BOOK3S_INTERRUPT_HV_RM_HARD:
+               vcpu->arch.trap = 0;
+               r = RESUME_GUEST;
+               if (!xive_enabled())
+                       kvmppc_xics_rm_complete(vcpu, 0);
+               break;
+       default:
+               r = RESUME_HOST;
+               break;
+       }
+
+       return r;
+}
+
 static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
                                            struct kvm_sregs *sregs)
 {
@@ -1555,6 +1723,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
        case KVM_REG_PPC_ONLINE:
                *val = get_reg_val(id, vcpu->arch.online);
                break;
+       case KVM_REG_PPC_PTCR:
+               *val = get_reg_val(id, vcpu->kvm->arch.l1_ptcr);
+               break;
        default:
                r = -EINVAL;
                break;
@@ -1786,6 +1957,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
                        atomic_dec(&vcpu->arch.vcore->online_count);
                vcpu->arch.online = i;
                break;
+       case KVM_REG_PPC_PTCR:
+               vcpu->kvm->arch.l1_ptcr = set_reg_val(id, *val);
+               break;
        default:
                r = -EINVAL;
                break;
@@ -2019,15 +2193,18 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
         * Set the default HFSCR for the guest from the host value.
         * This value is only used on POWER9.
         * On POWER9, we want to virtualize the doorbell facility, so we
-        * turn off the HFSCR bit, which causes those instructions to trap.
+        * don't set the HFSCR_MSGP bit, and that causes those instructions
+        * to trap and then we emulate them.
         */
-       vcpu->arch.hfscr = mfspr(SPRN_HFSCR);
-       if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+       vcpu->arch.hfscr = HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB |
+               HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP;
+       if (cpu_has_feature(CPU_FTR_HVMODE)) {
+               vcpu->arch.hfscr &= mfspr(SPRN_HFSCR);
+               if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+                       vcpu->arch.hfscr |= HFSCR_TM;
+       }
+       if (cpu_has_feature(CPU_FTR_TM_COMP))
                vcpu->arch.hfscr |= HFSCR_TM;
-       else if (!cpu_has_feature(CPU_FTR_TM_COMP))
-               vcpu->arch.hfscr &= ~HFSCR_TM;
-       if (cpu_has_feature(CPU_FTR_ARCH_300))
-               vcpu->arch.hfscr &= ~HFSCR_MSGP;
 
        kvmppc_mmu_book3s_hv_init(vcpu);
 
@@ -2242,10 +2419,18 @@ static void kvmppc_release_hwthread(int cpu)
 
 static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
 {
+       struct kvm_nested_guest *nested = vcpu->arch.nested;
+       cpumask_t *cpu_in_guest;
        int i;
 
        cpu = cpu_first_thread_sibling(cpu);
-       cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
+       if (nested) {
+               cpumask_set_cpu(cpu, &nested->need_tlb_flush);
+               cpu_in_guest = &nested->cpu_in_guest;
+       } else {
+               cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
+               cpu_in_guest = &kvm->arch.cpu_in_guest;
+       }
        /*
         * Make sure setting of bit in need_tlb_flush precedes
         * testing of cpu_in_guest bits.  The matching barrier on
@@ -2253,13 +2438,23 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
         */
        smp_mb();
        for (i = 0; i < threads_per_core; ++i)
-               if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest))
+               if (cpumask_test_cpu(cpu + i, cpu_in_guest))
                        smp_call_function_single(cpu + i, do_nothing, NULL, 1);
 }
 
 static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
 {
+       struct kvm_nested_guest *nested = vcpu->arch.nested;
        struct kvm *kvm = vcpu->kvm;
+       int prev_cpu;
+
+       if (!cpu_has_feature(CPU_FTR_HVMODE))
+               return;
+
+       if (nested)
+               prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id];
+       else
+               prev_cpu = vcpu->arch.prev_cpu;
 
        /*
         * With radix, the guest can do TLB invalidations itself,
@@ -2273,12 +2468,46 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
         * ran to flush the TLB.  The TLB is shared between threads,
         * so we use a single bit in .need_tlb_flush for all 4 threads.
         */
-       if (vcpu->arch.prev_cpu != pcpu) {
-               if (vcpu->arch.prev_cpu >= 0 &&
-                   cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
+       if (prev_cpu != pcpu) {
+               if (prev_cpu >= 0 &&
+                   cpu_first_thread_sibling(prev_cpu) !=
                    cpu_first_thread_sibling(pcpu))
-                       radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
-               vcpu->arch.prev_cpu = pcpu;
+                       radix_flush_cpu(kvm, prev_cpu, vcpu);
+               if (nested)
+                       nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
+               else
+                       vcpu->arch.prev_cpu = pcpu;
+       }
+}
+
+static void kvmppc_radix_check_need_tlb_flush(struct kvm *kvm, int pcpu,
+                                             struct kvm_nested_guest *nested)
+{
+       cpumask_t *need_tlb_flush;
+       int lpid;
+
+       if (!cpu_has_feature(CPU_FTR_HVMODE))
+               return;
+
+       if (cpu_has_feature(CPU_FTR_ARCH_300))
+               pcpu &= ~0x3UL;
+
+       if (nested) {
+               lpid = nested->shadow_lpid;
+               need_tlb_flush = &nested->need_tlb_flush;
+       } else {
+               lpid = kvm->arch.lpid;
+               need_tlb_flush = &kvm->arch.need_tlb_flush;
+       }
+
+       mtspr(SPRN_LPID, lpid);
+       isync();
+       smp_mb();
+
+       if (cpumask_test_cpu(pcpu, need_tlb_flush)) {
+               radix__local_flush_tlb_lpid_guest(lpid);
+               /* Clear the bit after the TLB flush */
+               cpumask_clear_cpu(pcpu, need_tlb_flush);
        }
 }
 
@@ -2493,6 +2722,10 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
        if (!cpu_has_feature(CPU_FTR_ARCH_207S))
                return false;
 
+       /* In one_vm_per_core mode, require all vcores to be from the same vm */
+       if (one_vm_per_core && vc->kvm != cip->vc[0]->kvm)
+               return false;
+
        /* Some POWER9 chips require all threads to be in the same MMU mode */
        if (no_mixing_hpt_and_radix &&
            kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm))
@@ -2600,6 +2833,14 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
        spin_lock(&vc->lock);
        now = get_tb();
        for_each_runnable_thread(i, vcpu, vc) {
+               /*
+                * It's safe to unlock the vcore in the loop here, because
+                * for_each_runnable_thread() is safe against removal of
+                * the vcpu, and the vcore state is VCORE_EXITING here,
+                * so any vcpus becoming runnable will have their arch.trap
+                * set to zero and can't actually run in the guest.
+                */
+               spin_unlock(&vc->lock);
                /* cancel pending dec exception if dec is positive */
                if (now < vcpu->arch.dec_expires &&
                    kvmppc_core_pending_dec(vcpu))
@@ -2615,6 +2856,7 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
                vcpu->arch.ret = ret;
                vcpu->arch.trap = 0;
 
+               spin_lock(&vc->lock);
                if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
                        if (vcpu->arch.pending_exceptions)
                                kvmppc_core_prepare_to_enter(vcpu);
@@ -2963,8 +3205,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
                spin_unlock(&core_info.vc[sub]->lock);
 
        if (kvm_is_radix(vc->kvm)) {
-               int tmp = pcpu;
-
                /*
                 * Do we need to flush the process scoped TLB for the LPAR?
                 *
@@ -2975,17 +3215,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
                 *
                 * Hash must be flushed in realmode in order to use tlbiel.
                 */
-               mtspr(SPRN_LPID, vc->kvm->arch.lpid);
-               isync();
-
-               if (cpu_has_feature(CPU_FTR_ARCH_300))
-                       tmp &= ~0x3UL;
-
-               if (cpumask_test_cpu(tmp, &vc->kvm->arch.need_tlb_flush)) {
-                       radix__local_flush_tlb_lpid_guest(vc->kvm->arch.lpid);
-                       /* Clear the bit after the TLB flush */
-                       cpumask_clear_cpu(tmp, &vc->kvm->arch.need_tlb_flush);
-               }
+               kvmppc_radix_check_need_tlb_flush(vc->kvm, pcpu, NULL);
        }
 
        /*
@@ -3080,6 +3310,300 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 }
 
 /*
+ * Load up hypervisor-mode registers on P9.
+ */
+static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
+                                    unsigned long lpcr)
+{
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+       s64 hdec;
+       u64 tb, purr, spurr;
+       int trap;
+       unsigned long host_hfscr = mfspr(SPRN_HFSCR);
+       unsigned long host_ciabr = mfspr(SPRN_CIABR);
+       unsigned long host_dawr = mfspr(SPRN_DAWR);
+       unsigned long host_dawrx = mfspr(SPRN_DAWRX);
+       unsigned long host_psscr = mfspr(SPRN_PSSCR);
+       unsigned long host_pidr = mfspr(SPRN_PID);
+
+       hdec = time_limit - mftb();
+       if (hdec < 0)
+               return BOOK3S_INTERRUPT_HV_DECREMENTER;
+       mtspr(SPRN_HDEC, hdec);
+
+       if (vc->tb_offset) {
+               u64 new_tb = mftb() + vc->tb_offset;
+               mtspr(SPRN_TBU40, new_tb);
+               tb = mftb();
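+               /*
+                * TBU40 sets only the top 40 bits of the timebase; if the
+                * low 24 bits wrapped while we were writing it, the carry
+                * into the upper bits was lost, so add 1 << 24 back.
+                */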
+               if ((tb & 0xffffff) < (new_tb & 0xffffff))
+                       mtspr(SPRN_TBU40, new_tb + 0x1000000);
+               vc->tb_offset_applied = vc->tb_offset;
+       }
+
+       if (vc->pcr)
+               mtspr(SPRN_PCR, vc->pcr);
+       mtspr(SPRN_DPDES, vc->dpdes);
+       mtspr(SPRN_VTB, vc->vtb);
+
+       local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR);
+       local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR);
+       mtspr(SPRN_PURR, vcpu->arch.purr);
+       mtspr(SPRN_SPURR, vcpu->arch.spurr);
+
+       if (cpu_has_feature(CPU_FTR_DAWR)) {
+               mtspr(SPRN_DAWR, vcpu->arch.dawr);
+               mtspr(SPRN_DAWRX, vcpu->arch.dawrx);
+       }
+       mtspr(SPRN_CIABR, vcpu->arch.ciabr);
+       mtspr(SPRN_IC, vcpu->arch.ic);
+       mtspr(SPRN_PID, vcpu->arch.pid);
+
+       mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
+             (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
+
+       mtspr(SPRN_HFSCR, vcpu->arch.hfscr);
+
+       mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0);
+       mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1);
+       mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2);
+       mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3);
+
+       mtspr(SPRN_AMOR, ~0UL);
+
+       mtspr(SPRN_LPCR, lpcr);
+       isync();
+
+       kvmppc_xive_push_vcpu(vcpu);
+
+       mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0);
+       mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1);
+
+       trap = __kvmhv_vcpu_entry_p9(vcpu);
+
+       /* Advance host PURR/SPURR by the amount used by guest */
+       purr = mfspr(SPRN_PURR);
+       spurr = mfspr(SPRN_SPURR);
+       mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr +
+             purr - vcpu->arch.purr);
+       mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr +
+             spurr - vcpu->arch.spurr);
+       vcpu->arch.purr = purr;
+       vcpu->arch.spurr = spurr;
+
+       vcpu->arch.ic = mfspr(SPRN_IC);
+       vcpu->arch.pid = mfspr(SPRN_PID);
+       vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS;
+
+       vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0);
+       vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1);
+       vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
+       vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
+
+       mtspr(SPRN_PSSCR, host_psscr);
+       mtspr(SPRN_HFSCR, host_hfscr);
+       mtspr(SPRN_CIABR, host_ciabr);
+       mtspr(SPRN_DAWR, host_dawr);
+       mtspr(SPRN_DAWRX, host_dawrx);
+       mtspr(SPRN_PID, host_pidr);
+
+       /*
+        * Since this is radix, do an eieio; tlbsync; ptesync sequence in
+        * case we interrupted the guest between a tlbie and a ptesync.
+        */
+       asm volatile("eieio; tlbsync; ptesync");
+
+       mtspr(SPRN_LPID, vcpu->kvm->arch.host_lpid);    /* restore host LPID */
+       isync();
+
+       vc->dpdes = mfspr(SPRN_DPDES);
+       vc->vtb = mfspr(SPRN_VTB);
+       mtspr(SPRN_DPDES, 0);
+       if (vc->pcr)
+               mtspr(SPRN_PCR, 0);
+
+       if (vc->tb_offset_applied) {
+               u64 new_tb = mftb() - vc->tb_offset_applied;
+               mtspr(SPRN_TBU40, new_tb);
+               tb = mftb();
+               if ((tb & 0xffffff) < (new_tb & 0xffffff))
+                       mtspr(SPRN_TBU40, new_tb + 0x1000000);
+               vc->tb_offset_applied = 0;
+       }
+
+       mtspr(SPRN_HDEC, 0x7fffffff);
+       mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr);
+
+       return trap;
+}
+
+/*
+ * Virtual-mode guest entry for POWER9 and later when the host and
+ * guest are both using the radix MMU.  The LPIDR has already been set.
+ */
+int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
+                        unsigned long lpcr)
+{
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+       unsigned long host_dscr = mfspr(SPRN_DSCR);
+       unsigned long host_tidr = mfspr(SPRN_TIDR);
+       unsigned long host_iamr = mfspr(SPRN_IAMR);
+       s64 dec;
+       u64 tb;
+       int trap, save_pmu;
+
+       dec = mfspr(SPRN_DEC);
+       tb = mftb();
+       if (dec < 512)
+               return BOOK3S_INTERRUPT_HV_DECREMENTER;
+       local_paca->kvm_hstate.dec_expires = dec + tb;
+       if (local_paca->kvm_hstate.dec_expires < time_limit)
+               time_limit = local_paca->kvm_hstate.dec_expires;
+
+       vcpu->arch.ceded = 0;
+
+       kvmhv_save_host_pmu();          /* saves it to PACA kvm_hstate */
+
+       kvmppc_subcore_enter_guest();
+
+       vc->entry_exit_map = 1;
+       vc->in_guest = 1;
+
+       if (vcpu->arch.vpa.pinned_addr) {
+               struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
+               u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
+               lp->yield_count = cpu_to_be32(yield_count);
+               vcpu->arch.vpa.dirty = 1;
+       }
+
+       if (cpu_has_feature(CPU_FTR_TM) ||
+           cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+               kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
+
+       kvmhv_load_guest_pmu(vcpu);
+
+       msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
+       load_fp_state(&vcpu->arch.fp);
+#ifdef CONFIG_ALTIVEC
+       load_vr_state(&vcpu->arch.vr);
+#endif
+
+       mtspr(SPRN_DSCR, vcpu->arch.dscr);
+       mtspr(SPRN_IAMR, vcpu->arch.iamr);
+       mtspr(SPRN_PSPB, vcpu->arch.pspb);
+       mtspr(SPRN_FSCR, vcpu->arch.fscr);
+       mtspr(SPRN_TAR, vcpu->arch.tar);
+       mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
+       mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
+       mtspr(SPRN_BESCR, vcpu->arch.bescr);
+       mtspr(SPRN_WORT, vcpu->arch.wort);
+       mtspr(SPRN_TIDR, vcpu->arch.tid);
+       mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
+       mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
+       mtspr(SPRN_AMR, vcpu->arch.amr);
+       mtspr(SPRN_UAMOR, vcpu->arch.uamor);
+
+       if (!(vcpu->arch.ctrl & 1))
+               mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1);
+
+       mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb());
+
+       if (kvmhv_on_pseries()) {
+               /* call our hypervisor to load up HV regs and go */
+               struct hv_guest_state hvregs;
+
+               kvmhv_save_hv_regs(vcpu, &hvregs);
+               hvregs.lpcr = lpcr;
+               vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
+               hvregs.version = HV_GUEST_STATE_VERSION;
+               if (vcpu->arch.nested) {
+                       hvregs.lpid = vcpu->arch.nested->shadow_lpid;
+                       hvregs.vcpu_token = vcpu->arch.nested_vcpu_id;
+               } else {
+                       hvregs.lpid = vcpu->kvm->arch.lpid;
+                       hvregs.vcpu_token = vcpu->vcpu_id;
+               }
+               hvregs.hdec_expiry = time_limit;
+               trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
+                                         __pa(&vcpu->arch.regs));
+               kvmhv_restore_hv_return_state(vcpu, &hvregs);
+               vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
+               vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
+               vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
+
+               /* H_CEDE has to be handled now, not later */
+               if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
+                   kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
+                       kvmppc_nested_cede(vcpu);
+                       trap = 0;
+               }
+       } else {
+               trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
+       }
+
+       vcpu->arch.slb_max = 0;
+       dec = mfspr(SPRN_DEC);
+       tb = mftb();
+       vcpu->arch.dec_expires = dec + tb;
+       vcpu->cpu = -1;
+       vcpu->arch.thread_cpu = -1;
+       vcpu->arch.ctrl = mfspr(SPRN_CTRLF);
+
+       vcpu->arch.iamr = mfspr(SPRN_IAMR);
+       vcpu->arch.pspb = mfspr(SPRN_PSPB);
+       vcpu->arch.fscr = mfspr(SPRN_FSCR);
+       vcpu->arch.tar = mfspr(SPRN_TAR);
+       vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
+       vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
+       vcpu->arch.bescr = mfspr(SPRN_BESCR);
+       vcpu->arch.wort = mfspr(SPRN_WORT);
+       vcpu->arch.tid = mfspr(SPRN_TIDR);
+       vcpu->arch.amr = mfspr(SPRN_AMR);
+       vcpu->arch.uamor = mfspr(SPRN_UAMOR);
+       vcpu->arch.dscr = mfspr(SPRN_DSCR);
+
+       mtspr(SPRN_PSPB, 0);
+       mtspr(SPRN_WORT, 0);
+       mtspr(SPRN_AMR, 0);
+       mtspr(SPRN_UAMOR, 0);
+       mtspr(SPRN_DSCR, host_dscr);
+       mtspr(SPRN_TIDR, host_tidr);
+       mtspr(SPRN_IAMR, host_iamr);
+
+       msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
+       store_fp_state(&vcpu->arch.fp);
+#ifdef CONFIG_ALTIVEC
+       store_vr_state(&vcpu->arch.vr);
+#endif
+
+       if (cpu_has_feature(CPU_FTR_TM) ||
+           cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+               kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
+
+       save_pmu = 1;
+       if (vcpu->arch.vpa.pinned_addr) {
+               struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
+               u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
+               lp->yield_count = cpu_to_be32(yield_count);
+               vcpu->arch.vpa.dirty = 1;
+               save_pmu = lp->pmcregs_in_use;
+       }
+
+       kvmhv_save_guest_pmu(vcpu, save_pmu);
+
+       vc->entry_exit_map = 0x101;
+       vc->in_guest = 0;
+
+       mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb());
+
+       kvmhv_load_host_pmu();
+
+       kvmppc_subcore_exit_guest();
+
+       return trap;
+}
+
+/*
  * Wait for some other vcpu thread to execute us, and
  * wake us up when we need to handle something in the host.
  */
@@ -3256,6 +3780,11 @@ out:
        trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
 }
 
+/*
+ * This never fails for a radix guest, as none of the operations it
+ * performs in that case can fail or has a way to report failure.
+ * kvmhv_run_single_vcpu() relies on this fact.
+ */
 static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
 {
        int r = 0;
@@ -3405,6 +3934,171 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        return vcpu->arch.ret;
 }
 
+int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
+                         struct kvm_vcpu *vcpu, u64 time_limit,
+                         unsigned long lpcr)
+{
+       int trap, r, pcpu;
+       int srcu_idx;
+       struct kvmppc_vcore *vc;
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_nested_guest *nested = vcpu->arch.nested;
+
+       trace_kvmppc_run_vcpu_enter(vcpu);
+
+       kvm_run->exit_reason = 0;
+       vcpu->arch.ret = RESUME_GUEST;
+       vcpu->arch.trap = 0;
+
+       vc = vcpu->arch.vcore;
+       vcpu->arch.ceded = 0;
+       vcpu->arch.run_task = current;
+       vcpu->arch.kvm_run = kvm_run;
+       vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
+       vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
+       vcpu->arch.busy_preempt = TB_NIL;
+       vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
+       vc->runnable_threads[0] = vcpu;
+       vc->n_runnable = 1;
+       vc->runner = vcpu;
+
+       /* See if the MMU is ready to go */
+       if (!kvm->arch.mmu_ready)
+               kvmhv_setup_mmu(vcpu);
+
+       if (need_resched())
+               cond_resched();
+
+       kvmppc_update_vpas(vcpu);
+
+       init_vcore_to_run(vc);
+       vc->preempt_tb = TB_NIL;
+
+       preempt_disable();
+       pcpu = smp_processor_id();
+       vc->pcpu = pcpu;
+       kvmppc_prepare_radix_vcpu(vcpu, pcpu);
+
+       local_irq_disable();
+       hard_irq_disable();
+       if (signal_pending(current))
+               goto sigpend;
+       if (lazy_irq_pending() || need_resched() || !kvm->arch.mmu_ready)
+               goto out;
+
+       if (!nested) {
+               kvmppc_core_prepare_to_enter(vcpu);
+               if (vcpu->arch.doorbell_request) {
+                       vc->dpdes = 1;
+                       smp_wmb();
+                       vcpu->arch.doorbell_request = 0;
+               }
+               if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
+                            &vcpu->arch.pending_exceptions))
+                       lpcr |= LPCR_MER;
+       } else if (vcpu->arch.pending_exceptions ||
+                  vcpu->arch.doorbell_request ||
+                  xive_interrupt_pending(vcpu)) {
+               vcpu->arch.ret = RESUME_HOST;
+               goto out;
+       }
+
+       kvmppc_clear_host_core(pcpu);
+
+       local_paca->kvm_hstate.tid = 0;
+       local_paca->kvm_hstate.napping = 0;
+       local_paca->kvm_hstate.kvm_split_mode = NULL;
+       kvmppc_start_thread(vcpu, vc);
+       kvmppc_create_dtl_entry(vcpu, vc);
+       trace_kvm_guest_enter(vcpu);
+
+       vc->vcore_state = VCORE_RUNNING;
+       trace_kvmppc_run_core(vc, 0);
+
+       if (cpu_has_feature(CPU_FTR_HVMODE))
+               kvmppc_radix_check_need_tlb_flush(kvm, pcpu, nested);
+
+       trace_hardirqs_on();
+       guest_enter_irqoff();
+
+       srcu_idx = srcu_read_lock(&kvm->srcu);
+
+       this_cpu_disable_ftrace();
+
+       trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr);
+       vcpu->arch.trap = trap;
+
+       this_cpu_enable_ftrace();
+
+       srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+       if (cpu_has_feature(CPU_FTR_HVMODE)) {
+               mtspr(SPRN_LPID, kvm->arch.host_lpid);
+               isync();
+       }
+
+       trace_hardirqs_off();
+       set_irq_happened(trap);
+
+       kvmppc_set_host_core(pcpu);
+
+       local_irq_enable();
+       guest_exit();
+
+       cpumask_clear_cpu(pcpu, &kvm->arch.cpu_in_guest);
+
+       preempt_enable();
+
+       /* cancel pending decrementer exception if DEC is now positive */
+       if (get_tb() < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
+               kvmppc_core_dequeue_dec(vcpu);
+
+       trace_kvm_guest_exit(vcpu);
+       r = RESUME_GUEST;
+       if (trap) {
+               if (!nested)
+                       r = kvmppc_handle_exit_hv(kvm_run, vcpu, current);
+               else
+                       r = kvmppc_handle_nested_exit(vcpu);
+       }
+       vcpu->arch.ret = r;
+
+       if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded &&
+           !kvmppc_vcpu_woken(vcpu)) {
+               kvmppc_set_timer(vcpu);
+               while (vcpu->arch.ceded && !kvmppc_vcpu_woken(vcpu)) {
+                       if (signal_pending(current)) {
+                               vcpu->stat.signal_exits++;
+                               kvm_run->exit_reason = KVM_EXIT_INTR;
+                               vcpu->arch.ret = -EINTR;
+                               break;
+                       }
+                       spin_lock(&vc->lock);
+                       kvmppc_vcore_blocked(vc);
+                       spin_unlock(&vc->lock);
+               }
+       }
+       vcpu->arch.ceded = 0;
+
+       vc->vcore_state = VCORE_INACTIVE;
+       trace_kvmppc_run_core(vc, 1);
+
+ done:
+       kvmppc_remove_runnable(vc, vcpu);
+       trace_kvmppc_run_vcpu_exit(vcpu, kvm_run);
+
+       return vcpu->arch.ret;
+
+ sigpend:
+       vcpu->stat.signal_exits++;
+       kvm_run->exit_reason = KVM_EXIT_INTR;
+       vcpu->arch.ret = -EINTR;
+ out:
+       local_irq_enable();
+       preempt_enable();
+       goto done;
+}
+
 static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
        int r;
@@ -3480,7 +4174,11 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
        vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
 
        do {
-               r = kvmppc_run_vcpu(run, vcpu);
+               if (kvm->arch.threads_indep && kvm_is_radix(kvm))
+                       r = kvmhv_run_single_vcpu(run, vcpu, ~(u64)0,
+                                                 vcpu->arch.vcore->lpcr);
+               else
+                       r = kvmppc_run_vcpu(run, vcpu);
 
                if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
                    !(vcpu->arch.shregs.msr & MSR_PR)) {
@@ -3559,6 +4257,10 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
        kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
        kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);
 
+       /* If running as a nested hypervisor, we don't support HPT guests */
+       if (kvmhv_on_pseries())
+               info->flags |= KVM_PPC_NO_HASH;
+
        return 0;
 }
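
For userspace, a minimal sketch of probing this flag through the
KVM_PPC_GET_SMMU_INFO ioctl (assuming vm_fd is an open VM file
descriptor):

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* 1 if hash (HPT) guests are supported, 0 if radix-only, -1 on error */
    static int hpt_guests_supported(int vm_fd)
    {
            struct kvm_ppc_smmu_info info;

            if (ioctl(vm_fd, KVM_PPC_GET_SMMU_INFO, &info) < 0)
                    return -1;
            return !(info.flags & KVM_PPC_NO_HASH);
    }
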
 
@@ -3723,8 +4425,7 @@ void kvmppc_setup_partition_table(struct kvm *kvm)
                        __pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
                dw1 = PATB_GR | kvm->arch.process_table;
        }
-
-       mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
+       kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1);
 }
 
 /*
@@ -3820,6 +4521,8 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 /* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
 int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
 {
+       if (nesting_enabled(kvm))
+               kvmhv_release_all_nested(kvm);
        kvmppc_free_radix(kvm);
        kvmppc_update_lpcr(kvm, LPCR_VPM1,
                           LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
@@ -3841,6 +4544,7 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
        kvmppc_free_hpt(&kvm->arch.hpt);
        kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
                           LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
+       kvmppc_rmap_reset(kvm);
        kvm->arch.radix = 1;
        return 0;
 }
@@ -3940,6 +4644,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 
        kvmppc_alloc_host_rm_ops();
 
+       kvmhv_vm_nested_init(kvm);
+
        /*
         * Since we don't flush the TLB when tearing down a VM,
         * and this lpid might have previously been used,
@@ -3958,9 +4664,13 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
                kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
 
        /* Init LPCR for virtual RMA mode */
-       kvm->arch.host_lpid = mfspr(SPRN_LPID);
-       kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
-       lpcr &= LPCR_PECE | LPCR_LPES;
+       if (cpu_has_feature(CPU_FTR_HVMODE)) {
+               kvm->arch.host_lpid = mfspr(SPRN_LPID);
+               kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
+               lpcr &= LPCR_PECE | LPCR_LPES;
+       } else {
+               lpcr = 0;
+       }
        lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
                LPCR_VPM0 | LPCR_VPM1;
        kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
@@ -4027,8 +4737,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
         * On POWER9, we only need to do this if the "indep_threads_mode"
         * module parameter has been set to N.
         */
-       if (cpu_has_feature(CPU_FTR_ARCH_300))
-               kvm->arch.threads_indep = indep_threads_mode;
+       if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+               if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) {
+                       pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n");
+                       kvm->arch.threads_indep = true;
+               } else {
+                       kvm->arch.threads_indep = indep_threads_mode;
+               }
+       }
        if (!kvm->arch.threads_indep)
                kvm_hv_vm_activated();
 
@@ -4051,6 +4767,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
        snprintf(buf, sizeof(buf), "vm%d", current->pid);
        kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir);
        kvmppc_mmu_debugfs_init(kvm);
+       if (radix_enabled())
+               kvmhv_radix_debugfs_init(kvm);
 
        return 0;
 }
@@ -4073,13 +4791,21 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 
        kvmppc_free_vcores(kvm);
 
-       kvmppc_free_lpid(kvm->arch.lpid);
 
        if (kvm_is_radix(kvm))
                kvmppc_free_radix(kvm);
        else
                kvmppc_free_hpt(&kvm->arch.hpt);
 
+       /* Perform global invalidation and return lpid to the pool */
+       if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+               if (nesting_enabled(kvm))
+                       kvmhv_release_all_nested(kvm);
+               kvm->arch.process_table = 0;
+               kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
+       }
+       kvmppc_free_lpid(kvm->arch.lpid);
+
        kvmppc_free_pimap(kvm);
 }
 
@@ -4104,11 +4830,15 @@ static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn,
 
 static int kvmppc_core_check_processor_compat_hv(void)
 {
-       if (!cpu_has_feature(CPU_FTR_HVMODE) ||
-           !cpu_has_feature(CPU_FTR_ARCH_206))
-               return -EIO;
+       if (cpu_has_feature(CPU_FTR_HVMODE) &&
+           cpu_has_feature(CPU_FTR_ARCH_206))
+               return 0;
 
-       return 0;
+       /* POWER9 in radix mode is capable of being a nested hypervisor. */
+       if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
+               return 0;
+
+       return -EIO;
 }
 
 #ifdef CONFIG_KVM_XICS
@@ -4426,6 +5156,10 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
        if (radix && !radix_enabled())
                return -EINVAL;
 
+       /* If we're a nested hypervisor, we currently only support radix */
+       if (kvmhv_on_pseries() && !radix)
+               return -EINVAL;
+
        mutex_lock(&kvm->lock);
        if (radix != kvm_is_radix(kvm)) {
                if (kvm->arch.mmu_ready) {
@@ -4458,6 +5192,19 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
        return err;
 }
 
+static int kvmhv_enable_nested(struct kvm *kvm)
+{
+       if (!nested)
+               return -EPERM;
+       if (!cpu_has_feature(CPU_FTR_ARCH_300))
+               return -ENODEV;
+
+       /* kvm == NULL means the caller is testing if the capability exists */
+       if (kvm)
+               kvm->arch.nested_enable = true;
+       return 0;
+}
+
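
On the userspace side, a sketch of how a VMM might opt a VM into
nesting, assuming the capability is exposed as KVM_CAP_PPC_NESTED_HV
and enabled per-VM through KVM_ENABLE_CAP:

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Ask KVM to let this VM act as a nested (L1) hypervisor; 0 on success */
    static int enable_nested_hv(int vm_fd)
    {
            struct kvm_enable_cap cap;

            memset(&cap, 0, sizeof(cap));
            cap.cap = KVM_CAP_PPC_NESTED_HV;
            return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
    }
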
 static struct kvmppc_ops kvm_ops_hv = {
        .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
        .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
@@ -4497,6 +5244,7 @@ static struct kvmppc_ops kvm_ops_hv = {
        .configure_mmu = kvmhv_configure_mmu,
        .get_rmmu_info = kvmhv_get_rmmu_info,
        .set_smt_mode = kvmhv_set_smt_mode,
+       .enable_nested = kvmhv_enable_nested,
 };
 
 static int kvm_init_subcore_bitmap(void)
@@ -4547,6 +5295,10 @@ static int kvmppc_book3s_init_hv(void)
        if (r < 0)
                return -ENODEV;
 
+       r = kvmhv_nested_init();
+       if (r)
+               return r;
+
        r = kvm_init_subcore_bitmap();
        if (r)
                return r;
@@ -4557,7 +5309,8 @@ static int kvmppc_book3s_init_hv(void)
         * indirectly, via OPAL.
         */
 #ifdef CONFIG_SMP
-       if (!xive_enabled() && !local_paca->kvm_hstate.xics_phys) {
+       if (!xive_enabled() && !kvmhv_on_pseries() &&
+           !local_paca->kvm_hstate.xics_phys) {
                struct device_node *np;
 
                np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
@@ -4605,6 +5358,7 @@ static void kvmppc_book3s_exit_hv(void)
        if (kvmppc_radix_possible())
                kvmppc_radix_exit();
        kvmppc_hv_ops = NULL;
+       kvmhv_nested_exit();
 }
 
 module_init(kvmppc_book3s_init_hv);
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index fc6bb96..a71e2fc 100644 (file)
@@ -231,6 +231,15 @@ void kvmhv_rm_send_ipi(int cpu)
        void __iomem *xics_phys;
        unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
 
+       /* For a nested hypervisor, use the XICS via hcall */
+       if (kvmhv_on_pseries()) {
+               unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+               plpar_hcall_raw(H_IPI, retbuf, get_hard_smp_processor_id(cpu),
+                               IPI_PRIORITY);
+               return;
+       }
+
        /* On POWER9 we can use msgsnd for any destination cpu. */
        if (cpu_has_feature(CPU_FTR_ARCH_300)) {
                msg |= get_hard_smp_processor_id(cpu);
@@ -460,12 +469,19 @@ static long kvmppc_read_one_intr(bool *again)
                return 1;
 
        /* Now read the interrupt from the ICP */
-       xics_phys = local_paca->kvm_hstate.xics_phys;
-       rc = 0;
-       if (!xics_phys)
-               rc = opal_int_get_xirr(&xirr, false);
-       else
-               xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
+       if (kvmhv_on_pseries()) {
+               unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+               rc = plpar_hcall_raw(H_XIRR, retbuf, 0xFF);
+               xirr = cpu_to_be32(retbuf[0]);
+       } else {
+               xics_phys = local_paca->kvm_hstate.xics_phys;
+               rc = 0;
+               if (!xics_phys)
+                       rc = opal_int_get_xirr(&xirr, false);
+               else
+                       xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
+       }
        if (rc < 0)
                return 1;
 
@@ -494,7 +510,13 @@ static long kvmppc_read_one_intr(bool *again)
         */
        if (xisr == XICS_IPI) {
                rc = 0;
-               if (xics_phys) {
+               if (kvmhv_on_pseries()) {
+                       unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+                       plpar_hcall_raw(H_IPI, retbuf,
+                                       hard_smp_processor_id(), 0xff);
+                       plpar_hcall_raw(H_EOI, retbuf, h_xirr);
+               } else if (xics_phys) {
                        __raw_rm_writeb(0xff, xics_phys + XICS_MFRR);
                        __raw_rm_writel(xirr, xics_phys + XICS_XIRR);
                } else {
@@ -520,7 +542,13 @@ static long kvmppc_read_one_intr(bool *again)
                        /* We raced with the host,
                         * we need to resend that IPI, bummer
                         */
-                       if (xics_phys)
+                       if (kvmhv_on_pseries()) {
+                               unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+                               plpar_hcall_raw(H_IPI, retbuf,
+                                               hard_smp_processor_id(),
+                                               IPI_PRIORITY);
+                       } else if (xics_phys)
                                __raw_rm_writeb(IPI_PRIORITY,
                                                xics_phys + XICS_MFRR);
                        else
@@ -729,3 +757,51 @@ void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip)
        smp_mb();
        local_paca->kvm_hstate.kvm_split_mode = NULL;
 }
+
+/*
+ * Is there a PRIV_DOORBELL pending for the guest (on POWER9)?
+ * Can we inject a Decrementer or an External interrupt?
+ */
+void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu)
+{
+       int ext;
+       unsigned long vec = 0;
+       unsigned long lpcr;
+
+       /* Insert EXTERNAL bit into LPCR at the MER bit position */
+       ext = (vcpu->arch.pending_exceptions >> BOOK3S_IRQPRIO_EXTERNAL) & 1;
+       lpcr = mfspr(SPRN_LPCR);
+       lpcr |= ext << LPCR_MER_SH;
+       mtspr(SPRN_LPCR, lpcr);
+       isync();
+
+       if (vcpu->arch.shregs.msr & MSR_EE) {
+               if (ext) {
+                       vec = BOOK3S_INTERRUPT_EXTERNAL;
+               } else {
+                       long int dec = mfspr(SPRN_DEC);
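+                       /* without LPCR_LD the decrementer is 32 bits: sign-extend */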
+                       if (!(lpcr & LPCR_LD))
+                               dec = (int) dec;
+                       if (dec < 0)
+                               vec = BOOK3S_INTERRUPT_DECREMENTER;
+               }
+       }
+       if (vec) {
+               unsigned long msr, old_msr = vcpu->arch.shregs.msr;
+
+               kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu));
+               kvmppc_set_srr1(vcpu, old_msr);
+               kvmppc_set_pc(vcpu, vec);
+               msr = vcpu->arch.intr_msr;
+               if (MSR_TM_ACTIVE(old_msr))
+                       msr |= MSR_TS_S;
+               vcpu->arch.shregs.msr = msr;
+       }
+
+       if (vcpu->arch.doorbell_request) {
+               mtspr(SPRN_DPDES, 1);
+               vcpu->arch.vcore->dpdes = 1;
+               smp_wmb();
+               vcpu->arch.doorbell_request = 0;
+       }
+}
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
index 666b91c..a6d1001 100644 (file)
@@ -64,52 +64,7 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 
        /* Save host PMU registers */
-BEGIN_FTR_SECTION
-       /* Work around P8 PMAE bug */
-       li      r3, -1
-       clrrdi  r3, r3, 10
-       mfspr   r8, SPRN_MMCR2
-       mtspr   SPRN_MMCR2, r3          /* freeze all counters using MMCR2 */
-       isync
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-       li      r3, 1
-       sldi    r3, r3, 31              /* MMCR0_FC (freeze counters) bit */
-       mfspr   r7, SPRN_MMCR0          /* save MMCR0 */
-       mtspr   SPRN_MMCR0, r3          /* freeze all counters, disable interrupts */
-       mfspr   r6, SPRN_MMCRA
-       /* Clear MMCRA in order to disable SDAR updates */
-       li      r5, 0
-       mtspr   SPRN_MMCRA, r5
-       isync
-       lbz     r5, PACA_PMCINUSE(r13)  /* is the host using the PMU? */
-       cmpwi   r5, 0
-       beq     31f                     /* skip if not */
-       mfspr   r5, SPRN_MMCR1
-       mfspr   r9, SPRN_SIAR
-       mfspr   r10, SPRN_SDAR
-       std     r7, HSTATE_MMCR0(r13)
-       std     r5, HSTATE_MMCR1(r13)
-       std     r6, HSTATE_MMCRA(r13)
-       std     r9, HSTATE_SIAR(r13)
-       std     r10, HSTATE_SDAR(r13)
-BEGIN_FTR_SECTION
-       mfspr   r9, SPRN_SIER
-       std     r8, HSTATE_MMCR2(r13)
-       std     r9, HSTATE_SIER(r13)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-       mfspr   r3, SPRN_PMC1
-       mfspr   r5, SPRN_PMC2
-       mfspr   r6, SPRN_PMC3
-       mfspr   r7, SPRN_PMC4
-       mfspr   r8, SPRN_PMC5
-       mfspr   r9, SPRN_PMC6
-       stw     r3, HSTATE_PMC1(r13)
-       stw     r5, HSTATE_PMC2(r13)
-       stw     r6, HSTATE_PMC3(r13)
-       stw     r7, HSTATE_PMC4(r13)
-       stw     r8, HSTATE_PMC5(r13)
-       stw     r9, HSTATE_PMC6(r13)
-31:
+       bl      kvmhv_save_host_pmu
 
        /*
         * Put whatever is in the decrementer into the
@@ -161,3 +116,51 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
        ld      r0, PPC_LR_STKOFF(r1)
        mtlr    r0
        blr
+
+_GLOBAL(kvmhv_save_host_pmu)
+BEGIN_FTR_SECTION
+       /* Work around P8 PMAE bug */
+       li      r3, -1
+       clrrdi  r3, r3, 10
+       mfspr   r8, SPRN_MMCR2
+       mtspr   SPRN_MMCR2, r3          /* freeze all counters using MMCR2 */
+       isync
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+       li      r3, 1
+       sldi    r3, r3, 31              /* MMCR0_FC (freeze counters) bit */
+       mfspr   r7, SPRN_MMCR0          /* save MMCR0 */
+       mtspr   SPRN_MMCR0, r3          /* freeze all counters, disable interrupts */
+       mfspr   r6, SPRN_MMCRA
+       /* Clear MMCRA in order to disable SDAR updates */
+       li      r5, 0
+       mtspr   SPRN_MMCRA, r5
+       isync
+       lbz     r5, PACA_PMCINUSE(r13)  /* is the host using the PMU? */
+       cmpwi   r5, 0
+       beq     31f                     /* skip if not */
+       mfspr   r5, SPRN_MMCR1
+       mfspr   r9, SPRN_SIAR
+       mfspr   r10, SPRN_SDAR
+       std     r7, HSTATE_MMCR0(r13)
+       std     r5, HSTATE_MMCR1(r13)
+       std     r6, HSTATE_MMCRA(r13)
+       std     r9, HSTATE_SIAR(r13)
+       std     r10, HSTATE_SDAR(r13)
+BEGIN_FTR_SECTION
+       mfspr   r9, SPRN_SIER
+       std     r8, HSTATE_MMCR2(r13)
+       std     r9, HSTATE_SIER(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+       mfspr   r3, SPRN_PMC1
+       mfspr   r5, SPRN_PMC2
+       mfspr   r6, SPRN_PMC3
+       mfspr   r7, SPRN_PMC4
+       mfspr   r8, SPRN_PMC5
+       mfspr   r9, SPRN_PMC6
+       stw     r3, HSTATE_PMC1(r13)
+       stw     r5, HSTATE_PMC2(r13)
+       stw     r6, HSTATE_PMC3(r13)
+       stw     r7, HSTATE_PMC4(r13)
+       stw     r8, HSTATE_PMC5(r13)
+       stw     r9, HSTATE_PMC6(r13)
+31:    blr
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
new file mode 100644 (file)
index 0000000..401d2ec
--- /dev/null
@@ -0,0 +1,1291 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corporation, 2018
+ * Authors Suraj Jitindar Singh <sjitindarsingh@gmail.com>
+ *        Paul Mackerras <paulus@ozlabs.org>
+ *
+ * Description: KVM functions specific to running nested KVM-HV guests
+ * on Book3S processors (specifically POWER9 and later).
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/llist.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/pte-walk.h>
+#include <asm/reg.h>
+
+static struct patb_entry *pseries_partition_tb;
+
+static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp);
+static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free);
+
+void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
+{
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+       hr->pcr = vc->pcr;
+       hr->dpdes = vc->dpdes;
+       hr->hfscr = vcpu->arch.hfscr;
+       hr->tb_offset = vc->tb_offset;
+       hr->dawr0 = vcpu->arch.dawr;
+       hr->dawrx0 = vcpu->arch.dawrx;
+       hr->ciabr = vcpu->arch.ciabr;
+       hr->purr = vcpu->arch.purr;
+       hr->spurr = vcpu->arch.spurr;
+       hr->ic = vcpu->arch.ic;
+       hr->vtb = vc->vtb;
+       hr->srr0 = vcpu->arch.shregs.srr0;
+       hr->srr1 = vcpu->arch.shregs.srr1;
+       hr->sprg[0] = vcpu->arch.shregs.sprg0;
+       hr->sprg[1] = vcpu->arch.shregs.sprg1;
+       hr->sprg[2] = vcpu->arch.shregs.sprg2;
+       hr->sprg[3] = vcpu->arch.shregs.sprg3;
+       hr->pidr = vcpu->arch.pid;
+       hr->cfar = vcpu->arch.cfar;
+       hr->ppr = vcpu->arch.ppr;
+}
+
+static void byteswap_pt_regs(struct pt_regs *regs)
+{
+       unsigned long *addr = (unsigned long *) regs;
+
+       for (; addr < ((unsigned long *) (regs + 1)); addr++)
+               *addr = swab64(*addr);
+}
+
+static void byteswap_hv_regs(struct hv_guest_state *hr)
+{
+       hr->version = swab64(hr->version);
+       hr->lpid = swab32(hr->lpid);
+       hr->vcpu_token = swab32(hr->vcpu_token);
+       hr->lpcr = swab64(hr->lpcr);
+       hr->pcr = swab64(hr->pcr);
+       hr->amor = swab64(hr->amor);
+       hr->dpdes = swab64(hr->dpdes);
+       hr->hfscr = swab64(hr->hfscr);
+       hr->tb_offset = swab64(hr->tb_offset);
+       hr->dawr0 = swab64(hr->dawr0);
+       hr->dawrx0 = swab64(hr->dawrx0);
+       hr->ciabr = swab64(hr->ciabr);
+       hr->hdec_expiry = swab64(hr->hdec_expiry);
+       hr->purr = swab64(hr->purr);
+       hr->spurr = swab64(hr->spurr);
+       hr->ic = swab64(hr->ic);
+       hr->vtb = swab64(hr->vtb);
+       hr->hdar = swab64(hr->hdar);
+       hr->hdsisr = swab64(hr->hdsisr);
+       hr->heir = swab64(hr->heir);
+       hr->asdr = swab64(hr->asdr);
+       hr->srr0 = swab64(hr->srr0);
+       hr->srr1 = swab64(hr->srr1);
+       hr->sprg[0] = swab64(hr->sprg[0]);
+       hr->sprg[1] = swab64(hr->sprg[1]);
+       hr->sprg[2] = swab64(hr->sprg[2]);
+       hr->sprg[3] = swab64(hr->sprg[3]);
+       hr->pidr = swab64(hr->pidr);
+       hr->cfar = swab64(hr->cfar);
+       hr->ppr = swab64(hr->ppr);
+}
+
+static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap,
+                                struct hv_guest_state *hr)
+{
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+       hr->dpdes = vc->dpdes;
+       hr->hfscr = vcpu->arch.hfscr;
+       hr->purr = vcpu->arch.purr;
+       hr->spurr = vcpu->arch.spurr;
+       hr->ic = vcpu->arch.ic;
+       hr->vtb = vc->vtb;
+       hr->srr0 = vcpu->arch.shregs.srr0;
+       hr->srr1 = vcpu->arch.shregs.srr1;
+       hr->sprg[0] = vcpu->arch.shregs.sprg0;
+       hr->sprg[1] = vcpu->arch.shregs.sprg1;
+       hr->sprg[2] = vcpu->arch.shregs.sprg2;
+       hr->sprg[3] = vcpu->arch.shregs.sprg3;
+       hr->pidr = vcpu->arch.pid;
+       hr->cfar = vcpu->arch.cfar;
+       hr->ppr = vcpu->arch.ppr;
+       switch (trap) {
+       case BOOK3S_INTERRUPT_H_DATA_STORAGE:
+               hr->hdar = vcpu->arch.fault_dar;
+               hr->hdsisr = vcpu->arch.fault_dsisr;
+               hr->asdr = vcpu->arch.fault_gpa;
+               break;
+       case BOOK3S_INTERRUPT_H_INST_STORAGE:
+               hr->asdr = vcpu->arch.fault_gpa;
+               break;
+       case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
+               hr->heir = vcpu->arch.emul_inst;
+               break;
+       }
+}
+
+static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
+{
+       /*
+        * Don't let L1 enable features for L2 which we've disabled for L1,
+        * but preserve the interrupt cause field.
+        */
+       hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr);
+
+       /* Don't let data address watchpoint match in hypervisor state */
+       hr->dawrx0 &= ~DAWRX_HYP;
+
+       /* Don't let completed instruction address breakpt match in HV state */
+       if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
+               hr->ciabr &= ~CIABR_PRIV;
+}
+
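
To illustrate the HFSCR masking above, a small sketch with hypothetical
values: a facility L1 itself lacks (here TM) is stripped from L2's
request, while the interrupt-cause field in the top byte passes through
untouched:

    unsigned long l1_hfscr = HFSCR_FP | HFSCR_VECVSX;   /* L1 has no HFSCR_TM */
    unsigned long l2_req   = HFSCR_TM | HFSCR_FP;       /* from l2_hv.hfscr */

    l2_req &= (HFSCR_INTR_CAUSE | l1_hfscr);            /* result: HFSCR_FP */
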
+static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
+{
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+       vc->pcr = hr->pcr;
+       vc->dpdes = hr->dpdes;
+       vcpu->arch.hfscr = hr->hfscr;
+       vcpu->arch.dawr = hr->dawr0;
+       vcpu->arch.dawrx = hr->dawrx0;
+       vcpu->arch.ciabr = hr->ciabr;
+       vcpu->arch.purr = hr->purr;
+       vcpu->arch.spurr = hr->spurr;
+       vcpu->arch.ic = hr->ic;
+       vc->vtb = hr->vtb;
+       vcpu->arch.shregs.srr0 = hr->srr0;
+       vcpu->arch.shregs.srr1 = hr->srr1;
+       vcpu->arch.shregs.sprg0 = hr->sprg[0];
+       vcpu->arch.shregs.sprg1 = hr->sprg[1];
+       vcpu->arch.shregs.sprg2 = hr->sprg[2];
+       vcpu->arch.shregs.sprg3 = hr->sprg[3];
+       vcpu->arch.pid = hr->pidr;
+       vcpu->arch.cfar = hr->cfar;
+       vcpu->arch.ppr = hr->ppr;
+}
+
+void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
+                                  struct hv_guest_state *hr)
+{
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+       vc->dpdes = hr->dpdes;
+       vcpu->arch.hfscr = hr->hfscr;
+       vcpu->arch.purr = hr->purr;
+       vcpu->arch.spurr = hr->spurr;
+       vcpu->arch.ic = hr->ic;
+       vc->vtb = hr->vtb;
+       vcpu->arch.fault_dar = hr->hdar;
+       vcpu->arch.fault_dsisr = hr->hdsisr;
+       vcpu->arch.fault_gpa = hr->asdr;
+       vcpu->arch.emul_inst = hr->heir;
+       vcpu->arch.shregs.srr0 = hr->srr0;
+       vcpu->arch.shregs.srr1 = hr->srr1;
+       vcpu->arch.shregs.sprg0 = hr->sprg[0];
+       vcpu->arch.shregs.sprg1 = hr->sprg[1];
+       vcpu->arch.shregs.sprg2 = hr->sprg[2];
+       vcpu->arch.shregs.sprg3 = hr->sprg[3];
+       vcpu->arch.pid = hr->pidr;
+       vcpu->arch.cfar = hr->cfar;
+       vcpu->arch.ppr = hr->ppr;
+}
+
+long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
+{
+       long int err, r;
+       struct kvm_nested_guest *l2;
+       struct pt_regs l2_regs, saved_l1_regs;
+       struct hv_guest_state l2_hv, saved_l1_hv;
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+       u64 hv_ptr, regs_ptr;
+       u64 hdec_exp;
+       s64 delta_purr, delta_spurr, delta_ic, delta_vtb;
+       u64 mask;
+       unsigned long lpcr;
+
+       if (vcpu->kvm->arch.l1_ptcr == 0)
+               return H_NOT_AVAILABLE;
+
+       /* copy parameters in */
+       hv_ptr = kvmppc_get_gpr(vcpu, 4);
+       err = kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv,
+                                 sizeof(struct hv_guest_state));
+       if (err)
+               return H_PARAMETER;
+       if (kvmppc_need_byteswap(vcpu))
+               byteswap_hv_regs(&l2_hv);
+       if (l2_hv.version != HV_GUEST_STATE_VERSION)
+               return H_P2;
+
+       regs_ptr = kvmppc_get_gpr(vcpu, 5);
+       err = kvm_vcpu_read_guest(vcpu, regs_ptr, &l2_regs,
+                                 sizeof(struct pt_regs));
+       if (err)
+               return H_PARAMETER;
+       if (kvmppc_need_byteswap(vcpu))
+               byteswap_pt_regs(&l2_regs);
+       if (l2_hv.vcpu_token >= NR_CPUS)
+               return H_PARAMETER;
+
+       /* translate lpid */
+       l2 = kvmhv_get_nested(vcpu->kvm, l2_hv.lpid, true);
+       if (!l2)
+               return H_PARAMETER;
+       if (!l2->l1_gr_to_hr) {
+               mutex_lock(&l2->tlb_lock);
+               kvmhv_update_ptbl_cache(l2);
+               mutex_unlock(&l2->tlb_lock);
+       }
+
+       /* save l1 values of things */
+       vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
+       saved_l1_regs = vcpu->arch.regs;
+       kvmhv_save_hv_regs(vcpu, &saved_l1_hv);
+
+       /* convert TB values/offsets to host (L0) values */
+       hdec_exp = l2_hv.hdec_expiry - vc->tb_offset;
+       vc->tb_offset += l2_hv.tb_offset;
+
+       /* set L1 state to L2 state */
+       vcpu->arch.nested = l2;
+       vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token;
+       vcpu->arch.regs = l2_regs;
+       vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
+       mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD |
+               LPCR_LPES | LPCR_MER;
+       lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask);
+       sanitise_hv_regs(vcpu, &l2_hv);
+       restore_hv_regs(vcpu, &l2_hv);
+
+       vcpu->arch.ret = RESUME_GUEST;
+       vcpu->arch.trap = 0;
+       do {
+               if (mftb() >= hdec_exp) {
+                       vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
+                       r = RESUME_HOST;
+                       break;
+               }
+               r = kvmhv_run_single_vcpu(vcpu->arch.kvm_run, vcpu, hdec_exp,
+                                         lpcr);
+       } while (is_kvmppc_resume_guest(r));
+
+       /* save L2 state for return */
+       l2_regs = vcpu->arch.regs;
+       l2_regs.msr = vcpu->arch.shregs.msr;
+       delta_purr = vcpu->arch.purr - l2_hv.purr;
+       delta_spurr = vcpu->arch.spurr - l2_hv.spurr;
+       delta_ic = vcpu->arch.ic - l2_hv.ic;
+       delta_vtb = vc->vtb - l2_hv.vtb;
+       save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv);
+
+       /* restore L1 state */
+       vcpu->arch.nested = NULL;
+       vcpu->arch.regs = saved_l1_regs;
+       vcpu->arch.shregs.msr = saved_l1_regs.msr & ~MSR_TS_MASK;
+       /* set L1 MSR TS field according to L2 transaction state */
+       if (l2_regs.msr & MSR_TS_MASK)
+               vcpu->arch.shregs.msr |= MSR_TS_S;
+       vc->tb_offset = saved_l1_hv.tb_offset;
+       restore_hv_regs(vcpu, &saved_l1_hv);
+       vcpu->arch.purr += delta_purr;
+       vcpu->arch.spurr += delta_spurr;
+       vcpu->arch.ic += delta_ic;
+       vc->vtb += delta_vtb;
+
+       kvmhv_put_nested(l2);
+
+       /* copy l2_hv_state and regs back to guest */
+       if (kvmppc_need_byteswap(vcpu)) {
+               byteswap_hv_regs(&l2_hv);
+               byteswap_pt_regs(&l2_regs);
+       }
+       err = kvm_vcpu_write_guest(vcpu, hv_ptr, &l2_hv,
+                                  sizeof(struct hv_guest_state));
+       if (err)
+               return H_AUTHORITY;
+       err = kvm_vcpu_write_guest(vcpu, regs_ptr, &l2_regs,
+                                  sizeof(struct pt_regs));
+       if (err)
+               return H_AUTHORITY;
+
+       if (r == -EINTR)
+               return H_INTERRUPT;
+
+       return vcpu->arch.trap;
+}
+
+long kvmhv_nested_init(void)
+{
+       long int ptb_order;
+       unsigned long ptcr;
+       long rc;
+
+       if (!kvmhv_on_pseries())
+               return 0;
+       if (!radix_enabled())
+               return -ENODEV;
+
+       /* find log base 2 of KVMPPC_NR_LPIDS, rounding up */
+       ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1;
+       if (ptb_order < 8)
+               ptb_order = 8;
+       pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order,
+                                      GFP_KERNEL);
+       if (!pseries_partition_tb) {
+               pr_err("kvm-hv: failed to allocated nested partition table\n");
+               return -ENOMEM;
+       }
+
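+       /* PRTS field = log2(table size in bytes) - 12; entries are 16 bytes, so this is ptb_order - 8 */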
+       ptcr = __pa(pseries_partition_tb) | (ptb_order - 8);
+       rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr);
+       if (rc != H_SUCCESS) {
+               pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n",
+                      rc);
+               kfree(pseries_partition_tb);
+               pseries_partition_tb = NULL;
+               return -ENODEV;
+       }
+
+       return 0;
+}
+
+void kvmhv_nested_exit(void)
+{
+       /*
+        * N.B. the kvmhv_on_pseries() test is there because it enables
+        * the compiler to remove the call to plpar_hcall_norets()
+        * when CONFIG_PPC_PSERIES=n.
+        */
+       if (kvmhv_on_pseries() && pseries_partition_tb) {
+               plpar_hcall_norets(H_SET_PARTITION_TABLE, 0);
+               kfree(pseries_partition_tb);
+               pseries_partition_tb = NULL;
+       }
+}
+
+static void kvmhv_flush_lpid(unsigned int lpid)
+{
+       long rc;
+
+       if (!kvmhv_on_pseries()) {
+               radix__flush_tlb_lpid(lpid);
+               return;
+       }
+
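+       /* RIC=2 (flush TLB and PWC), PRS=0 (partition scoped), R=1 (radix) */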
+       rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(2, 0, 1),
+                               lpid, TLBIEL_INVAL_SET_LPID);
+       if (rc)
+               pr_err("KVM: TLB LPID invalidation hcall failed, rc=%ld\n", rc);
+}
+
+void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1)
+{
+       if (!kvmhv_on_pseries()) {
+               mmu_partition_table_set_entry(lpid, dw0, dw1);
+               return;
+       }
+
+       pseries_partition_tb[lpid].patb0 = cpu_to_be64(dw0);
+       pseries_partition_tb[lpid].patb1 = cpu_to_be64(dw1);
+       /* L0 will do the necessary barriers */
+       kvmhv_flush_lpid(lpid);
+}
+
+static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp)
+{
+       unsigned long dw0;
+
+       dw0 = PATB_HR | radix__get_tree_size() |
+               __pa(gp->shadow_pgtable) | RADIX_PGD_INDEX_SIZE;
+       kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table);
+}
+
+void kvmhv_vm_nested_init(struct kvm *kvm)
+{
+       kvm->arch.max_nested_lpid = -1;
+}
+
+/*
+ * Handle the H_SET_PARTITION_TABLE hcall.
+ * r4 = guest real address of partition table + log_2(size) - 12
+ * (formatted as for the PTCR).
+ */
+long kvmhv_set_partition_table(struct kvm_vcpu *vcpu)
+{
+       struct kvm *kvm = vcpu->kvm;
+       unsigned long ptcr = kvmppc_get_gpr(vcpu, 4);
+       int srcu_idx;
+       long ret = H_SUCCESS;
+
+       srcu_idx = srcu_read_lock(&kvm->srcu);
+       /*
+        * Limit the partition table to 4096 entries (because that's what
+        * hardware supports), and check the base address.
+        */
+       if ((ptcr & PRTS_MASK) > 12 - 8 ||
+           !kvm_is_visible_gfn(vcpu->kvm, (ptcr & PRTB_MASK) >> PAGE_SHIFT))
+               ret = H_PARAMETER;
+       srcu_read_unlock(&kvm->srcu, srcu_idx);
+       if (ret == H_SUCCESS)
+               kvm->arch.l1_ptcr = ptcr;
+       return ret;
+}
+
+/*
+ * Reload the partition table entry for a guest.
+ * Caller must hold gp->tlb_lock.
+ */
+static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp)
+{
+       int ret;
+       struct patb_entry ptbl_entry;
+       unsigned long ptbl_addr;
+       struct kvm *kvm = gp->l1_host;
+
+       ret = -EFAULT;
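+       /* partition table entries are 16 bytes, hence l1_lpid << 4 */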
+       ptbl_addr = (kvm->arch.l1_ptcr & PRTB_MASK) + (gp->l1_lpid << 4);
+       if (gp->l1_lpid < (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 8)))
+               ret = kvm_read_guest(kvm, ptbl_addr,
+                                    &ptbl_entry, sizeof(ptbl_entry));
+       if (ret) {
+               gp->l1_gr_to_hr = 0;
+               gp->process_table = 0;
+       } else {
+               gp->l1_gr_to_hr = be64_to_cpu(ptbl_entry.patb0);
+               gp->process_table = be64_to_cpu(ptbl_entry.patb1);
+       }
+       kvmhv_set_nested_ptbl(gp);
+}
+
+struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
+{
+       struct kvm_nested_guest *gp;
+       long shadow_lpid;
+
+       gp = kzalloc(sizeof(*gp), GFP_KERNEL);
+       if (!gp)
+               return NULL;
+       gp->l1_host = kvm;
+       gp->l1_lpid = lpid;
+       mutex_init(&gp->tlb_lock);
+       gp->shadow_pgtable = pgd_alloc(kvm->mm);
+       if (!gp->shadow_pgtable)
+               goto out_free;
+       shadow_lpid = kvmppc_alloc_lpid();
+       if (shadow_lpid < 0)
+               goto out_free2;
+       gp->shadow_lpid = shadow_lpid;
+
+       memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu));
+
+       return gp;
+
+ out_free2:
+       pgd_free(kvm->mm, gp->shadow_pgtable);
+ out_free:
+       kfree(gp);
+       return NULL;
+}
+
+/*
+ * Free up any resources allocated for a nested guest.
+ */
+static void kvmhv_release_nested(struct kvm_nested_guest *gp)
+{
+       struct kvm *kvm = gp->l1_host;
+
+       if (gp->shadow_pgtable) {
+               /*
+                * No vcpu is using this struct and no call to
+                * kvmhv_get_nested can find this struct,
+                * so we don't need to hold kvm->mmu_lock.
+                */
+               kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
+                                         gp->shadow_lpid);
+               pgd_free(kvm->mm, gp->shadow_pgtable);
+       }
+       kvmhv_set_ptbl_entry(gp->shadow_lpid, 0, 0);
+       kvmppc_free_lpid(gp->shadow_lpid);
+       kfree(gp);
+}
+
+static void kvmhv_remove_nested(struct kvm_nested_guest *gp)
+{
+       struct kvm *kvm = gp->l1_host;
+       int lpid = gp->l1_lpid;
+       long ref;
+
+       spin_lock(&kvm->mmu_lock);
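+       /* the nested_guests[] array itself holds one reference; drop it here */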
+       if (gp == kvm->arch.nested_guests[lpid]) {
+               kvm->arch.nested_guests[lpid] = NULL;
+               if (lpid == kvm->arch.max_nested_lpid) {
+                       while (--lpid >= 0 && !kvm->arch.nested_guests[lpid])
+                               ;
+                       kvm->arch.max_nested_lpid = lpid;
+               }
+               --gp->refcnt;
+       }
+       ref = gp->refcnt;
+       spin_unlock(&kvm->mmu_lock);
+       if (ref == 0)
+               kvmhv_release_nested(gp);
+}
+
+/*
+ * Free up all nested resources allocated for this guest.
+ * This is called with no vcpus of the guest running, when
+ * switching the guest to HPT mode or when destroying the
+ * guest.
+ */
+void kvmhv_release_all_nested(struct kvm *kvm)
+{
+       int i;
+       struct kvm_nested_guest *gp;
+       struct kvm_nested_guest *freelist = NULL;
+       struct kvm_memory_slot *memslot;
+       int srcu_idx;
+
+       spin_lock(&kvm->mmu_lock);
+       for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
+               gp = kvm->arch.nested_guests[i];
+               if (!gp)
+                       continue;
+               kvm->arch.nested_guests[i] = NULL;
+               if (--gp->refcnt == 0) {
+                       gp->next = freelist;
+                       freelist = gp;
+               }
+       }
+       kvm->arch.max_nested_lpid = -1;
+       spin_unlock(&kvm->mmu_lock);
+       while ((gp = freelist) != NULL) {
+               freelist = gp->next;
+               kvmhv_release_nested(gp);
+       }
+
+       srcu_idx = srcu_read_lock(&kvm->srcu);
+       kvm_for_each_memslot(memslot, kvm_memslots(kvm))
+               kvmhv_free_memslot_nest_rmap(memslot);
+       srcu_read_unlock(&kvm->srcu, srcu_idx);
+}
+
+/* caller must hold gp->tlb_lock */
+static void kvmhv_flush_nested(struct kvm_nested_guest *gp)
+{
+       struct kvm *kvm = gp->l1_host;
+
+       spin_lock(&kvm->mmu_lock);
+       kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable, gp->shadow_lpid);
+       spin_unlock(&kvm->mmu_lock);
+       kvmhv_flush_lpid(gp->shadow_lpid);
+       kvmhv_update_ptbl_cache(gp);
+       if (gp->l1_gr_to_hr == 0)
+               kvmhv_remove_nested(gp);
+}
+
+struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
+                                         bool create)
+{
+       struct kvm_nested_guest *gp, *newgp;
+
+       if (l1_lpid >= KVM_MAX_NESTED_GUESTS ||
+           l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4)))
+               return NULL;
+
+       spin_lock(&kvm->mmu_lock);
+       gp = kvm->arch.nested_guests[l1_lpid];
+       if (gp)
+               ++gp->refcnt;
+       spin_unlock(&kvm->mmu_lock);
+
+       if (gp || !create)
+               return gp;
+
+       newgp = kvmhv_alloc_nested(kvm, l1_lpid);
+       if (!newgp)
+               return NULL;
+       spin_lock(&kvm->mmu_lock);
+       if (kvm->arch.nested_guests[l1_lpid]) {
+               /* someone else beat us to it */
+               gp = kvm->arch.nested_guests[l1_lpid];
+       } else {
+               kvm->arch.nested_guests[l1_lpid] = newgp;
+               ++newgp->refcnt;
+               gp = newgp;
+               newgp = NULL;
+               if (l1_lpid > kvm->arch.max_nested_lpid)
+                       kvm->arch.max_nested_lpid = l1_lpid;
+       }
+       ++gp->refcnt;
+       spin_unlock(&kvm->mmu_lock);
+
+       if (newgp)
+               kvmhv_release_nested(newgp);
+
+       return gp;
+}
+
+void kvmhv_put_nested(struct kvm_nested_guest *gp)
+{
+       struct kvm *kvm = gp->l1_host;
+       long ref;
+
+       spin_lock(&kvm->mmu_lock);
+       ref = --gp->refcnt;
+       spin_unlock(&kvm->mmu_lock);
+       if (ref == 0)
+               kvmhv_release_nested(gp);
+}
+
+static struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
+{
+       if (lpid > kvm->arch.max_nested_lpid)
+               return NULL;
+       return kvm->arch.nested_guests[lpid];
+}
+
+static inline bool kvmhv_n_rmap_is_equal(u64 rmap_1, u64 rmap_2)
+{
+       return !((rmap_1 ^ rmap_2) & (RMAP_NESTED_LPID_MASK |
+                                      RMAP_NESTED_GPA_MASK));
+}
+
+void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
+                           struct rmap_nested **n_rmap)
+{
+       struct llist_node *entry = ((struct llist_head *) rmapp)->first;
+       struct rmap_nested *cursor;
+       u64 rmap, new_rmap = (*n_rmap)->rmap;
+
+       /* Are there any existing entries? */
+       if (!(*rmapp)) {
+               /* No -> use the rmap as a single entry */
+               *rmapp = new_rmap | RMAP_NESTED_IS_SINGLE_ENTRY;
+               return;
+       }
+
+       /* Do any entries match what we're trying to insert? */
+       for_each_nest_rmap_safe(cursor, entry, &rmap) {
+               if (kvmhv_n_rmap_is_equal(rmap, new_rmap))
+                       return;
+       }
+
+       /* Do we need to create a list or just add the new entry? */
+       rmap = *rmapp;
+       if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
+               *rmapp = 0UL;
+       llist_add(&((*n_rmap)->list), (struct llist_head *) rmapp);
+       if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
+               (*n_rmap)->list.next = (struct llist_node *) rmap;
+
+       /* Set NULL so not freed by caller */
+       *n_rmap = NULL;
+}
+
+static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap,
+                                  unsigned long hpa, unsigned long mask)
+{
+       struct kvm_nested_guest *gp;
+       unsigned long gpa;
+       unsigned int shift, lpid;
+       pte_t *ptep;
+
+       gpa = n_rmap & RMAP_NESTED_GPA_MASK;
+       lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
+       gp = kvmhv_find_nested(kvm, lpid);
+       if (!gp)
+               return;
+
+       /* Find and invalidate the pte */
+       ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
+       /* Don't spuriously invalidate ptes if the pfn has changed */
+       if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa))
+               kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
+}
+
+static void kvmhv_remove_nest_rmap_list(struct kvm *kvm, unsigned long *rmapp,
+                                       unsigned long hpa, unsigned long mask)
+{
+       struct llist_node *entry = llist_del_all((struct llist_head *) rmapp);
+       struct rmap_nested *cursor;
+       unsigned long rmap;
+
+       for_each_nest_rmap_safe(cursor, entry, &rmap) {
+               kvmhv_remove_nest_rmap(kvm, rmap, hpa, mask);
+               kfree(cursor);
+       }
+}
+
+/* called with kvm->mmu_lock held */
+void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
+                                 struct kvm_memory_slot *memslot,
+                                 unsigned long gpa, unsigned long hpa,
+                                 unsigned long nbytes)
+{
+       unsigned long gfn, end_gfn;
+       unsigned long addr_mask;
+
+       if (!memslot)
+               return;
+       gfn = (gpa >> PAGE_SHIFT) - memslot->base_gfn;
+       end_gfn = gfn + (nbytes >> PAGE_SHIFT);
+
+       addr_mask = PTE_RPN_MASK & ~(nbytes - 1);
+       hpa &= addr_mask;
+
+       for (; gfn < end_gfn; gfn++) {
+               unsigned long *rmap = &memslot->arch.rmap[gfn];
+               kvmhv_remove_nest_rmap_list(kvm, rmap, hpa, addr_mask);
+       }
+}
+
+static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free)
+{
+       unsigned long page;
+
+       for (page = 0; page < free->npages; page++) {
+               unsigned long rmap, *rmapp = &free->arch.rmap[page];
+               struct rmap_nested *cursor;
+               struct llist_node *entry;
+
+               entry = llist_del_all((struct llist_head *) rmapp);
+               for_each_nest_rmap_safe(cursor, entry, &rmap)
+                       kfree(cursor);
+       }
+}
+
+static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu,
+                                       struct kvm_nested_guest *gp,
+                                       long gpa, int *shift_ret)
+{
+       struct kvm *kvm = vcpu->kvm;
+       bool ret = false;
+       pte_t *ptep;
+       int shift;
+
+       spin_lock(&kvm->mmu_lock);
+       ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
+       if (!shift)
+               shift = PAGE_SHIFT;
+       if (ptep && pte_present(*ptep)) {
+               kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
+               ret = true;
+       }
+       spin_unlock(&kvm->mmu_lock);
+
+       if (shift_ret)
+               *shift_ret = shift;
+       return ret;
+}
+
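+/*
+ * Decode fields of the tlbie instruction image (RIC, PRS, R) and of
+ * its rS/rB register operands (LPID, IS, AP, EPN).
+ */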
+static inline int get_ric(unsigned int instr)
+{
+       return (instr >> 18) & 0x3;
+}
+
+static inline int get_prs(unsigned int instr)
+{
+       return (instr >> 17) & 0x1;
+}
+
+static inline int get_r(unsigned int instr)
+{
+       return (instr >> 16) & 0x1;
+}
+
+static inline int get_lpid(unsigned long r_val)
+{
+       return r_val & 0xffffffff;
+}
+
+static inline int get_is(unsigned long r_val)
+{
+       return (r_val >> 10) & 0x3;
+}
+
+static inline int get_ap(unsigned long r_val)
+{
+       return (r_val >> 5) & 0x7;
+}
+
+static inline long get_epn(unsigned long r_val)
+{
+       return r_val >> 12;
+}
+
+static int kvmhv_emulate_tlbie_tlb_addr(struct kvm_vcpu *vcpu, int lpid,
+                                       int ap, long epn)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_nested_guest *gp;
+       long npages;
+       int shift, shadow_shift;
+       unsigned long addr;
+
+       shift = ap_to_shift(ap);
+       addr = epn << 12;
+       if (shift < 0)
+               /* Invalid ap encoding */
+               return -EINVAL;
+
+       addr &= ~((1UL << shift) - 1);
+       npages = 1UL << (shift - PAGE_SHIFT);
+
+       gp = kvmhv_get_nested(kvm, lpid, false);
+       if (!gp) /* No such guest -> nothing to do */
+               return 0;
+       mutex_lock(&gp->tlb_lock);
+
+       /* There may be more than one host page backing this single guest pte */
+       do {
+               kvmhv_invalidate_shadow_pte(vcpu, gp, addr, &shadow_shift);
+
+               npages -= 1UL << (shadow_shift - PAGE_SHIFT);
+               addr += 1UL << shadow_shift;
+       } while (npages > 0);
+
+       mutex_unlock(&gp->tlb_lock);
+       kvmhv_put_nested(gp);
+       return 0;
+}
+
+static void kvmhv_emulate_tlbie_lpid(struct kvm_vcpu *vcpu,
+                                    struct kvm_nested_guest *gp, int ric)
+{
+       struct kvm *kvm = vcpu->kvm;
+
+       mutex_lock(&gp->tlb_lock);
+       switch (ric) {
+       case 0:
+               /* Invalidate TLB */
+               spin_lock(&kvm->mmu_lock);
+               kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
+                                         gp->shadow_lpid);
+               kvmhv_flush_lpid(gp->shadow_lpid);
+               spin_unlock(&kvm->mmu_lock);
+               break;
+       case 1:
+               /*
+                * Invalidate PWC
+                * We don't cache this -> nothing to do
+                */
+               break;
+       case 2:
+               /* Invalidate TLB, PWC and caching of partition table entries */
+               kvmhv_flush_nested(gp);
+               break;
+       default:
+               break;
+       }
+       mutex_unlock(&gp->tlb_lock);
+}
+
+static void kvmhv_emulate_tlbie_all_lpid(struct kvm_vcpu *vcpu, int ric)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_nested_guest *gp;
+       int i;
+
+       spin_lock(&kvm->mmu_lock);
+       for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
+               gp = kvm->arch.nested_guests[i];
+               if (gp) {
+                       spin_unlock(&kvm->mmu_lock);
+                       kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
+                       spin_lock(&kvm->mmu_lock);
+               }
+       }
+       spin_unlock(&kvm->mmu_lock);
+}
+
+static int kvmhv_emulate_priv_tlbie(struct kvm_vcpu *vcpu, unsigned int instr,
+                                   unsigned long rsval, unsigned long rbval)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_nested_guest *gp;
+       int r, ric, prs, is, ap;
+       int lpid;
+       long epn;
+       int ret = 0;
+
+       ric = get_ric(instr);
+       prs = get_prs(instr);
+       r = get_r(instr);
+       lpid = get_lpid(rsval);
+       is = get_is(rbval);
+
+       /*
+        * These cases are invalid and are not handled:
+        * r   != 1 -> Only radix supported
+        * prs == 1 -> Not HV privileged
+        * ric == 3 -> No cluster bombs for radix
+        * is  == 1 -> Partition scoped translations not associated with pid
+        * (!is) && (ric == 1 || ric == 2) -> Not supported by ISA
+        */
+       if ((!r) || (prs) || (ric == 3) || (is == 1) ||
+           ((!is) && (ric == 1 || ric == 2)))
+               return -EINVAL;
+
+       switch (is) {
+       case 0:
+               /*
+                * We know ric == 0
+                * Invalidate TLB for a given target address
+                */
+               epn = get_epn(rbval);
+               ap = get_ap(rbval);
+               ret = kvmhv_emulate_tlbie_tlb_addr(vcpu, lpid, ap, epn);
+               break;
+       case 2:
+               /* Invalidate matching LPID */
+               gp = kvmhv_get_nested(kvm, lpid, false);
+               if (gp) {
+                       kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
+                       kvmhv_put_nested(gp);
+               }
+               break;
+       case 3:
+               /* Invalidate ALL LPIDs */
+               kvmhv_emulate_tlbie_all_lpid(vcpu, ric);
+               break;
+       default:
+               ret = -EINVAL;
+               break;
+       }
+
+       return ret;
+}
+
+/*
+ * This handles the H_TLB_INVALIDATE hcall.
+ * Parameters are (r4) tlbie instruction code, (r5) rS contents,
+ * (r6) rB contents.
+ */
+long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu)
+{
+       int ret;
+
+       ret = kvmhv_emulate_priv_tlbie(vcpu, kvmppc_get_gpr(vcpu, 4),
+                       kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 6));
+       if (ret)
+               return H_PARAMETER;
+       return H_SUCCESS;
+}
+
+/* Used to convert a nested guest real address to an L1 guest real address */
+static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu,
+                                      struct kvm_nested_guest *gp,
+                                      unsigned long n_gpa, unsigned long dsisr,
+                                      struct kvmppc_pte *gpte_p)
+{
+       u64 fault_addr, flags = dsisr & DSISR_ISSTORE;
+       int ret;
+
+       ret = kvmppc_mmu_walk_radix_tree(vcpu, n_gpa, gpte_p, gp->l1_gr_to_hr,
+                                        &fault_addr);
+
+       if (ret) {
+               /* We didn't find a pte */
+               if (ret == -EINVAL) {
+                       /* Unsupported mmu config */
+                       flags |= DSISR_UNSUPP_MMU;
+               } else if (ret == -ENOENT) {
+                       /* No translation found */
+                       flags |= DSISR_NOHPTE;
+               } else if (ret == -EFAULT) {
+                       /* Couldn't access L1 real address */
+                       flags |= DSISR_PRTABLE_FAULT;
+                       vcpu->arch.fault_gpa = fault_addr;
+               } else {
+                       /* Unknown error */
+                       return ret;
+               }
+               goto forward_to_l1;
+       } else {
+               /* We found a pte -> check permissions */
+               if (dsisr & DSISR_ISSTORE) {
+                       /* Can we write? */
+                       if (!gpte_p->may_write) {
+                               flags |= DSISR_PROTFAULT;
+                               goto forward_to_l1;
+                       }
+               } else if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
+                       /* Can we execute? */
+                       if (!gpte_p->may_execute) {
+                               flags |= SRR1_ISI_N_OR_G;
+                               goto forward_to_l1;
+                       }
+               } else {
+                       /* Can we read? */
+                       if (!gpte_p->may_read && !gpte_p->may_write) {
+                               flags |= DSISR_PROTFAULT;
+                               goto forward_to_l1;
+                       }
+               }
+       }
+
+       return 0;
+
+forward_to_l1:
+       vcpu->arch.fault_dsisr = flags;
+       if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
+               vcpu->arch.shregs.msr &= ~0x783f0000ul;
+               vcpu->arch.shregs.msr |= flags;
+       }
+       return RESUME_HOST;
+}
+
+static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu,
+                                      struct kvm_nested_guest *gp,
+                                      unsigned long n_gpa,
+                                      struct kvmppc_pte gpte,
+                                      unsigned long dsisr)
+{
+       struct kvm *kvm = vcpu->kvm;
+       bool writing = !!(dsisr & DSISR_ISSTORE);
+       u64 pgflags;
+       bool ret;
+
+       /* Are the rc bits set in the L1 partition scoped pte? */
+       pgflags = _PAGE_ACCESSED;
+       if (writing)
+               pgflags |= _PAGE_DIRTY;
+       if (pgflags & ~gpte.rc)
+               return RESUME_HOST;
+
+       spin_lock(&kvm->mmu_lock);
+       /* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */
+       ret = kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, writing,
+                                    gpte.raddr, kvm->arch.lpid);
+       spin_unlock(&kvm->mmu_lock);
+       if (!ret)
+               return -EINVAL;
+
+       /* Set the rc bit in the pte of the shadow_pgtable for the nested guest */
+       ret = kvmppc_hv_handle_set_rc(kvm, gp->shadow_pgtable, writing, n_gpa,
+                                     gp->shadow_lpid);
+       if (!ret)
+               return -EINVAL;
+       return 0;
+}
+
+static inline int kvmppc_radix_level_to_shift(int level)
+{
+       switch (level) {
+       case 2:
+               return PUD_SHIFT;
+       case 1:
+               return PMD_SHIFT;
+       default:
+               return PAGE_SHIFT;
+       }
+}
+
+static inline int kvmppc_radix_shift_to_level(int shift)
+{
+       if (shift == PUD_SHIFT)
+               return 2;
+       if (shift == PMD_SHIFT)
+               return 1;
+       if (shift == PAGE_SHIFT)
+               return 0;
+       WARN_ON_ONCE(1);
+       return 0;
+}
+
+/* called with gp->tlb_lock held */
+static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
+                                         struct kvm_nested_guest *gp)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_memory_slot *memslot;
+       struct rmap_nested *n_rmap;
+       struct kvmppc_pte gpte;
+       pte_t pte, *pte_p;
+       unsigned long mmu_seq;
+       unsigned long dsisr = vcpu->arch.fault_dsisr;
+       unsigned long ea = vcpu->arch.fault_dar;
+       unsigned long *rmapp;
+       unsigned long n_gpa, gpa, gfn, perm = 0UL;
+       unsigned int shift, l1_shift, level;
+       bool writing = !!(dsisr & DSISR_ISSTORE);
+       bool kvm_ro = false;
+       long int ret;
+
+       if (!gp->l1_gr_to_hr) {
+               kvmhv_update_ptbl_cache(gp);
+               if (!gp->l1_gr_to_hr)
+                       return RESUME_HOST;
+       }
+
+       /* Convert the nested guest real address into an L1 guest real address */
+
+       n_gpa = vcpu->arch.fault_gpa & ~0xF000000000000FFFULL;
+       if (!(dsisr & DSISR_PRTABLE_FAULT))
+               n_gpa |= ea & 0xFFF;
+       ret = kvmhv_translate_addr_nested(vcpu, gp, n_gpa, dsisr, &gpte);
+
+       /*
+        * If the hardware found a translation but we don't now have a usable
+        * translation in the l1 partition-scoped tree, remove the shadow pte
+        * and let the guest retry.
+        */
+       if (ret == RESUME_HOST &&
+           (dsisr & (DSISR_PROTFAULT | DSISR_BADACCESS | DSISR_NOEXEC_OR_G |
+                     DSISR_BAD_COPYPASTE)))
+               goto inval;
+       if (ret)
+               return ret;
+
+       /* Failed to set the reference/change bits */
+       if (dsisr & DSISR_SET_RC) {
+               ret = kvmhv_handle_nested_set_rc(vcpu, gp, n_gpa, gpte, dsisr);
+               if (ret == RESUME_HOST)
+                       return ret;
+               if (ret)
+                       goto inval;
+               dsisr &= ~DSISR_SET_RC;
+               if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
+                              DSISR_PROTFAULT)))
+                       return RESUME_GUEST;
+       }
+
+       /*
+        * We took an HISI or HDSI while we were running a nested guest, which
+        * means we have no partition-scoped translation for that address, so
+        * we need to insert a pte for the mapping into our shadow_pgtable.
+        */
+
+       l1_shift = gpte.page_shift;
+       if (l1_shift < PAGE_SHIFT) {
+               /* We don't support l1 using a page size smaller than our own */
+               pr_err("KVM: L1 guest page shift (%d) less than our own (%d)\n",
+                       l1_shift, PAGE_SHIFT);
+               return -EINVAL;
+       }
+       gpa = gpte.raddr;
+       gfn = gpa >> PAGE_SHIFT;
+
+       /* 1. Get the corresponding host memslot */
+
+       memslot = gfn_to_memslot(kvm, gfn);
+       if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
+               if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS)) {
+                       /* unusual error -> reflect to the guest as a DSI */
+                       kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+                       return RESUME_GUEST;
+               }
+               /* passthrough of emulated MMIO case... */
+               pr_err("emulated MMIO passthrough?\n");
+               return -EINVAL;
+       }
+       if (memslot->flags & KVM_MEM_READONLY) {
+               if (writing) {
+                       /* Give the guest a DSI */
+                       kvmppc_core_queue_data_storage(vcpu, ea,
+                                       DSISR_ISSTORE | DSISR_PROTFAULT);
+                       return RESUME_GUEST;
+               }
+               kvm_ro = true;
+       }
+
+       /* 2. Find the host pte for this L1 guest real address */
+
+       /* Used to check for invalidations in progress */
+       mmu_seq = kvm->mmu_notifier_seq;
+       smp_rmb();
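+       /* order the mmu_notifier_seq read before the pte lookups below */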
+
+       /* See if we can find a translation in our partition-scoped tables for L1 */
+       pte = __pte(0);
+       spin_lock(&kvm->mmu_lock);
+       pte_p = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
+       if (!shift)
+               shift = PAGE_SHIFT;
+       if (pte_p)
+               pte = *pte_p;
+       spin_unlock(&kvm->mmu_lock);
+
+       if (!pte_present(pte) || (writing && !(pte_val(pte) & _PAGE_WRITE))) {
+               /* No suitable pte found -> try to insert a mapping */
+               ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot,
+                                       writing, kvm_ro, &pte, &level);
+               if (ret == -EAGAIN)
+                       return RESUME_GUEST;
+               else if (ret)
+                       return ret;
+               shift = kvmppc_radix_level_to_shift(level);
+       }
+
+       /* 3. Compute the pte we need to insert for nest_gpa -> host r_addr */
+
+       /* The permissions are the combination of the host and L1 guest ptes */
+       perm |= gpte.may_read ? 0UL : _PAGE_READ;
+       perm |= gpte.may_write ? 0UL : _PAGE_WRITE;
+       perm |= gpte.may_execute ? 0UL : _PAGE_EXEC;
+       pte = __pte(pte_val(pte) & ~perm);
+
+       /* What size pte can we insert? */
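+       /*
+        * If the host page is larger than the L1 page, map at the largest
+        * size we support that does not exceed l1_shift, and fold the
+        * remaining gpa offset bits into the pte.
+        */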
+       if (shift > l1_shift) {
+               u64 mask;
+               unsigned int actual_shift = PAGE_SHIFT;
+               if (PMD_SHIFT < l1_shift)
+                       actual_shift = PMD_SHIFT;
+               mask = (1UL << shift) - (1UL << actual_shift);
+               pte = __pte(pte_val(pte) | (gpa & mask));
+               shift = actual_shift;
+       }
+       level = kvmppc_radix_shift_to_level(shift);
+       n_gpa &= ~((1UL << shift) - 1);
+
+       /* 4. Insert the pte into our shadow_pgtable */
+
+       n_rmap = kzalloc(sizeof(*n_rmap), GFP_KERNEL);
+       if (!n_rmap)
+               return RESUME_GUEST; /* Let the guest try again */
+       n_rmap->rmap = (n_gpa & RMAP_NESTED_GPA_MASK) |
+               (((unsigned long) gp->l1_lpid) << RMAP_NESTED_LPID_SHIFT);
+       rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
+       ret = kvmppc_create_pte(kvm, gp->shadow_pgtable, pte, n_gpa, level,
+                               mmu_seq, gp->shadow_lpid, rmapp, &n_rmap);
+       kfree(n_rmap);          /* NULL if ownership passed to the rmap list */
+       if (ret == -EAGAIN)
+               ret = RESUME_GUEST;     /* Let the guest try again */
+
+       return ret;
+
+ inval:
+       kvmhv_invalidate_shadow_pte(vcpu, gp, n_gpa, NULL);
+       return RESUME_GUEST;
+}
+
+long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu)
+{
+       struct kvm_nested_guest *gp = vcpu->arch.nested;
+       long int ret;
+
+       mutex_lock(&gp->tlb_lock);
+       ret = __kvmhv_nested_page_fault(vcpu, gp);
+       mutex_unlock(&gp->tlb_lock);
+       return ret;
+}
+
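+/* Return the next LPID > lpid that has a nested guest, or -1 if none */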
+int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid)
+{
+       int ret = -1;
+
+       spin_lock(&kvm->mmu_lock);
+       while (++lpid <= kvm->arch.max_nested_lpid) {
+               if (kvm->arch.nested_guests[lpid]) {
+                       ret = lpid;
+                       break;
+               }
+       }
+       spin_unlock(&kvm->mmu_lock);
+       return ret;
+}
index b11043b..0787f12 100644 (file)
@@ -177,6 +177,7 @@ void kvmppc_subcore_enter_guest(void)
 
        local_paca->sibling_subcore_state->in_guest[subcore_id] = 1;
 }
+EXPORT_SYMBOL_GPL(kvmppc_subcore_enter_guest);
 
 void kvmppc_subcore_exit_guest(void)
 {
@@ -187,6 +188,7 @@ void kvmppc_subcore_exit_guest(void)
 
        local_paca->sibling_subcore_state->in_guest[subcore_id] = 0;
 }
+EXPORT_SYMBOL_GPL(kvmppc_subcore_exit_guest);
 
 static bool kvmppc_tb_resync_required(void)
 {
@@ -331,5 +333,13 @@ long kvmppc_realmode_hmi_handler(void)
        } else {
                wait_for_tb_resync();
        }
+
+       /*
+        * Reset tb_offset_applied so the guest exit code won't try
+        * to subtract the previous timebase offset from the timebase.
+        */
+       if (local_paca->kvm_hstate.kvm_vcore)
+               local_paca->kvm_hstate.kvm_vcore->tb_offset_applied = 0;
+
        return 0;
 }
index 758d1d2..b3f5786 100644 (file)
@@ -136,7 +136,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
 
        /* Mark the target VCPU as having an interrupt pending */
        vcpu->stat.queue_intr++;
-       set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
+       set_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
 
        /* Kick self ? Just set MER and return */
        if (vcpu == this_vcpu) {
@@ -170,8 +170,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
 static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu)
 {
        /* Note: Only called on self ! */
-       clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL,
-                 &vcpu->arch.pending_exceptions);
+       clear_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
        mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER);
 }
 
@@ -768,6 +767,14 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
        void __iomem *xics_phys;
        int64_t rc;
 
+       if (kvmhv_on_pseries()) {
+               unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+               iosync();
+               plpar_hcall_raw(H_EOI, retbuf, hwirq);
+               return;
+       }
+
        rc = pnv_opal_pci_msi_eoi(c, hwirq);
 
        if (rc)
index 1d14046..9b8d50a 100644 (file)
@@ -28,6 +28,7 @@
 #include <asm/exception-64s.h>
 #include <asm/kvm_book3s_asm.h>
 #include <asm/book3s/64/mmu-hash.h>
+#include <asm/export.h>
 #include <asm/tm.h>
 #include <asm/opal.h>
 #include <asm/xive-regs.h>
@@ -46,8 +47,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 #define NAPPING_NOVCPU 2
 
 /* Stack frame offsets for kvmppc_hv_entry */
-#define SFS                    160
+#define SFS                    208
 #define STACK_SLOT_TRAP                (SFS-4)
+#define STACK_SLOT_SHORT_PATH  (SFS-8)
 #define STACK_SLOT_TID         (SFS-16)
 #define STACK_SLOT_PSSCR       (SFS-24)
 #define STACK_SLOT_PID         (SFS-32)
@@ -56,6 +58,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 #define STACK_SLOT_DAWR                (SFS-56)
 #define STACK_SLOT_DAWRX       (SFS-64)
 #define STACK_SLOT_HFSCR       (SFS-72)
+/* the following is used by the P9 short path */
+#define STACK_SLOT_NVGPRS      (SFS-152)       /* 18 gprs */
 
 /*
  * Call kvmppc_hv_entry in real mode.
@@ -113,45 +117,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        mtspr   SPRN_SPRG_VDSO_WRITE,r3
 
        /* Reload the host's PMU registers */
-       lbz     r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */
-       cmpwi   r4, 0
-       beq     23f                     /* skip if not */
-BEGIN_FTR_SECTION
-       ld      r3, HSTATE_MMCR0(r13)
-       andi.   r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
-       cmpwi   r4, MMCR0_PMAO
-       beql    kvmppc_fix_pmao
-END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
-       lwz     r3, HSTATE_PMC1(r13)
-       lwz     r4, HSTATE_PMC2(r13)
-       lwz     r5, HSTATE_PMC3(r13)
-       lwz     r6, HSTATE_PMC4(r13)
-       lwz     r8, HSTATE_PMC5(r13)
-       lwz     r9, HSTATE_PMC6(r13)
-       mtspr   SPRN_PMC1, r3
-       mtspr   SPRN_PMC2, r4
-       mtspr   SPRN_PMC3, r5
-       mtspr   SPRN_PMC4, r6
-       mtspr   SPRN_PMC5, r8
-       mtspr   SPRN_PMC6, r9
-       ld      r3, HSTATE_MMCR0(r13)
-       ld      r4, HSTATE_MMCR1(r13)
-       ld      r5, HSTATE_MMCRA(r13)
-       ld      r6, HSTATE_SIAR(r13)
-       ld      r7, HSTATE_SDAR(r13)
-       mtspr   SPRN_MMCR1, r4
-       mtspr   SPRN_MMCRA, r5
-       mtspr   SPRN_SIAR, r6
-       mtspr   SPRN_SDAR, r7
-BEGIN_FTR_SECTION
-       ld      r8, HSTATE_MMCR2(r13)
-       ld      r9, HSTATE_SIER(r13)
-       mtspr   SPRN_MMCR2, r8
-       mtspr   SPRN_SIER, r9
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-       mtspr   SPRN_MMCR0, r3
-       isync
-23:
+       bl      kvmhv_load_host_pmu
 
        /*
         * Reload DEC.  HDEC interrupts were disabled when
@@ -796,66 +762,23 @@ BEGIN_FTR_SECTION
        b       91f
 END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
        /*
-        * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+        * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
         */
        mr      r3, r4
        ld      r4, VCPU_MSR(r3)
+       li      r5, 0                   /* don't preserve non-vol regs */
        bl      kvmppc_restore_tm_hv
+       nop
        ld      r4, HSTATE_KVM_VCPU(r13)
 91:
 #endif
 
-       /* Load guest PMU registers */
-       /* R4 is live here (vcpu pointer) */
-       li      r3, 1
-       sldi    r3, r3, 31              /* MMCR0_FC (freeze counters) bit */
-       mtspr   SPRN_MMCR0, r3          /* freeze all counters, disable ints */
-       isync
-BEGIN_FTR_SECTION
-       ld      r3, VCPU_MMCR(r4)
-       andi.   r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
-       cmpwi   r5, MMCR0_PMAO
-       beql    kvmppc_fix_pmao
-END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
-       lwz     r3, VCPU_PMC(r4)        /* always load up guest PMU registers */
-       lwz     r5, VCPU_PMC + 4(r4)    /* to prevent information leak */
-       lwz     r6, VCPU_PMC + 8(r4)
-       lwz     r7, VCPU_PMC + 12(r4)
-       lwz     r8, VCPU_PMC + 16(r4)
-       lwz     r9, VCPU_PMC + 20(r4)
-       mtspr   SPRN_PMC1, r3
-       mtspr   SPRN_PMC2, r5
-       mtspr   SPRN_PMC3, r6
-       mtspr   SPRN_PMC4, r7
-       mtspr   SPRN_PMC5, r8
-       mtspr   SPRN_PMC6, r9
-       ld      r3, VCPU_MMCR(r4)
-       ld      r5, VCPU_MMCR + 8(r4)
-       ld      r6, VCPU_MMCR + 16(r4)
-       ld      r7, VCPU_SIAR(r4)
-       ld      r8, VCPU_SDAR(r4)
-       mtspr   SPRN_MMCR1, r5
-       mtspr   SPRN_MMCRA, r6
-       mtspr   SPRN_SIAR, r7
-       mtspr   SPRN_SDAR, r8
-BEGIN_FTR_SECTION
-       ld      r5, VCPU_MMCR + 24(r4)
-       ld      r6, VCPU_SIER(r4)
-       mtspr   SPRN_MMCR2, r5
-       mtspr   SPRN_SIER, r6
-BEGIN_FTR_SECTION_NESTED(96)
-       lwz     r7, VCPU_PMC + 24(r4)
-       lwz     r8, VCPU_PMC + 28(r4)
-       ld      r9, VCPU_MMCR + 32(r4)
-       mtspr   SPRN_SPMC1, r7
-       mtspr   SPRN_SPMC2, r8
-       mtspr   SPRN_MMCRS, r9
-END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-       mtspr   SPRN_MMCR0, r3
-       isync
+       /* Load guest PMU registers; r4 = vcpu pointer here */
+       mr      r3, r4
+       bl      kvmhv_load_guest_pmu
 
        /* Load up FP, VMX and VSX registers */
+       ld      r4, HSTATE_KVM_VCPU(r13)
        bl      kvmppc_load_fp
 
        ld      r14, VCPU_GPR(R14)(r4)
@@ -1100,73 +1023,40 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 no_xive:
 #endif /* CONFIG_KVM_XICS */
 
-deliver_guest_interrupt:
-       ld      r6, VCPU_CTR(r4)
-       ld      r7, VCPU_XER(r4)
-
-       mtctr   r6
-       mtxer   r7
+       li      r0, 0
+       stw     r0, STACK_SLOT_SHORT_PATH(r1)
 
-kvmppc_cede_reentry:           /* r4 = vcpu, r13 = paca */
-       ld      r10, VCPU_PC(r4)
-       ld      r11, VCPU_MSR(r4)
+deliver_guest_interrupt:       /* r4 = vcpu, r13 = paca */
+       /* Check if we can deliver an external or decrementer interrupt now */
+       ld      r0, VCPU_PENDING_EXC(r4)
+BEGIN_FTR_SECTION
+       /* On POWER9, also check for emulated doorbell interrupt */
+       lbz     r3, VCPU_DBELL_REQ(r4)
+       or      r0, r0, r3
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+       cmpdi   r0, 0
+       beq     71f
+       mr      r3, r4
+       bl      kvmppc_guest_entry_inject_int
+       ld      r4, HSTATE_KVM_VCPU(r13)
+71:
        ld      r6, VCPU_SRR0(r4)
        ld      r7, VCPU_SRR1(r4)
        mtspr   SPRN_SRR0, r6
        mtspr   SPRN_SRR1, r7
 
+fast_guest_entry_c:
+       ld      r10, VCPU_PC(r4)
+       ld      r11, VCPU_MSR(r4)
        /* r11 = vcpu->arch.msr & ~MSR_HV */
        rldicl  r11, r11, 63 - MSR_HV_LG, 1
        rotldi  r11, r11, 1 + MSR_HV_LG
        ori     r11, r11, MSR_ME
 
-       /* Check if we can deliver an external or decrementer interrupt now */
-       ld      r0, VCPU_PENDING_EXC(r4)
-       rldicl  r0, r0, 64 - BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 63
-       cmpdi   cr1, r0, 0
-       andi.   r8, r11, MSR_EE
-       mfspr   r8, SPRN_LPCR
-       /* Insert EXTERNAL_LEVEL bit into LPCR at the MER bit position */
-       rldimi  r8, r0, LPCR_MER_SH, 63 - LPCR_MER_SH
-       mtspr   SPRN_LPCR, r8
-       isync
-       beq     5f
-       li      r0, BOOK3S_INTERRUPT_EXTERNAL
-       bne     cr1, 12f
-       mfspr   r0, SPRN_DEC
-BEGIN_FTR_SECTION
-       /* On POWER9 check whether the guest has large decrementer enabled */
-       andis.  r8, r8, LPCR_LD@h
-       bne     15f
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
-       extsw   r0, r0
-15:    cmpdi   r0, 0
-       li      r0, BOOK3S_INTERRUPT_DECREMENTER
-       bge     5f
-
-12:    mtspr   SPRN_SRR0, r10
-       mr      r10,r0
-       mtspr   SPRN_SRR1, r11
-       mr      r9, r4
-       bl      kvmppc_msr_interrupt
-5:
-BEGIN_FTR_SECTION
-       b       fast_guest_return
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
-       /* On POWER9, check for pending doorbell requests */
-       lbz     r0, VCPU_DBELL_REQ(r4)
-       cmpwi   r0, 0
-       beq     fast_guest_return
-       ld      r5, HSTATE_KVM_VCORE(r13)
-       /* Set DPDES register so the CPU will take a doorbell interrupt */
-       li      r0, 1
-       mtspr   SPRN_DPDES, r0
-       std     r0, VCORE_DPDES(r5)
-       /* Make sure other cpus see vcore->dpdes set before dbell req clear */
-       lwsync
-       /* Clear the pending doorbell request */
-       li      r0, 0
-       stb     r0, VCPU_DBELL_REQ(r4)
+       ld      r6, VCPU_CTR(r4)
+       ld      r7, VCPU_XER(r4)
+       mtctr   r6
+       mtxer   r7
 
 /*
  * Required state:
@@ -1202,7 +1092,7 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 
        ld      r5, VCPU_LR(r4)
-       lwz     r6, VCPU_CR(r4)
+       ld      r6, VCPU_CR(r4)
        mtlr    r5
        mtcr    r6
 
@@ -1234,6 +1124,83 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
        HRFI_TO_GUEST
        b       .
 
+/*
+ * Enter the guest on a P9 or later system where we have exactly
+ * one vcpu per vcore and we don't need to go to real mode
+ * (which implies that host and guest are both using radix MMU mode).
+ * r3 = vcpu pointer
+ * Most SPRs and all the VSRs have been loaded already.
+ */
+_GLOBAL(__kvmhv_vcpu_entry_p9)
+EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9)
+       mflr    r0
+       std     r0, PPC_LR_STKOFF(r1)
+       stdu    r1, -SFS(r1)
+
+       li      r0, 1
+       stw     r0, STACK_SLOT_SHORT_PATH(r1)
+
+       std     r3, HSTATE_KVM_VCPU(r13)
+       mfcr    r4
+       stw     r4, SFS+8(r1)
+
+       std     r1, HSTATE_HOST_R1(r13)
+
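+       /* Save the host's non-volatile GPRs (r14-r31) in our stack frame */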
+       reg = 14
+       .rept   18
+       std     reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
+       reg = reg + 1
+       .endr
+
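+       /* Load the guest's non-volatile GPRs from the vcpu struct */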
+       reg = 14
+       .rept   18
+       ld      reg, __VCPU_GPR(reg)(r3)
+       reg = reg + 1
+       .endr
+
+       mfmsr   r10
+       std     r10, HSTATE_HOST_MSR(r13)
+
+       mr      r4, r3
+       b       fast_guest_entry_c
+guest_exit_short_path:
+
+       li      r0, KVM_GUEST_MODE_NONE
+       stb     r0, HSTATE_IN_GUEST(r13)
+
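+       /* Save the guest's non-volatile GPRs back to the vcpu struct */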
+       reg = 14
+       .rept   18
+       std     reg, __VCPU_GPR(reg)(r9)
+       reg = reg + 1
+       .endr
+
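+       /* Restore the host's non-volatile GPRs from the stack frame */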
+       reg = 14
+       .rept   18
+       ld      reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
+       reg = reg + 1
+       .endr
+
+       lwz     r4, SFS+8(r1)
+       mtcr    r4
+
+       mr      r3, r12         /* trap number */
+
+       addi    r1, r1, SFS
+       ld      r0, PPC_LR_STKOFF(r1)
+       mtlr    r0
+
+       /* If we are in real mode, do a rfid to get back to the caller */
+       mfmsr   r4
+       andi.   r5, r4, MSR_IR
+       bnelr
+       rldicl  r5, r4, 64 - MSR_TS_S_LG, 62    /* extract TS field */
+       mtspr   SPRN_SRR0, r0
+       ld      r10, HSTATE_HOST_MSR(r13)
+       rldimi  r10, r5, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+       mtspr   SPRN_SRR1, r10
+       RFI_TO_KERNEL
+       b       .
+
 secondary_too_late:
        li      r12, 0
        stw     r12, STACK_SLOT_TRAP(r1)
@@ -1313,7 +1280,7 @@ kvmppc_interrupt_hv:
        std     r3, VCPU_GPR(R12)(r9)
        /* CR is in the high half of r12 */
        srdi    r4, r12, 32
-       stw     r4, VCPU_CR(r9)
+       std     r4, VCPU_CR(r9)
 BEGIN_FTR_SECTION
        ld      r3, HSTATE_CFAR(r13)
        std     r3, VCPU_CFAR(r9)
@@ -1387,18 +1354,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
        std     r3, VCPU_CTR(r9)
        std     r4, VCPU_XER(r9)
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-       /* For softpatch interrupt, go off and do TM instruction emulation */
-       cmpwi   r12, BOOK3S_INTERRUPT_HV_SOFTPATCH
-       beq     kvmppc_tm_emul
-#endif
+       /* Save more register state */
+       mfdar   r3
+       mfdsisr r4
+       std     r3, VCPU_DAR(r9)
+       stw     r4, VCPU_DSISR(r9)
 
        /* If this is a page table miss then see if it's theirs or ours */
        cmpwi   r12, BOOK3S_INTERRUPT_H_DATA_STORAGE
        beq     kvmppc_hdsi
+       std     r3, VCPU_FAULT_DAR(r9)
+       stw     r4, VCPU_FAULT_DSISR(r9)
        cmpwi   r12, BOOK3S_INTERRUPT_H_INST_STORAGE
        beq     kvmppc_hisi
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       /* For softpatch interrupt, go off and do TM instruction emulation */
+       cmpwi   r12, BOOK3S_INTERRUPT_HV_SOFTPATCH
+       beq     kvmppc_tm_emul
+#endif
+
        /* See if this is a leftover HDEC interrupt */
        cmpwi   r12,BOOK3S_INTERRUPT_HV_DECREMENTER
        bne     2f
@@ -1418,10 +1393,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 BEGIN_FTR_SECTION
        PPC_MSGSYNC
        lwsync
+       /* always exit if we're running a nested guest */
+       ld      r0, VCPU_NESTED(r9)
+       cmpdi   r0, 0
+       bne     guest_exit_cont
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
        lbz     r0, HSTATE_HOST_IPI(r13)
        cmpwi   r0, 0
-       beq     4f
+       beq     maybe_reenter_guest
        b       guest_exit_cont
 3:
        /* If it's a hypervisor facility unavailable interrupt, save HFSCR */
@@ -1433,82 +1412,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 14:
        /* External interrupt ? */
        cmpwi   r12, BOOK3S_INTERRUPT_EXTERNAL
-       bne+    guest_exit_cont
-
-       /* External interrupt, first check for host_ipi. If this is
-        * set, we know the host wants us out so let's do it now
-        */
-       bl      kvmppc_read_intr
-
-       /*
-        * Restore the active volatile registers after returning from
-        * a C function.
-        */
-       ld      r9, HSTATE_KVM_VCPU(r13)
-       li      r12, BOOK3S_INTERRUPT_EXTERNAL
-
-       /*
-        * kvmppc_read_intr return codes:
-        *
-        * Exit to host (r3 > 0)
-        *   1 An interrupt is pending that needs to be handled by the host
-        *     Exit guest and return to host by branching to guest_exit_cont
-        *
-        *   2 Passthrough that needs completion in the host
-        *     Exit guest and return to host by branching to guest_exit_cont
-        *     However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
-        *     to indicate to the host to complete handling the interrupt
-        *
-        * Before returning to guest, we check if any CPU is heading out
-        * to the host and if so, we head out also. If no CPUs are heading
-        * check return values <= 0.
-        *
-        * Return to guest (r3 <= 0)
-        *  0 No external interrupt is pending
-        * -1 A guest wakeup IPI (which has now been cleared)
-        *    In either case, we return to guest to deliver any pending
-        *    guest interrupts.
-        *
-        * -2 A PCI passthrough external interrupt was handled
-        *    (interrupt was delivered directly to guest)
-        *    Return to guest to deliver any pending guest interrupts.
-        */
-
-       cmpdi   r3, 1
-       ble     1f
-
-       /* Return code = 2 */
-       li      r12, BOOK3S_INTERRUPT_HV_RM_HARD
-       stw     r12, VCPU_TRAP(r9)
-       b       guest_exit_cont
-
-1:     /* Return code <= 1 */
-       cmpdi   r3, 0
-       bgt     guest_exit_cont
-
-       /* Return code <= 0 */
-4:     ld      r5, HSTATE_KVM_VCORE(r13)
-       lwz     r0, VCORE_ENTRY_EXIT(r5)
-       cmpwi   r0, 0x100
-       mr      r4, r9
-       blt     deliver_guest_interrupt
-
-guest_exit_cont:               /* r9 = vcpu, r12 = trap, r13 = paca */
-       /* Save more register state  */
-       mfdar   r6
-       mfdsisr r7
-       std     r6, VCPU_DAR(r9)
-       stw     r7, VCPU_DSISR(r9)
-       /* don't overwrite fault_dar/fault_dsisr if HDSI */
-       cmpwi   r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
-       beq     mc_cont
-       std     r6, VCPU_FAULT_DAR(r9)
-       stw     r7, VCPU_FAULT_DSISR(r9)
-
+       beq     kvmppc_guest_external
        /* See if it is a machine check */
        cmpwi   r12, BOOK3S_INTERRUPT_MACHINE_CHECK
        beq     machine_check_realmode
-mc_cont:
+       /* Or a hypervisor maintenance interrupt */
+       cmpwi   r12, BOOK3S_INTERRUPT_HMI
+       beq     hmi_realmode
+
+guest_exit_cont:               /* r9 = vcpu, r12 = trap, r13 = paca */
+
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
        addi    r3, r9, VCPU_TB_RMEXIT
        mr      r4, r9
@@ -1552,6 +1465,11 @@ mc_cont:
 1:
 #endif /* CONFIG_KVM_XICS */
 
+       /* If we came in through the P9 short path, go back out to C now */
+       lwz     r0, STACK_SLOT_SHORT_PATH(r1)
+       cmpwi   r0, 0
+       bne     guest_exit_short_path
+
        /* For hash guest, read the guest SLB and save it away */
        ld      r5, VCPU_KVM(r9)
        lbz     r0, KVM_RADIX(r5)
@@ -1780,11 +1698,13 @@ BEGIN_FTR_SECTION
        b       91f
 END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
        /*
-        * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+        * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
         */
        mr      r3, r9
        ld      r4, VCPU_MSR(r3)
+       li      r5, 0                   /* don't preserve non-vol regs */
        bl      kvmppc_save_tm_hv
+       nop
        ld      r9, HSTATE_KVM_VCPU(r13)
 91:
 #endif
@@ -1802,90 +1722,19 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 25:
        /* Save PMU registers if requested */
        /* r8 and cr0.eq are live here */
+       mr      r3, r9
+       li      r4, 1
+       beq     21f                     /* if no VPA, save PMU stuff anyway */
+       lbz     r4, LPPACA_PMCINUSE(r8)
+21:    bl      kvmhv_save_guest_pmu
+       ld      r9, HSTATE_KVM_VCPU(r13)
+
+       /* Restore host values of some registers */
 BEGIN_FTR_SECTION
-       /*
-        * POWER8 seems to have a hardware bug where setting
-        * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE]
-        * when some counters are already negative doesn't seem
-        * to cause a performance monitor alert (and hence interrupt).
-        * The effect of this is that when saving the PMU state,
-        * if there is no PMU alert pending when we read MMCR0
-        * before freezing the counters, but one becomes pending
-        * before we read the counters, we lose it.
-        * To work around this, we need a way to freeze the counters
-        * before reading MMCR0.  Normally, freezing the counters
-        * is done by writing MMCR0 (to set MMCR0[FC]) which
-        * unavoidably writes MMCR0[PMA0] as well.  On POWER8,
-        * we can also freeze the counters using MMCR2, by writing
-        * 1s to all the counter freeze condition bits (there are
-        * 9 bits each for 6 counters).
-        */
-       li      r3, -1                  /* set all freeze bits */
-       clrrdi  r3, r3, 10
-       mfspr   r10, SPRN_MMCR2
-       mtspr   SPRN_MMCR2, r3
-       isync
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-       li      r3, 1
-       sldi    r3, r3, 31              /* MMCR0_FC (freeze counters) bit */
-       mfspr   r4, SPRN_MMCR0          /* save MMCR0 */
-       mtspr   SPRN_MMCR0, r3          /* freeze all counters, disable ints */
-       mfspr   r6, SPRN_MMCRA
-       /* Clear MMCRA in order to disable SDAR updates */
-       li      r7, 0
-       mtspr   SPRN_MMCRA, r7
-       isync
-       beq     21f                     /* if no VPA, save PMU stuff anyway */
-       lbz     r7, LPPACA_PMCINUSE(r8)
-       cmpwi   r7, 0                   /* did they ask for PMU stuff to be saved? */
-       bne     21f
-       std     r3, VCPU_MMCR(r9)       /* if not, set saved MMCR0 to FC */
-       b       22f
-21:    mfspr   r5, SPRN_MMCR1
-       mfspr   r7, SPRN_SIAR
-       mfspr   r8, SPRN_SDAR
-       std     r4, VCPU_MMCR(r9)
-       std     r5, VCPU_MMCR + 8(r9)
-       std     r6, VCPU_MMCR + 16(r9)
-BEGIN_FTR_SECTION
-       std     r10, VCPU_MMCR + 24(r9)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-       std     r7, VCPU_SIAR(r9)
-       std     r8, VCPU_SDAR(r9)
-       mfspr   r3, SPRN_PMC1
-       mfspr   r4, SPRN_PMC2
-       mfspr   r5, SPRN_PMC3
-       mfspr   r6, SPRN_PMC4
-       mfspr   r7, SPRN_PMC5
-       mfspr   r8, SPRN_PMC6
-       stw     r3, VCPU_PMC(r9)
-       stw     r4, VCPU_PMC + 4(r9)
-       stw     r5, VCPU_PMC + 8(r9)
-       stw     r6, VCPU_PMC + 12(r9)
-       stw     r7, VCPU_PMC + 16(r9)
-       stw     r8, VCPU_PMC + 20(r9)
-BEGIN_FTR_SECTION
-       mfspr   r5, SPRN_SIER
-       std     r5, VCPU_SIER(r9)
-BEGIN_FTR_SECTION_NESTED(96)
-       mfspr   r6, SPRN_SPMC1
-       mfspr   r7, SPRN_SPMC2
-       mfspr   r8, SPRN_MMCRS
-       stw     r6, VCPU_PMC + 24(r9)
-       stw     r7, VCPU_PMC + 28(r9)
-       std     r8, VCPU_MMCR + 32(r9)
-       lis     r4, 0x8000
-       mtspr   SPRN_MMCRS, r4
-END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-22:
-
-       /* Restore host values of some registers */
-BEGIN_FTR_SECTION
-       ld      r5, STACK_SLOT_CIABR(r1)
-       ld      r6, STACK_SLOT_DAWR(r1)
-       ld      r7, STACK_SLOT_DAWRX(r1)
-       mtspr   SPRN_CIABR, r5
+       ld      r5, STACK_SLOT_CIABR(r1)
+       ld      r6, STACK_SLOT_DAWR(r1)
+       ld      r7, STACK_SLOT_DAWRX(r1)
+       mtspr   SPRN_CIABR, r5
        /*
         * If the DAWR doesn't work, it's ok to write these here as
         * this value should always be zero
@@ -2010,24 +1859,6 @@ BEGIN_FTR_SECTION
        mtspr   SPRN_DPDES, r8
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
-       /* If HMI, call kvmppc_realmode_hmi_handler() */
-       lwz     r12, STACK_SLOT_TRAP(r1)
-       cmpwi   r12, BOOK3S_INTERRUPT_HMI
-       bne     27f
-       bl      kvmppc_realmode_hmi_handler
-       nop
-       cmpdi   r3, 0
-       /*
-        * At this point kvmppc_realmode_hmi_handler may have resync-ed
-        * the TB, and if it has, we must not subtract the guest timebase
-        * offset from the timebase. So, skip it.
-        *
-        * Also, do not call kvmppc_subcore_exit_guest() because it has
-        * been invoked as part of kvmppc_realmode_hmi_handler().
-        */
-       beq     30f
-
-27:
        /* Subtract timebase offset from timebase */
        ld      r8, VCORE_TB_OFFSET_APPL(r5)
        cmpdi   r8,0
@@ -2045,7 +1876,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        addis   r8,r8,0x100             /* if so, increment upper 40 bits */
        mtspr   SPRN_TBU40,r8
 
-17:    bl      kvmppc_subcore_exit_guest
+17:
+       /*
+        * If this is an HMI, we called kvmppc_realmode_hmi_handler
+        * above, which may or may not have already called
+        * kvmppc_subcore_exit_guest.  Fortunately, all that
+        * kvmppc_subcore_exit_guest does is clear a flag, so calling
+        * it again here is benign even if kvmppc_realmode_hmi_handler
+        * has already called it.
+        */
+       bl      kvmppc_subcore_exit_guest
        nop
 30:    ld      r5,HSTATE_KVM_VCORE(r13)
        ld      r4,VCORE_KVM(r5)        /* pointer to struct kvm */
@@ -2099,6 +1939,67 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
        mtlr    r0
        blr
 
+kvmppc_guest_external:
+       /* External interrupt, first check for host_ipi. If this is
+        * set, we know the host wants us out so let's do it now
+        */
+       bl      kvmppc_read_intr
+
+       /*
+        * Restore the active volatile registers after returning from
+        * a C function.
+        */
+       ld      r9, HSTATE_KVM_VCPU(r13)
+       li      r12, BOOK3S_INTERRUPT_EXTERNAL
+
+       /*
+        * kvmppc_read_intr return codes:
+        *
+        * Exit to host (r3 > 0)
+        *   1 An interrupt is pending that needs to be handled by the host
+        *     Exit guest and return to host by branching to guest_exit_cont
+        *
+        *   2 Passthrough that needs completion in the host
+        *     Exit guest and return to host by branching to guest_exit_cont
+        *     However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
+        *     to indicate to the host to complete handling the interrupt
+        *
+        * Before returning to guest, we check if any CPU is heading out
+        * to the host and if so, we head out also. If no CPUs are heading
+        * out, we handle the return values <= 0 as described below.
+        *
+        * Return to guest (r3 <= 0)
+        *  0 No external interrupt is pending
+        * -1 A guest wakeup IPI (which has now been cleared)
+        *    In either case, we return to guest to deliver any pending
+        *    guest interrupts.
+        *
+        * -2 A PCI passthrough external interrupt was handled
+        *    (interrupt was delivered directly to guest)
+        *    Return to guest to deliver any pending guest interrupts.
+        */
+
+       cmpdi   r3, 1
+       ble     1f
+
+       /* Return code = 2 */
+       li      r12, BOOK3S_INTERRUPT_HV_RM_HARD
+       stw     r12, VCPU_TRAP(r9)
+       b       guest_exit_cont
+
+1:     /* Return code <= 1 */
+       cmpdi   r3, 0
+       bgt     guest_exit_cont
+
+       /* Return code <= 0 */
+maybe_reenter_guest:
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       lwz     r0, VCORE_ENTRY_EXIT(r5)
+       cmpwi   r0, 0x100
+       mr      r4, r9
+       blt     deliver_guest_interrupt
+       b       guest_exit_cont
+
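For readers following the return-code contract documented above, here is a rough C rendering of the dispatch that kvmppc_guest_external and maybe_reenter_guest perform in assembly. This is a hedged sketch only; the EXIT_TO_HOST/REENTER_GUEST constants and vcore_is_exiting() helper are illustrative, not kernel APIs.

    /* Sketch of the kvmppc_guest_external dispatch; the return-code
     * meanings follow the comment block above. */
    static int dispatch_external_intr(struct kvm_vcpu *vcpu)
    {
            long r = kvmppc_read_intr();

            if (r == 2) {           /* passthrough needing host completion */
                    vcpu->arch.trap = BOOK3S_INTERRUPT_HV_RM_HARD;
                    return EXIT_TO_HOST;            /* illustrative constant */
            }
            if (r > 0)              /* an interrupt for the host is pending */
                    return EXIT_TO_HOST;
            /* r <= 0: nothing for the host; re-enter unless the vcore has
             * already started exiting (entry/exit count >= 0x100). */
            return vcore_is_exiting(vcpu) ? EXIT_TO_HOST : REENTER_GUEST;
    }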
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 /*
  * Softpatch interrupt for transactional memory emulation cases
@@ -2302,6 +2203,10 @@ hcall_try_real_mode:
        andi.   r0,r11,MSR_PR
        /* sc 1 from userspace - reflect to guest syscall */
        bne     sc_1_fast_return
+       /* sc 1 from nested guest - give it to L1 to handle */
+       ld      r0, VCPU_NESTED(r9)
+       cmpdi   r0, 0
+       bne     guest_exit_cont
        clrrdi  r3,r3,2
        cmpldi  r3,hcall_real_table_end - hcall_real_table
        bge     guest_exit_cont
@@ -2561,6 +2466,7 @@ hcall_real_table:
 hcall_real_table_end:
 
 _GLOBAL(kvmppc_h_set_xdabr)
+EXPORT_SYMBOL_GPL(kvmppc_h_set_xdabr)
        andi.   r0, r5, DABRX_USER | DABRX_KERNEL
        beq     6f
        li      r0, DABRX_USER | DABRX_KERNEL | DABRX_BTI
@@ -2570,6 +2476,7 @@ _GLOBAL(kvmppc_h_set_xdabr)
        blr
 
 _GLOBAL(kvmppc_h_set_dabr)
+EXPORT_SYMBOL_GPL(kvmppc_h_set_dabr)
        li      r5, DABRX_USER | DABRX_KERNEL
 3:
 BEGIN_FTR_SECTION
@@ -2682,11 +2589,13 @@ BEGIN_FTR_SECTION
        b       91f
 END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
        /*
-        * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+        * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
         */
        ld      r3, HSTATE_KVM_VCPU(r13)
        ld      r4, VCPU_MSR(r3)
+       li      r5, 0                   /* don't preserve non-vol regs */
        bl      kvmppc_save_tm_hv
+       nop
 91:
 #endif
 
@@ -2802,11 +2711,13 @@ BEGIN_FTR_SECTION
        b       91f
 END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
        /*
-        * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+        * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
         */
        mr      r3, r4
        ld      r4, VCPU_MSR(r3)
+       li      r5, 0                   /* don't preserve non-vol regs */
        bl      kvmppc_restore_tm_hv
+       nop
        ld      r4, HSTATE_KVM_VCPU(r13)
 91:
 #endif
@@ -2874,13 +2785,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
        mr      r9, r4
        cmpdi   r3, 0
        bgt     guest_exit_cont
-
-       /* see if any other thread is already exiting */
-       lwz     r0,VCORE_ENTRY_EXIT(r5)
-       cmpwi   r0,0x100
-       bge     guest_exit_cont
-
-       b       kvmppc_cede_reentry     /* if not go back to guest */
+       b       maybe_reenter_guest
 
        /* cede when already previously prodded case */
 kvm_cede_prodded:
@@ -2947,12 +2852,12 @@ machine_check_realmode:
         */
        ld      r11, VCPU_MSR(r9)
        rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */
-       bne     mc_cont                 /* if so, exit to host */
+       bne     guest_exit_cont         /* if so, exit to host */
        /* Check if guest is capable of handling NMI exit */
        ld      r10, VCPU_KVM(r9)
        lbz     r10, KVM_FWNMI(r10)
        cmpdi   r10, 1                  /* FWNMI capable? */
-       beq     mc_cont                 /* if so, exit with KVM_EXIT_NMI. */
+       beq     guest_exit_cont         /* if so, exit with KVM_EXIT_NMI. */
 
        /* if not, fall through for backward compatibility. */
        andi.   r10, r11, MSR_RI        /* check for unrecoverable exception */
@@ -2966,6 +2871,21 @@ machine_check_realmode:
 2:     b       fast_interrupt_c_return
 
 /*
+ * Call C code to handle a HMI in real mode.
+ * Only the primary thread does the call, secondary threads are handled
+ * by calling hmi_exception_realmode() after kvmppc_hv_entry returns.
+ * r9 points to the vcpu on entry
+ */
+hmi_realmode:
+       lbz     r0, HSTATE_PTID(r13)
+       cmpwi   r0, 0
+       bne     guest_exit_cont
+       bl      kvmppc_realmode_hmi_handler
+       ld      r9, HSTATE_KVM_VCPU(r13)
+       li      r12, BOOK3S_INTERRUPT_HMI
+       b       guest_exit_cont
+
+/*
  * Check the reason we woke from nap, and take appropriate action.
  * Returns (in r3):
  *     0 if nothing needs to be done
@@ -3130,10 +3050,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
  * Save transactional state and TM-related registers.
  * Called with r3 pointing to the vcpu struct and r4 containing
  * the guest MSR value.
- * This can modify all checkpointed registers, but
+ * r5 is non-zero iff non-volatile register state needs to be maintained.
+ * If r5 == 0, this can modify all checkpointed registers, but
  * restores r1 and r2 before exit.
  */
-kvmppc_save_tm_hv:
+_GLOBAL_TOC(kvmppc_save_tm_hv)
+EXPORT_SYMBOL_GPL(kvmppc_save_tm_hv)
        /* See if we need to handle fake suspend mode */
 BEGIN_FTR_SECTION
        b       __kvmppc_save_tm
@@ -3161,12 +3083,6 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
        nop
 
-       std     r1, HSTATE_HOST_R1(r13)
-
-       /* Clear the MSR RI since r1, r13 may be foobar. */
-       li      r5, 0
-       mtmsrd  r5, 1
-
        /* We have to treclaim here because that's the only way to do S->N */
        li      r3, TM_CAUSE_KVM_RESCHED
        TRECLAIM(R3)
@@ -3175,22 +3091,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
         * We were in fake suspend, so we are not going to save the
         * register state as the guest checkpointed state (since
         * we already have it), therefore we can now use any volatile GPR.
+        * In fact treclaim in fake suspend state doesn't modify
+        * any registers.
         */
-       /* Reload PACA pointer, stack pointer and TOC. */
-       GET_PACA(r13)
-       ld      r1, HSTATE_HOST_R1(r13)
-       ld      r2, PACATOC(r13)
 
-       /* Set MSR RI now we have r1 and r13 back. */
-       li      r5, MSR_RI
-       mtmsrd  r5, 1
-
-       HMT_MEDIUM
-       ld      r6, HSTATE_DSCR(r13)
-       mtspr   SPRN_DSCR, r6
-BEGIN_FTR_SECTION_NESTED(96)
+BEGIN_FTR_SECTION
        bl      pnv_power9_force_smt4_release
-END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
+END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
        nop
 
 4:
@@ -3216,10 +3123,12 @@ END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
  * Restore transactional state and TM-related registers.
  * Called with r3 pointing to the vcpu struct
  * and r4 containing the guest MSR value.
+ * r5 is non-zero iff non-volatile register state needs to be maintained.
  * This potentially modifies all checkpointed registers.
  * It restores r1 and r2 from the PACA.
  */
-kvmppc_restore_tm_hv:
+_GLOBAL_TOC(kvmppc_restore_tm_hv)
+EXPORT_SYMBOL_GPL(kvmppc_restore_tm_hv)
        /*
         * If we are doing TM emulation for the guest on a POWER9 DD2,
         * then we don't actually do a trechkpt -- we either set up
@@ -3424,6 +3333,194 @@ kvmppc_msr_interrupt:
        blr
 
 /*
+ * Load up guest PMU state.  R3 points to the vcpu struct.
+ */
+_GLOBAL(kvmhv_load_guest_pmu)
+EXPORT_SYMBOL_GPL(kvmhv_load_guest_pmu)
+       mr      r4, r3
+       mflr    r0
+       li      r3, 1
+       sldi    r3, r3, 31              /* MMCR0_FC (freeze counters) bit */
+       mtspr   SPRN_MMCR0, r3          /* freeze all counters, disable ints */
+       isync
+BEGIN_FTR_SECTION
+       ld      r3, VCPU_MMCR(r4)
+       andi.   r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
+       cmpwi   r5, MMCR0_PMAO
+       beql    kvmppc_fix_pmao
+END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
+       lwz     r3, VCPU_PMC(r4)        /* always load up guest PMU registers */
+       lwz     r5, VCPU_PMC + 4(r4)    /* to prevent information leak */
+       lwz     r6, VCPU_PMC + 8(r4)
+       lwz     r7, VCPU_PMC + 12(r4)
+       lwz     r8, VCPU_PMC + 16(r4)
+       lwz     r9, VCPU_PMC + 20(r4)
+       mtspr   SPRN_PMC1, r3
+       mtspr   SPRN_PMC2, r5
+       mtspr   SPRN_PMC3, r6
+       mtspr   SPRN_PMC4, r7
+       mtspr   SPRN_PMC5, r8
+       mtspr   SPRN_PMC6, r9
+       ld      r3, VCPU_MMCR(r4)
+       ld      r5, VCPU_MMCR + 8(r4)
+       ld      r6, VCPU_MMCR + 16(r4)
+       ld      r7, VCPU_SIAR(r4)
+       ld      r8, VCPU_SDAR(r4)
+       mtspr   SPRN_MMCR1, r5
+       mtspr   SPRN_MMCRA, r6
+       mtspr   SPRN_SIAR, r7
+       mtspr   SPRN_SDAR, r8
+BEGIN_FTR_SECTION
+       ld      r5, VCPU_MMCR + 24(r4)
+       ld      r6, VCPU_SIER(r4)
+       mtspr   SPRN_MMCR2, r5
+       mtspr   SPRN_SIER, r6
+BEGIN_FTR_SECTION_NESTED(96)
+       lwz     r7, VCPU_PMC + 24(r4)
+       lwz     r8, VCPU_PMC + 28(r4)
+       ld      r9, VCPU_MMCR + 32(r4)
+       mtspr   SPRN_SPMC1, r7
+       mtspr   SPRN_SPMC2, r8
+       mtspr   SPRN_MMCRS, r9
+END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+       mtspr   SPRN_MMCR0, r3
+       isync
+       mtlr    r0
+       blr
+
+/*
+ * Reload host PMU state saved in the PACA by kvmhv_save_host_pmu.
+ */
+_GLOBAL(kvmhv_load_host_pmu)
+EXPORT_SYMBOL_GPL(kvmhv_load_host_pmu)
+       mflr    r0
+       lbz     r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */
+       cmpwi   r4, 0
+       beq     23f                     /* skip if not */
+BEGIN_FTR_SECTION
+       ld      r3, HSTATE_MMCR0(r13)
+       andi.   r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
+       cmpwi   r4, MMCR0_PMAO
+       beql    kvmppc_fix_pmao
+END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
+       lwz     r3, HSTATE_PMC1(r13)
+       lwz     r4, HSTATE_PMC2(r13)
+       lwz     r5, HSTATE_PMC3(r13)
+       lwz     r6, HSTATE_PMC4(r13)
+       lwz     r8, HSTATE_PMC5(r13)
+       lwz     r9, HSTATE_PMC6(r13)
+       mtspr   SPRN_PMC1, r3
+       mtspr   SPRN_PMC2, r4
+       mtspr   SPRN_PMC3, r5
+       mtspr   SPRN_PMC4, r6
+       mtspr   SPRN_PMC5, r8
+       mtspr   SPRN_PMC6, r9
+       ld      r3, HSTATE_MMCR0(r13)
+       ld      r4, HSTATE_MMCR1(r13)
+       ld      r5, HSTATE_MMCRA(r13)
+       ld      r6, HSTATE_SIAR(r13)
+       ld      r7, HSTATE_SDAR(r13)
+       mtspr   SPRN_MMCR1, r4
+       mtspr   SPRN_MMCRA, r5
+       mtspr   SPRN_SIAR, r6
+       mtspr   SPRN_SDAR, r7
+BEGIN_FTR_SECTION
+       ld      r8, HSTATE_MMCR2(r13)
+       ld      r9, HSTATE_SIER(r13)
+       mtspr   SPRN_MMCR2, r8
+       mtspr   SPRN_SIER, r9
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+       mtspr   SPRN_MMCR0, r3
+       isync
+       mtlr    r0
+23:    blr
+
+/*
+ * Save guest PMU state into the vcpu struct.
+ * r3 = vcpu, r4 = full save flag (PMU in use flag set in VPA)
+ */
+_GLOBAL(kvmhv_save_guest_pmu)
+EXPORT_SYMBOL_GPL(kvmhv_save_guest_pmu)
+       mr      r9, r3
+       mr      r8, r4
+BEGIN_FTR_SECTION
+       /*
+        * POWER8 seems to have a hardware bug where setting
+        * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE]
+        * when some counters are already negative does not cause
+        * a performance monitor alert (and hence interrupt).
+        * The effect of this is that when saving the PMU state,
+        * if there is no PMU alert pending when we read MMCR0
+        * before freezing the counters, but one becomes pending
+        * before we read the counters, we lose it.
+        * To work around this, we need a way to freeze the counters
+        * before reading MMCR0.  Normally, freezing the counters
+        * is done by writing MMCR0 (to set MMCR0[FC]) which
+        * unavoidably writes MMCR0[PMAO] as well.  On POWER8,
+        * we can also freeze the counters using MMCR2, by writing
+        * 1s to all the counter freeze condition bits (there are
+        * 9 bits each for 6 counters).
+        */
+       li      r3, -1                  /* set all freeze bits */
+       clrrdi  r3, r3, 10
+       mfspr   r10, SPRN_MMCR2
+       mtspr   SPRN_MMCR2, r3
+       isync
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+       li      r3, 1
+       sldi    r3, r3, 31              /* MMCR0_FC (freeze counters) bit */
+       mfspr   r4, SPRN_MMCR0          /* save MMCR0 */
+       mtspr   SPRN_MMCR0, r3          /* freeze all counters, disable ints */
+       mfspr   r6, SPRN_MMCRA
+       /* Clear MMCRA in order to disable SDAR updates */
+       li      r7, 0
+       mtspr   SPRN_MMCRA, r7
+       isync
+       cmpwi   r8, 0                   /* did they ask for PMU stuff to be saved? */
+       bne     21f
+       std     r3, VCPU_MMCR(r9)       /* if not, set saved MMCR0 to FC */
+       b       22f
+21:    mfspr   r5, SPRN_MMCR1
+       mfspr   r7, SPRN_SIAR
+       mfspr   r8, SPRN_SDAR
+       std     r4, VCPU_MMCR(r9)
+       std     r5, VCPU_MMCR + 8(r9)
+       std     r6, VCPU_MMCR + 16(r9)
+BEGIN_FTR_SECTION
+       std     r10, VCPU_MMCR + 24(r9)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+       std     r7, VCPU_SIAR(r9)
+       std     r8, VCPU_SDAR(r9)
+       mfspr   r3, SPRN_PMC1
+       mfspr   r4, SPRN_PMC2
+       mfspr   r5, SPRN_PMC3
+       mfspr   r6, SPRN_PMC4
+       mfspr   r7, SPRN_PMC5
+       mfspr   r8, SPRN_PMC6
+       stw     r3, VCPU_PMC(r9)
+       stw     r4, VCPU_PMC + 4(r9)
+       stw     r5, VCPU_PMC + 8(r9)
+       stw     r6, VCPU_PMC + 12(r9)
+       stw     r7, VCPU_PMC + 16(r9)
+       stw     r8, VCPU_PMC + 20(r9)
+BEGIN_FTR_SECTION
+       mfspr   r5, SPRN_SIER
+       std     r5, VCPU_SIER(r9)
+BEGIN_FTR_SECTION_NESTED(96)
+       mfspr   r6, SPRN_SPMC1
+       mfspr   r7, SPRN_SPMC2
+       mfspr   r8, SPRN_MMCRS
+       stw     r6, VCPU_PMC + 24(r9)
+       stw     r7, VCPU_PMC + 28(r9)
+       std     r8, VCPU_MMCR + 32(r9)
+       lis     r4, 0x8000
+       mtspr   SPRN_MMCRS, r4
+END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+22:    blr
+
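The MMCR2 trick described in the comment above boils down to: freeze through MMCR2 first, then write MMCR0, so no alert can slip in between reading MMCR0 and reading the counters. As a hedged C-style sketch (mfspr/mtspr/isync here are illustrative wrappers around the asm, not a kernel API):

    /* POWER8-only save-sequence sketch: freeze via MMCR2 before MMCR0
     * so a late-arriving performance monitor alert is not lost. */
    u64 saved_mmcr2 = mfspr(SPRN_MMCR2);
    mtspr(SPRN_MMCR2, ~0ULL << 10);         /* set every freeze field */
    isync();
    u64 saved_mmcr0 = mfspr(SPRN_MMCR0);    /* PMAO is stable now */
    mtspr(SPRN_MMCR0, MMCR0_FC);            /* freeze + disable PM ints */
    isync();
    /* ... then read PMC1-PMC6, MMCR1, SIAR, SDAR into the vcpu ... */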
+/*
  * This works around a hardware bug on POWER8E processors, where
  * writing a 1 to the MMCR0[PMAO] bit doesn't generate a
  * performance monitor interrupt.  Instead, when we need to have
index 0082850..888e260 100644 (file)
@@ -130,7 +130,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
                        return RESUME_GUEST;
                }
                /* Set CR0 to indicate previous transactional state */
-               vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) |
+               vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
                        (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
                /* L=1 => tresume, L=0 => tsuspend */
                if (instr & (1 << 21)) {
@@ -174,7 +174,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
                copy_from_checkpoint(vcpu);
 
                /* Set CR0 to indicate previous transactional state */
-               vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) |
+               vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
                        (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
                vcpu->arch.shregs.msr &= ~MSR_TS_MASK;
                return RESUME_GUEST;
@@ -204,7 +204,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
                copy_to_checkpoint(vcpu);
 
                /* Set CR0 to indicate previous transactional state */
-               vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) |
+               vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
                        (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
                vcpu->arch.shregs.msr = msr | MSR_TS_S;
                return RESUME_GUEST;
index b2c7c6f..3cf5863 100644 (file)
@@ -89,7 +89,8 @@ int kvmhv_p9_tm_emulation_early(struct kvm_vcpu *vcpu)
                if (instr & (1 << 21))
                        vcpu->arch.shregs.msr = (msr & ~MSR_TS_MASK) | MSR_TS_T;
                /* Set CR0 to 0b0010 */
-               vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 0x20000000;
+               vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
+                       0x20000000;
                return 1;
        }
 
@@ -105,5 +106,5 @@ void kvmhv_emulate_tm_rollback(struct kvm_vcpu *vcpu)
        vcpu->arch.shregs.msr &= ~MSR_TS_MASK;  /* go to N state */
        vcpu->arch.regs.nip = vcpu->arch.tfhar;
        copy_from_checkpoint(vcpu);
-       vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 0xa0000000;
+       vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | 0xa0000000;
 }
index 614ebb4..4efd65d 100644 (file)
@@ -167,7 +167,7 @@ void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu)
        svcpu->gpr[11] = vcpu->arch.regs.gpr[11];
        svcpu->gpr[12] = vcpu->arch.regs.gpr[12];
        svcpu->gpr[13] = vcpu->arch.regs.gpr[13];
-       svcpu->cr  = vcpu->arch.cr;
+       svcpu->cr  = vcpu->arch.regs.ccr;
        svcpu->xer = vcpu->arch.regs.xer;
        svcpu->ctr = vcpu->arch.regs.ctr;
        svcpu->lr  = vcpu->arch.regs.link;
@@ -249,7 +249,7 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu)
        vcpu->arch.regs.gpr[11] = svcpu->gpr[11];
        vcpu->arch.regs.gpr[12] = svcpu->gpr[12];
        vcpu->arch.regs.gpr[13] = svcpu->gpr[13];
-       vcpu->arch.cr  = svcpu->cr;
+       vcpu->arch.regs.ccr  = svcpu->cr;
        vcpu->arch.regs.xer = svcpu->xer;
        vcpu->arch.regs.ctr = svcpu->ctr;
        vcpu->arch.regs.link  = svcpu->lr;
@@ -1246,7 +1246,6 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
                r = RESUME_GUEST;
                break;
        case BOOK3S_INTERRUPT_EXTERNAL:
-       case BOOK3S_INTERRUPT_EXTERNAL_LEVEL:
        case BOOK3S_INTERRUPT_EXTERNAL_HV:
        case BOOK3S_INTERRUPT_H_VIRT:
                vcpu->stat.ext_intr_exits++;
index b8356cd..b0b2bfc 100644 (file)
@@ -310,7 +310,7 @@ static inline bool icp_try_update(struct kvmppc_icp *icp,
         */
        if (new.out_ee) {
                kvmppc_book3s_queue_irqprio(icp->vcpu,
-                                           BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+                                           BOOK3S_INTERRUPT_EXTERNAL);
                if (!change_self)
                        kvmppc_fast_vcpu_kick(icp->vcpu);
        }
@@ -593,8 +593,7 @@ static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu)
        u32 xirr;
 
        /* First, remove EE from the processor */
-       kvmppc_book3s_dequeue_irqprio(icp->vcpu,
-                                     BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+       kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
 
        /*
         * ICP State: Accept_Interrupt
@@ -754,8 +753,7 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
         * We can remove EE from the current processor, the update
         * transaction will set it again if needed
         */
-       kvmppc_book3s_dequeue_irqprio(icp->vcpu,
-                                     BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+       kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
 
        do {
                old_state = new_state = READ_ONCE(icp->state);
@@ -1167,8 +1165,7 @@ int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
         * Deassert the CPU interrupt request.
         * icp_try_update will reassert it if necessary.
         */
-       kvmppc_book3s_dequeue_irqprio(icp->vcpu,
-                                     BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+       kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
 
        /*
         * Note that if we displace an interrupt from old_state.xisr,
@@ -1393,7 +1390,8 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
        }
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-       if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+       if (cpu_has_feature(CPU_FTR_ARCH_206) &&
+           cpu_has_feature(CPU_FTR_HVMODE)) {
                /* Enable real mode support */
                xics->real_mode = ENABLE_REALMODE;
                xics->real_mode_dbg = DEBUG_REALMODE;
index 30c2eb7..ad4a370 100644 (file)
 #define XIVE_Q_GAP     2
 
 /*
+ * Push a vcpu's context to the XIVE on guest entry.
+ * This assumes we are in virtual mode (MMU on)
+ */
+void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
+{
+       void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
+       u64 pq;
+
+       if (!tima)
+               return;
+       eieio();
+       __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS);
+       __raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2);
+       vcpu->arch.xive_pushed = 1;
+       eieio();
+
+       /*
+        * We clear the irq_pending flag. There is a small chance of a
+        * race vs. the escalation interrupt happening on another
+        * processor setting it again, but the only consequence is to
+        * cause a spurious wakeup on the next H_CEDE, which is not an
+        * issue.
+        */
+       vcpu->arch.irq_pending = 0;
+
+       /*
+        * In single escalation mode, if the escalation interrupt is
+        * on, we mask it.
+        */
+       if (vcpu->arch.xive_esc_on) {
+               pq = __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
+                                                 XIVE_ESB_SET_PQ_01));
+               mb();
+
+               /*
+                * We have a possible subtle race here: The escalation
+                * interrupt might have fired and be on its way to the
+                * host queue while we mask it, and if we unmask it
+                * early enough (re-cede right away), there is a
+                * theoretical possibility that it fires again, thus
+                * landing in the target queue more than once which is
+                * a big no-no.
+                *
+                * Fortunately, solving this is rather easy. If the
+                * above load setting PQ to 01 returns a previous
+                * value where P is set, then we know the escalation
+                * interrupt is somewhere on its way to the host. In
+                * that case we simply don't clear the xive_esc_on
+                * flag below. It will be eventually cleared by the
+                * handler for the escalation interrupt.
+                *
+                * Then, when doing a cede, we check that flag again
+                * before re-enabling the escalation interrupt, and if
+                * set, we abort the cede.
+                */
+               if (!(pq & XIVE_ESB_VAL_P))
+                       /* Now P is 0, we can clear the flag */
+                       vcpu->arch.xive_esc_on = 0;
+       }
+}
+EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu);
+
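The P-bit dance above is one half of a two-sided protocol on xive_esc_on; the other half runs at cede time. A hedged sketch of that side (field names come from the code above, the control flow and the re-arm step are illustrative):

    /* Cede-side sketch: re-arm the escalation interrupt only if it is
     * not already in flight to the host queue. */
    if (vcpu->arch.xive_esc_on) {
            /* escalation fired while pushed: abort the cede */
            return H_TOO_HARD;                      /* illustrative */
    }
    /* safe to unmask: set PQ back to 00 via the ESB load */
    __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
                                 XIVE_ESB_SET_PQ_00));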
+/*
  * This is a simple trigger for a generic XIVE IRQ. This must
  * only be called for interrupts that support a trigger page
  */
index 4171ede..033363d 100644 (file)
@@ -280,14 +280,6 @@ X_STATIC unsigned long GLUE(X_PFX,h_xirr)(struct kvm_vcpu *vcpu)
        /* First collect pending bits from HW */
        GLUE(X_PFX,ack_pending)(xc);
 
-       /*
-        * Cleanup the old-style bits if needed (they may have been
-        * set by pull or an escalation interrupts).
-        */
-       if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions))
-               clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL,
-                         &vcpu->arch.pending_exceptions);
-
        pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n",
                 xc->pending, xc->hw_cppr, xc->cppr);
 
index 81bd8a0..051af7d 100644 (file)
         */
        PPC_LL  r4, PACACURRENT(r13)
        PPC_LL  r4, (THREAD + THREAD_KVM_VCPU)(r4)
-       stw     r10, VCPU_CR(r4)
+       PPC_STL r10, VCPU_CR(r4)
        PPC_STL r11, VCPU_GPR(R4)(r4)
        PPC_STL r5, VCPU_GPR(R5)(r4)
        PPC_STL r6, VCPU_GPR(R6)(r4)
@@ -292,7 +292,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
        PPC_STL r4, VCPU_GPR(R4)(r11)
        PPC_LL  r4, THREAD_NORMSAVE(0)(r10)
        PPC_STL r5, VCPU_GPR(R5)(r11)
-       stw     r13, VCPU_CR(r11)
+       PPC_STL r13, VCPU_CR(r11)
        mfspr   r5, \srr0
        PPC_STL r3, VCPU_GPR(R10)(r11)
        PPC_LL  r3, THREAD_NORMSAVE(2)(r10)
@@ -319,7 +319,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
        PPC_STL r4, VCPU_GPR(R4)(r11)
        PPC_LL  r4, GPR9(r8)
        PPC_STL r5, VCPU_GPR(R5)(r11)
-       stw     r9, VCPU_CR(r11)
+       PPC_STL r9, VCPU_CR(r11)
        mfspr   r5, \srr0
        PPC_STL r3, VCPU_GPR(R8)(r11)
        PPC_LL  r3, GPR10(r8)
@@ -643,7 +643,7 @@ lightweight_exit:
        PPC_LL  r3, VCPU_LR(r4)
        PPC_LL  r5, VCPU_XER(r4)
        PPC_LL  r6, VCPU_CTR(r4)
-       lwz     r7, VCPU_CR(r4)
+       PPC_LL  r7, VCPU_CR(r4)
        PPC_LL  r8, VCPU_PC(r4)
        PPC_LD(r9, VCPU_SHARED_MSR, r11)
        PPC_LL  r0, VCPU_GPR(R0)(r4)
index 75dce1e..f91b130 100644 (file)
@@ -117,7 +117,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 
        emulated = EMULATE_FAIL;
        vcpu->arch.regs.msr = vcpu->arch.shared->msr;
-       vcpu->arch.regs.ccr = vcpu->arch.cr;
        if (analyse_instr(&op, &vcpu->arch.regs, inst) == 0) {
                int type = op.type & INSTR_TYPE_MASK;
                int size = GETSIZE(op.type);
index eba5756..2869a29 100644 (file)
@@ -594,7 +594,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                r = !!(hv_enabled && radix_enabled());
                break;
        case KVM_CAP_PPC_MMU_HASH_V3:
-               r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300));
+               r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300) &&
+                      cpu_has_feature(CPU_FTR_HVMODE));
+               break;
+       case KVM_CAP_PPC_NESTED_HV:
+               r = !!(hv_enabled && kvmppc_hv_ops->enable_nested &&
+                      !kvmppc_hv_ops->enable_nested(NULL));
                break;
 #endif
        case KVM_CAP_SYNC_MMU:
@@ -2114,6 +2119,14 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
                        r = kvm->arch.kvm_ops->set_smt_mode(kvm, mode, flags);
                break;
        }
+
+       case KVM_CAP_PPC_NESTED_HV:
+               r = -EINVAL;
+               if (!is_kvmppc_hv_enabled(kvm) ||
+                   !kvm->arch.kvm_ops->enable_nested)
+                       break;
+               r = kvm->arch.kvm_ops->enable_nested(kvm);
+               break;
 #endif
        default:
                r = -EINVAL;
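Taken together with the check-extension hunk above, userspace can probe for nested HV and then opt in per VM. A minimal sketch of that flow (error handling trimmed; vm_fd is assumed to be an existing KVM VM file descriptor):

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Sketch: probe for and enable KVM_CAP_PPC_NESTED_HV on a VM fd. */
    static int enable_nested_hv(int vm_fd)
    {
            struct kvm_enable_cap cap = { .cap = KVM_CAP_PPC_NESTED_HV };

            if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_NESTED_HV) <= 0)
                    return -1;      /* not supported on this host/kernel */
            return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
    }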
index 90e330f..0531a14 100644 (file)
  * Save transactional state and TM-related registers.
  * Called with:
  * - r3 pointing to the vcpu struct
- * - r4 points to the MSR with current TS bits:
+ * - r4 containing the MSR with current TS bits:
  *     (For HV KVM, it is VCPU_MSR ; For PR KVM, it is host MSR).
- * This can modify all checkpointed registers, but
- * restores r1, r2 before exit.
+ * - r5 containing a flag indicating that non-volatile registers
+ *     must be preserved.
+ * If r5 == 0, this can modify all checkpointed registers, but
+ * restores r1, r2 before exit.  If r5 != 0, this restores the
+ * MSR TM/FP/VEC/VSX bits to their state on entry.
  */
 _GLOBAL(__kvmppc_save_tm)
        mflr    r0
        std     r0, PPC_LR_STKOFF(r1)
+       stdu    r1, -SWITCH_FRAME_SIZE(r1)
+
+       mr      r9, r3
+       cmpdi   cr7, r5, 0
 
        /* Turn on TM. */
        mfmsr   r8
+       mr      r10, r8
        li      r0, 1
        rldimi  r8, r0, MSR_TM_LG, 63-MSR_TM_LG
        ori     r8, r8, MSR_FP
@@ -51,6 +59,27 @@ _GLOBAL(__kvmppc_save_tm)
        std     r1, HSTATE_SCRATCH2(r13)
        std     r3, HSTATE_SCRATCH1(r13)
 
+       /* Save CR on the stack - even if r5 == 0 we need to get cr7 back. */
+       mfcr    r6
+       SAVE_GPR(6, r1)
+
+       /* Save DSCR so we can restore it to avoid running with user value */
+       mfspr   r7, SPRN_DSCR
+       SAVE_GPR(7, r1)
+
+       /*
+        * We are going to do treclaim., which will modify all checkpointed
+        * registers.  Save the non-volatile registers on the stack if
+        * preservation of non-volatile state has been requested.
+        */
+       beq     cr7, 3f
+       SAVE_NVGPRS(r1)
+
+       /* MSR[TS] will be 0 (non-transactional) once we do treclaim. */
+       li      r0, 0
+       rldimi  r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+       SAVE_GPR(10, r1)        /* final MSR value */
+3:
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 BEGIN_FTR_SECTION
        /* Emulation of the treclaim instruction needs TEXASR before treclaim */
@@ -74,22 +103,25 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
        std     r9, PACATMSCRATCH(r13)
        ld      r9, HSTATE_SCRATCH1(r13)
 
-       /* Get a few more GPRs free. */
-       std     r29, VCPU_GPRS_TM(29)(r9)
-       std     r30, VCPU_GPRS_TM(30)(r9)
-       std     r31, VCPU_GPRS_TM(31)(r9)
-
-       /* Save away PPR and DSCR soon so don't run with user values. */
-       mfspr   r31, SPRN_PPR
+       /* Save away PPR soon so we don't run with user value. */
+       std     r0, VCPU_GPRS_TM(0)(r9)
+       mfspr   r0, SPRN_PPR
        HMT_MEDIUM
-       mfspr   r30, SPRN_DSCR
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-       ld      r29, HSTATE_DSCR(r13)
-       mtspr   SPRN_DSCR, r29
-#endif
 
-       /* Save all but r9, r13 & r29-r31 */
-       reg = 0
+       /* Reload stack pointer. */
+       std     r1, VCPU_GPRS_TM(1)(r9)
+       ld      r1, HSTATE_SCRATCH2(r13)
+
+       /* Set MSR RI now we have r1 and r13 back. */
+       std     r2, VCPU_GPRS_TM(2)(r9)
+       li      r2, MSR_RI
+       mtmsrd  r2, 1
+
+       /* Reload TOC pointer. */
+       ld      r2, PACATOC(r13)
+
+       /* Save all but r0-r2, r9 & r13 */
+       reg = 3
        .rept   29
        .if (reg != 9) && (reg != 13)
        std     reg, VCPU_GPRS_TM(reg)(r9)
@@ -103,33 +135,29 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
        ld      r4, PACATMSCRATCH(r13)
        std     r4, VCPU_GPRS_TM(9)(r9)
 
-       /* Reload stack pointer and TOC. */
-       ld      r1, HSTATE_SCRATCH2(r13)
-       ld      r2, PACATOC(r13)
-
-       /* Set MSR RI now we have r1 and r13 back. */
-       li      r5, MSR_RI
-       mtmsrd  r5, 1
+       /* Restore host DSCR and CR values, after saving guest values */
+       mfcr    r6
+       mfspr   r7, SPRN_DSCR
+       stw     r6, VCPU_CR_TM(r9)
+       std     r7, VCPU_DSCR_TM(r9)
+       REST_GPR(6, r1)
+       REST_GPR(7, r1)
+       mtcr    r6
+       mtspr   SPRN_DSCR, r7
 
-       /* Save away checkpinted SPRs. */
-       std     r31, VCPU_PPR_TM(r9)
-       std     r30, VCPU_DSCR_TM(r9)
+       /* Save away checkpointed SPRs. */
+       std     r0, VCPU_PPR_TM(r9)
        mflr    r5
-       mfcr    r6
        mfctr   r7
        mfspr   r8, SPRN_AMR
        mfspr   r10, SPRN_TAR
        mfxer   r11
        std     r5, VCPU_LR_TM(r9)
-       stw     r6, VCPU_CR_TM(r9)
        std     r7, VCPU_CTR_TM(r9)
        std     r8, VCPU_AMR_TM(r9)
        std     r10, VCPU_TAR_TM(r9)
        std     r11, VCPU_XER_TM(r9)
 
-       /* Restore r12 as trap number. */
-       lwz     r12, VCPU_TRAP(r9)
-
        /* Save FP/VSX. */
        addi    r3, r9, VCPU_FPRS_TM
        bl      store_fp_state
@@ -137,6 +165,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
        bl      store_vr_state
        mfspr   r6, SPRN_VRSAVE
        stw     r6, VCPU_VRSAVE_TM(r9)
+
+       /* Restore non-volatile registers if requested to */
+       beq     cr7, 1f
+       REST_NVGPRS(r1)
+       REST_GPR(10, r1)
 1:
        /*
         * We need to save these SPRs after the treclaim so that the software
@@ -146,12 +179,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
         */
        mfspr   r7, SPRN_TEXASR
        std     r7, VCPU_TEXASR(r9)
-11:
        mfspr   r5, SPRN_TFHAR
        mfspr   r6, SPRN_TFIAR
        std     r5, VCPU_TFHAR(r9)
        std     r6, VCPU_TFIAR(r9)
 
+       /* Restore MSR state if requested */
+       beq     cr7, 2f
+       mtmsrd  r10, 0
+2:
+       addi    r1, r1, SWITCH_FRAME_SIZE
        ld      r0, PPC_LR_STKOFF(r1)
        mtlr    r0
        blr
@@ -161,49 +198,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
  * be invoked from C function by PR KVM only.
  */
 _GLOBAL(_kvmppc_save_tm_pr)
-       mflr    r5
-       std     r5, PPC_LR_STKOFF(r1)
-       stdu    r1, -SWITCH_FRAME_SIZE(r1)
-       SAVE_NVGPRS(r1)
-
-       /* save MSR since TM/math bits might be impacted
-        * by __kvmppc_save_tm().
-        */
-       mfmsr   r5
-       SAVE_GPR(5, r1)
-
-       /* also save DSCR/CR/TAR so that it can be recovered later */
-       mfspr   r6, SPRN_DSCR
-       SAVE_GPR(6, r1)
-
-       mfcr    r7
-       stw     r7, _CCR(r1)
+       mflr    r0
+       std     r0, PPC_LR_STKOFF(r1)
+       stdu    r1, -PPC_MIN_STKFRM(r1)
 
        mfspr   r8, SPRN_TAR
-       SAVE_GPR(8, r1)
+       std     r8, PPC_MIN_STKFRM-8(r1)
 
+       li      r5, 1           /* preserve non-volatile registers */
        bl      __kvmppc_save_tm
 
-       REST_GPR(8, r1)
+       ld      r8, PPC_MIN_STKFRM-8(r1)
        mtspr   SPRN_TAR, r8
 
-       ld      r7, _CCR(r1)
-       mtcr    r7
-
-       REST_GPR(6, r1)
-       mtspr   SPRN_DSCR, r6
-
-       /* need preserve current MSR's MSR_TS bits */
-       REST_GPR(5, r1)
-       mfmsr   r6
-       rldicl  r6, r6, 64 - MSR_TS_S_LG, 62
-       rldimi  r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG
-       mtmsrd  r5
-
-       REST_NVGPRS(r1)
-       addi    r1, r1, SWITCH_FRAME_SIZE
-       ld      r5, PPC_LR_STKOFF(r1)
-       mtlr    r5
+       addi    r1, r1, PPC_MIN_STKFRM
+       ld      r0, PPC_LR_STKOFF(r1)
+       mtlr    r0
        blr
 
 EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr);
@@ -215,15 +225,21 @@ EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr);
  *  - r4 is the guest MSR with desired TS bits:
  *     For HV KVM, it is VCPU_MSR
  *     For PR KVM, it is provided by caller
- * This potentially modifies all checkpointed registers.
- * It restores r1, r2 from the PACA.
+ * - r5 containing a flag indicating that non-volatile registers
+ *     must be preserved.
+ * If r5 == 0, this potentially modifies all checkpointed registers, but
+ * restores r1, r2 from the PACA before exit.
+ * If r5 != 0, this restores the MSR TM/FP/VEC/VSX bits to their state on entry.
  */
 _GLOBAL(__kvmppc_restore_tm)
        mflr    r0
        std     r0, PPC_LR_STKOFF(r1)
 
+       cmpdi   cr7, r5, 0
+
        /* Turn on TM/FP/VSX/VMX so we can restore them. */
        mfmsr   r5
+       mr      r10, r5
        li      r6, MSR_TM >> 32
        sldi    r6, r6, 32
        or      r5, r5, r6
@@ -244,8 +260,7 @@ _GLOBAL(__kvmppc_restore_tm)
 
        mr      r5, r4
        rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
-       beqlr           /* TM not active in guest */
-       std     r1, HSTATE_SCRATCH2(r13)
+       beq     9f              /* TM not active in guest */
 
        /* Make sure the failure summary is set, otherwise we'll program check
         * when we trechkpt.  It's possible that this might have been not set
@@ -256,6 +271,26 @@ _GLOBAL(__kvmppc_restore_tm)
        mtspr   SPRN_TEXASR, r7
 
        /*
+        * Make a stack frame and save non-volatile registers if requested.
+        */
+       stdu    r1, -SWITCH_FRAME_SIZE(r1)
+       std     r1, HSTATE_SCRATCH2(r13)
+
+       mfcr    r6
+       mfspr   r7, SPRN_DSCR
+       SAVE_GPR(2, r1)
+       SAVE_GPR(6, r1)
+       SAVE_GPR(7, r1)
+
+       beq     cr7, 4f
+       SAVE_NVGPRS(r1)
+
+       /* MSR[TS] will be 1 (suspended) once we do trechkpt */
+       li      r0, 1
+       rldimi  r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+       SAVE_GPR(10, r1)        /* final MSR value */
+4:
+       /*
         * We need to load up the checkpointed state for the guest.
         * We need to do this early as it will blow away any GPRs, VSRs and
         * some SPRs.
@@ -291,8 +326,6 @@ _GLOBAL(__kvmppc_restore_tm)
        ld      r29, VCPU_DSCR_TM(r3)
        ld      r30, VCPU_PPR_TM(r3)
 
-       std     r2, PACATMSCRATCH(r13) /* Save TOC */
-
        /* Clear the MSR RI since r1, r13 are all going to be foobar. */
        li      r5, 0
        mtmsrd  r5, 1
@@ -318,18 +351,31 @@ _GLOBAL(__kvmppc_restore_tm)
        /* Now let's get back the state we need. */
        HMT_MEDIUM
        GET_PACA(r13)
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-       ld      r29, HSTATE_DSCR(r13)
-       mtspr   SPRN_DSCR, r29
-#endif
        ld      r1, HSTATE_SCRATCH2(r13)
-       ld      r2, PACATMSCRATCH(r13)
+       REST_GPR(7, r1)
+       mtspr   SPRN_DSCR, r7
 
        /* Set the MSR RI since we have our registers back. */
        li      r5, MSR_RI
        mtmsrd  r5, 1
+
+       /* Restore TOC pointer and CR */
+       REST_GPR(2, r1)
+       REST_GPR(6, r1)
+       mtcr    r6
+
+       /* Restore non-volatile registers if requested to. */
+       beq     cr7, 5f
+       REST_GPR(10, r1)
+       REST_NVGPRS(r1)
+
+5:     addi    r1, r1, SWITCH_FRAME_SIZE
        ld      r0, PPC_LR_STKOFF(r1)
        mtlr    r0
+
+9:     /* Restore MSR bits if requested */
+       beqlr   cr7
+       mtmsrd  r10, 0
        blr
 
 /*
@@ -337,47 +383,23 @@ _GLOBAL(__kvmppc_restore_tm)
  * can be invoked from C function by PR KVM only.
  */
 _GLOBAL(_kvmppc_restore_tm_pr)
-       mflr    r5
-       std     r5, PPC_LR_STKOFF(r1)
-       stdu    r1, -SWITCH_FRAME_SIZE(r1)
-       SAVE_NVGPRS(r1)
-
-       /* save MSR to avoid TM/math bits change */
-       mfmsr   r5
-       SAVE_GPR(5, r1)
-
-       /* also save DSCR/CR/TAR so that it can be recovered later */
-       mfspr   r6, SPRN_DSCR
-       SAVE_GPR(6, r1)
-
-       mfcr    r7
-       stw     r7, _CCR(r1)
+       mflr    r0
+       std     r0, PPC_LR_STKOFF(r1)
+       stdu    r1, -PPC_MIN_STKFRM(r1)
 
+       /* save TAR so that it can be recovered later */
        mfspr   r8, SPRN_TAR
-       SAVE_GPR(8, r1)
+       std     r8, PPC_MIN_STKFRM-8(r1)
 
+       li      r5, 1
        bl      __kvmppc_restore_tm
 
-       REST_GPR(8, r1)
+       ld      r8, PPC_MIN_STKFRM-8(r1)
        mtspr   SPRN_TAR, r8
 
-       ld      r7, _CCR(r1)
-       mtcr    r7
-
-       REST_GPR(6, r1)
-       mtspr   SPRN_DSCR, r6
-
-       /* need preserve current MSR's MSR_TS bits */
-       REST_GPR(5, r1)
-       mfmsr   r6
-       rldicl  r6, r6, 64 - MSR_TS_S_LG, 62
-       rldimi  r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG
-       mtmsrd  r5
-
-       REST_NVGPRS(r1)
-       addi    r1, r1, SWITCH_FRAME_SIZE
-       ld      r5, PPC_LR_STKOFF(r1)
-       mtlr    r5
+       addi    r1, r1, PPC_MIN_STKFRM
+       ld      r0, PPC_LR_STKOFF(r1)
+       mtlr    r0
        blr
 
 EXPORT_SYMBOL_GPL(_kvmppc_restore_tm_pr);
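With the new r5 argument, the save/restore entry points now carry a preserve-non-volatiles flag; the PR wrappers above always pass 1 so they remain safely callable from C. The C-visible prototypes are, roughly (treat the exact header location as an assumption):

    /* C-callable PR-KVM wrappers (declared in asm/asm-prototypes.h);
     * both pass r5 = 1 internally so non-volatile state is preserved. */
    void _kvmppc_save_tm_pr(struct kvm_vcpu *vcpu, u64 guest_msr);
    void _kvmppc_restore_tm_pr(struct kvm_vcpu *vcpu, u64 guest_msr);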
index f3b2375..372a82f 100644 (file)
@@ -14,7 +14,6 @@
        {0x400, "INST_STORAGE"}, \
        {0x480, "INST_SEGMENT"}, \
        {0x500, "EXTERNAL"}, \
-       {0x501, "EXTERNAL_LEVEL"}, \
        {0x502, "EXTERNAL_HV"}, \
        {0x600, "ALIGNMENT"}, \
        {0x700, "PROGRAM"}, \
index 51ce091..7a9886f 100644 (file)
@@ -308,55 +308,6 @@ void register_page_bootmem_memmap(unsigned long section_nr,
 {
 }
 
-/*
- * We do not have access to the sparsemem vmemmap, so we fallback to
- * walking the list of sparsemem blocks which we already maintain for
- * the sake of crashdump. In the long run, we might want to maintain
- * a tree if performance of that linear walk becomes a problem.
- *
- * realmode_pfn_to_page functions can fail due to:
- * 1) As real sparsemem blocks do not lay in RAM continously (they
- * are in virtual address space which is not available in the real mode),
- * the requested page struct can be split between blocks so get_page/put_page
- * may fail.
- * 2) When huge pages are used, the get_page/put_page API will fail
- * in real mode as the linked addresses in the page struct are virtual
- * too.
- */
-struct page *realmode_pfn_to_page(unsigned long pfn)
-{
-       struct vmemmap_backing *vmem_back;
-       struct page *page;
-       unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
-       unsigned long pg_va = (unsigned long) pfn_to_page(pfn);
-
-       for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) {
-               if (pg_va < vmem_back->virt_addr)
-                       continue;
-
-               /* After vmemmap_list entry free is possible, need check all */
-               if ((pg_va + sizeof(struct page)) <=
-                               (vmem_back->virt_addr + page_size)) {
-                       page = (struct page *) (vmem_back->phys + pg_va -
-                               vmem_back->virt_addr);
-                       return page;
-               }
-       }
-
-       /* Probably that page struct is split between real pages */
-       return NULL;
-}
-EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
-
-#else
-
-struct page *realmode_pfn_to_page(unsigned long pfn)
-{
-       struct page *page = pfn_to_page(pfn);
-       return page;
-}
-EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
-
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
 #ifdef CONFIG_PPC_BOOK3S_64
index c9ee9e2..56c2234 100644 (file)
 #include <linux/migrate.h>
 #include <linux/hugetlb.h>
 #include <linux/swap.h>
+#include <linux/sizes.h>
 #include <asm/mmu_context.h>
 #include <asm/pte-walk.h>
 
 static DEFINE_MUTEX(mem_list_mutex);
 
+#define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY        0x1
+#define MM_IOMMU_TABLE_GROUP_PAGE_MASK ~(SZ_4K - 1)
+
 struct mm_iommu_table_group_mem_t {
        struct list_head next;
        struct rcu_head rcu;
@@ -263,6 +267,9 @@ static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
                if (!page)
                        continue;
 
+               if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY)
+                       SetPageDirty(page);
+
                put_page(page);
                mem->hpas[i] = 0;
        }
@@ -360,7 +367,6 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm,
 
        return ret;
 }
-EXPORT_SYMBOL_GPL(mm_iommu_lookup_rm);
 
 struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
                unsigned long ua, unsigned long entries)
@@ -390,7 +396,7 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
        if (pageshift > mem->pageshift)
                return -EFAULT;
 
-       *hpa = *va | (ua & ~PAGE_MASK);
+       *hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
 
        return 0;
 }
@@ -413,11 +419,31 @@ long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
        if (!pa)
                return -EFAULT;
 
-       *hpa = *pa | (ua & ~PAGE_MASK);
+       *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
 
        return 0;
 }
-EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa_rm);
+
+extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua)
+{
+       struct mm_iommu_table_group_mem_t *mem;
+       long entry;
+       void *va;
+       unsigned long *pa;
+
+       mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE);
+       if (!mem)
+               return;
+
+       entry = (ua - mem->ua) >> PAGE_SHIFT;
+       va = &mem->hpas[entry];
+
+       pa = (void *) vmalloc_to_phys(va);
+       if (!pa)
+               return;
+
+       *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY;
+}
 
 long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
 {
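The dirty flag introduced here rides in bit 0 of each cached entry in mem->hpas[], which is why both ua_to_hpa paths now mask with MM_IOMMU_TABLE_GROUP_PAGE_MASK. A small illustrative sketch of the encoding (the helper names are hypothetical):

    /* Each hpas[] entry: | host physical address (4K aligned) | dirty | */
    static unsigned long hpa_of(unsigned long entry)
    {
            return entry & MM_IOMMU_TABLE_GROUP_PAGE_MASK;
    }

    static bool entry_dirty(unsigned long entry)
    {
            return entry & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY;
    }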
index fef3e1e..4c4dfc4 100644 (file)
@@ -833,6 +833,15 @@ EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid);
 /*
  * Flush partition scoped translations from LPID (=LPIDR)
  */
+void radix__flush_tlb_lpid(unsigned int lpid)
+{
+       _tlbie_lpid(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid);
+
+/*
+ * Flush partition scoped translations from LPID (=LPIDR), on this CPU only
+ */
 void radix__local_flush_tlb_lpid(unsigned int lpid)
 {
        _tlbiel_lpid(lpid, RIC_FLUSH_ALL);
index 2ea3e79..fe24150 100644 (file)
@@ -482,7 +482,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                break;
        case KVM_CAP_S390_HPAGE_1M:
                r = 0;
-               if (hpage)
+               if (hpage && !kvm_is_ucontrol(kvm))
                        r = 1;
                break;
        case KVM_CAP_S390_MEM_OP:
@@ -692,7 +692,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
                mutex_lock(&kvm->lock);
                if (kvm->created_vcpus)
                        r = -EBUSY;
-               else if (!hpage || kvm->arch.use_cmma)
+               else if (!hpage || kvm->arch.use_cmma || kvm_is_ucontrol(kvm))
                        r = -EINVAL;
                else {
                        r = 0;
index d4fa0a4..1e668b9 100644 (file)
@@ -708,11 +708,13 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
                vmaddr |= gaddr & ~PMD_MASK;
                /* Find vma in the parent mm */
                vma = find_vma(gmap->mm, vmaddr);
+               if (!vma)
+                       continue;
                /*
                 * We do not discard pages that are backed by
                 * hugetlbfs, so we don't have to refault them.
                 */
-               if (vma && is_vm_hugetlb_page(vma))
+               if (is_vm_hugetlb_page(vma))
                        continue;
                size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
                zap_page_range(vma, vmaddr, size);
index acd11b3..2a356b9 100644 (file)
@@ -379,7 +379,6 @@ static int __init crypto_aegis128_aesni_module_init(void)
 {
        if (!boot_cpu_has(X86_FEATURE_XMM2) ||
            !boot_cpu_has(X86_FEATURE_AES) ||
-           !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
            !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
                return -ENODEV;
 
index 2071c3d..dbe8bb9 100644 (file)
@@ -379,7 +379,6 @@ static int __init crypto_aegis128l_aesni_module_init(void)
 {
        if (!boot_cpu_has(X86_FEATURE_XMM2) ||
            !boot_cpu_has(X86_FEATURE_AES) ||
-           !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
            !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
                return -ENODEV;
 
index b5f2a8f..8bebda2 100644 (file)
@@ -379,7 +379,6 @@ static int __init crypto_aegis256_aesni_module_init(void)
 {
        if (!boot_cpu_has(X86_FEATURE_XMM2) ||
            !boot_cpu_has(X86_FEATURE_AES) ||
-           !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
            !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
                return -ENODEV;
 
index 95cf857..f40244e 100644 (file)
@@ -40,7 +40,6 @@ MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350);
 static int __init crypto_morus1280_sse2_module_init(void)
 {
        if (!boot_cpu_has(X86_FEATURE_XMM2) ||
-           !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
            !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
                return -ENODEV;
 
index 615fb7b..9afaf8f 100644 (file)
@@ -40,7 +40,6 @@ MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400);
 static int __init crypto_morus640_sse2_module_init(void)
 {
        if (!boot_cpu_has(X86_FEATURE_XMM2) ||
-           !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
            !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
                return -ENODEV;
 
index 5b0f613..2c43e30 100644 (file)
@@ -95,8 +95,8 @@ static void hv_apic_eoi_write(u32 reg, u32 val)
  */
 static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector)
 {
-       struct ipi_arg_ex **arg;
-       struct ipi_arg_ex *ipi_arg;
+       struct hv_send_ipi_ex **arg;
+       struct hv_send_ipi_ex *ipi_arg;
        unsigned long flags;
        int nr_bank = 0;
        int ret = 1;
@@ -105,7 +105,7 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector)
                return false;
 
        local_irq_save(flags);
-       arg = (struct ipi_arg_ex **)this_cpu_ptr(hyperv_pcpu_input_arg);
+       arg = (struct hv_send_ipi_ex **)this_cpu_ptr(hyperv_pcpu_input_arg);
 
        ipi_arg = *arg;
        if (unlikely(!ipi_arg))
@@ -135,7 +135,7 @@ ipi_mask_ex_done:
 static bool __send_ipi_mask(const struct cpumask *mask, int vector)
 {
        int cur_cpu, vcpu;
-       struct ipi_arg_non_ex ipi_arg;
+       struct hv_send_ipi ipi_arg;
        int ret = 1;
 
        trace_hyperv_send_ipi_mask(mask, vector);
index e977b6b..00e01d2 100644 (file)
@@ -726,19 +726,21 @@ struct hv_enlightened_vmcs {
 #define HV_STIMER_AUTOENABLE           (1ULL << 3)
 #define HV_STIMER_SINT(config)         (__u8)(((config) >> 16) & 0x0F)
 
-struct ipi_arg_non_ex {
-       u32 vector;
-       u32 reserved;
-       u64 cpu_mask;
-};
-
 struct hv_vpset {
        u64 format;
        u64 valid_bank_mask;
        u64 bank_contents[];
 };
 
-struct ipi_arg_ex {
+/* HvCallSendSyntheticClusterIpi hypercall */
+struct hv_send_ipi {
+       u32 vector;
+       u32 reserved;
+       u64 cpu_mask;
+};
+
+/* HvCallSendSyntheticClusterIpiEx hypercall */
+struct hv_send_ipi_ex {
        u32 vector;
        u32 reserved;
        struct hv_vpset vp_set;
index 8e90488..09b2e3e 100644 (file)
@@ -869,6 +869,8 @@ struct kvm_arch {
 
        bool x2apic_format;
        bool x2apic_broadcast_quirk_disabled;
+
+       bool guest_can_read_msr_platform_info;
 };
 
 struct kvm_vm_stat {
@@ -1022,6 +1024,7 @@ struct kvm_x86_ops {
        void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
        void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
        void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
+       bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu);
        void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
        void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
        void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
@@ -1055,6 +1058,7 @@ struct kvm_x86_ops {
        bool (*umip_emulated)(void);
 
        int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
+       void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
 
        void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
 
@@ -1482,6 +1486,7 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 
 int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
+void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);
 
 int kvm_is_in_guest(void);
 
index 86299ef..fd23d57 100644 (file)
@@ -377,6 +377,7 @@ struct kvm_sync_regs {
 
 #define KVM_X86_QUIRK_LINT0_REENABLED  (1 << 0)
 #define KVM_X86_QUIRK_CD_NW_CLEARED    (1 << 1)
+#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE  (1 << 2)
 
 #define KVM_STATE_NESTED_GUEST_MODE    0x00000001
 #define KVM_STATE_NESTED_RUN_PENDING   0x00000002
index 17c0472..fbb0e6d 100644 (file)
@@ -1344,9 +1344,8 @@ EXPORT_SYMBOL_GPL(kvm_lapic_reg_read);
 
 static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
 {
-       return kvm_apic_hw_enabled(apic) &&
-           addr >= apic->base_address &&
-           addr < apic->base_address + LAPIC_MMIO_LENGTH;
+       return addr >= apic->base_address &&
+               addr < apic->base_address + LAPIC_MMIO_LENGTH;
 }
 
 static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
@@ -1358,6 +1357,15 @@ static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
        if (!apic_mmio_in_range(apic, address))
                return -EOPNOTSUPP;
 
+       if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) {
+               if (!kvm_check_has_quirk(vcpu->kvm,
+                                        KVM_X86_QUIRK_LAPIC_MMIO_HOLE))
+                       return -EOPNOTSUPP;
+
+               memset(data, 0xff, len);
+               return 0;
+       }
+
        kvm_lapic_reg_read(apic, offset, len, data);
 
        return 0;
@@ -1917,6 +1925,14 @@ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
        if (!apic_mmio_in_range(apic, address))
                return -EOPNOTSUPP;
 
+       if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) {
+               if (!kvm_check_has_quirk(vcpu->kvm,
+                                        KVM_X86_QUIRK_LAPIC_MMIO_HOLE))
+                       return -EOPNOTSUPP;
+
+               return 0;
+       }
+
        /*
         * APIC register must be aligned on 128-bits boundary.
         * 32/64/128 bits registers must be accessed thru 32 bits.
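For completeness, the quirk added here is controlled from userspace through KVM_CAP_DISABLE_QUIRKS; disabling KVM_X86_QUIRK_LAPIC_MMIO_HOLE makes accesses to a disabled (or x2APIC-mode) LAPIC's MMIO range fall through instead of reading back as all-ones. A hedged userspace sketch, assuming an existing vm_fd:

    /* Sketch: opt out of the LAPIC MMIO-hole quirk on a VM fd. */
    struct kvm_enable_cap cap = {
            .cap = KVM_CAP_DISABLE_QUIRKS,
            .args = { KVM_X86_QUIRK_LAPIC_MMIO_HOLE },
    };
    ioctl(vm_fd, KVM_ENABLE_CAP, &cap);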
index e24ea70..51b953a 100644 (file)
@@ -249,6 +249,17 @@ static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
  */
 static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
 
+/*
+ * In some cases, we need to preserve the GFN of a non-present or reserved
+ * SPTE when we usurp the upper five bits of the physical address space to
+ * defend against L1TF, e.g. for MMIO SPTEs.  To preserve the GFN, we'll
+ * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
+ * left into the reserved bits, i.e. the GFN in the SPTE will be split into
+ * high and low parts.  This mask covers the lower bits of the GFN.
+ */
+static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
+
 static void mmu_spte_set(u64 *sptep, u64 spte);
 static union kvm_mmu_page_role
 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
@@ -357,9 +368,7 @@ static bool is_mmio_spte(u64 spte)
 
 static gfn_t get_mmio_spte_gfn(u64 spte)
 {
-       u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask |
-                  shadow_nonpresent_or_rsvd_mask;
-       u64 gpa = spte & ~mask;
+       u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
 
        gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
               & shadow_nonpresent_or_rsvd_mask;
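Concretely, the mask added above lets get_mmio_spte_gfn() reassemble a GFN that was split around the L1TF reserved bits when the SPTE was created. A hedged worked sketch of the round trip (taking x86_phys_bits = 46 and the 5-bit mask length as an example):

    /* Illustrative round-trip for x86_phys_bits = 46, len = 5: encode
     * moves GPA bits 45:41 up into SPTE bits 50:46; decode pulls the
     * low GFN bits (40:12) straight out with lower_mask and shifts the
     * high bits back down. */
    u64 encode_mmio_gpa(u64 gpa, u64 rsvd_mask, int len)
    {
            return (gpa & ~rsvd_mask) | ((gpa & rsvd_mask) << len);
    }

    u64 decode_mmio_gpa(u64 spte, u64 rsvd_mask, u64 lower_mask, int len)
    {
            return (spte & lower_mask) | ((spte >> len) & rsvd_mask);
    }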
@@ -423,6 +432,8 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
 static void kvm_mmu_reset_all_pte_masks(void)
 {
+       u8 low_phys_bits;
+
        shadow_user_mask = 0;
        shadow_accessed_mask = 0;
        shadow_dirty_mask = 0;
@@ -437,12 +448,17 @@ static void kvm_mmu_reset_all_pte_masks(void)
         * appropriate mask to guard against L1TF attacks. Otherwise, it is
         * assumed that the CPU is not vulnerable to L1TF.
         */
+       low_phys_bits = boot_cpu_data.x86_phys_bits;
        if (boot_cpu_data.x86_phys_bits <
-           52 - shadow_nonpresent_or_rsvd_mask_len)
+           52 - shadow_nonpresent_or_rsvd_mask_len) {
                shadow_nonpresent_or_rsvd_mask =
                        rsvd_bits(boot_cpu_data.x86_phys_bits -
                                  shadow_nonpresent_or_rsvd_mask_len,
                                  boot_cpu_data.x86_phys_bits - 1);
+               low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len;
+       }
+       shadow_nonpresent_or_rsvd_lower_gfn_mask =
+               GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
 }
 
 static int is_cpuid_PSE36(void)
@@ -899,7 +915,7 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
 {
        /*
         * Make sure the write to vcpu->mode is not reordered in front of
-        * reads to sptes.  If it does, kvm_commit_zap_page() can see us
+        * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
         * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
         */
        smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
@@ -5417,7 +5433,12 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu)
 {
        MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
-       kvm_init_mmu(vcpu, true);
+       /*
+        * kvm_mmu_setup() is called only on vCPU initialization.  
+        * Therefore, no need to reset mmu roots as they are not yet
+        * initialized.
+        */
+       kvm_init_mmu(vcpu, false);
 }
 
 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
index 89c4c5a..d96092b 100644
@@ -1226,8 +1226,7 @@ static __init int sev_hardware_setup(void)
        min_sev_asid = cpuid_edx(0x8000001F);
 
        /* Initialize SEV ASID bitmap */
-       sev_asid_bitmap = kcalloc(BITS_TO_LONGS(max_sev_asid),
-                               sizeof(unsigned long), GFP_KERNEL);
+       sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
        if (!sev_asid_bitmap)
                return 1;
 
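
For reference, bitmap_zalloc()/bitmap_free() from linux/bitmap.h pair as below; a minimal sketch (the bit number is illustrative):

	/* Sketch: allocate a zeroed bitmap of max_sev_asid bits and free it
	 * with the matching helper instead of kfree(). */
	unsigned long *map = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
	if (!map)
		return -ENOMEM;
	set_bit(1, map);	/* mark ASID 1 as allocated */
	bitmap_free(map);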
@@ -1405,7 +1404,7 @@ static __exit void svm_hardware_unsetup(void)
        int cpu;
 
        if (svm_sev_enabled())
-               kfree(sev_asid_bitmap);
+               bitmap_free(sev_asid_bitmap);
 
        for_each_possible_cpu(cpu)
                svm_cpu_uninit(cpu);
@@ -7149,6 +7148,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
        .check_intercept = svm_check_intercept,
        .handle_external_intr = svm_handle_external_intr,
 
+       .request_immediate_exit = __kvm_request_immediate_exit,
+
        .sched_in = svm_sched_in,
 
        .pmu_ops = &amd_pmu_ops,
index 533a327..612fd17 100644
@@ -121,7 +121,6 @@ module_param_named(pml, enable_pml, bool, S_IRUGO);
 
 #define MSR_BITMAP_MODE_X2APIC         1
 #define MSR_BITMAP_MODE_X2APIC_APICV   2
-#define MSR_BITMAP_MODE_LM             4
 
 #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
 
@@ -397,6 +396,7 @@ struct loaded_vmcs {
        int cpu;
        bool launched;
        bool nmi_known_unmasked;
+       bool hv_timer_armed;
        /* Support for vnmi-less CPUs */
        int soft_vnmi_blocked;
        ktime_t entry_time;
@@ -856,6 +856,7 @@ struct nested_vmx {
 
        /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
        u64 vmcs01_debugctl;
+       u64 vmcs01_guest_bndcfgs;
 
        u16 vpid02;
        u16 last_vpid;
@@ -1019,6 +1020,8 @@ struct vcpu_vmx {
        int ple_window;
        bool ple_window_dirty;
 
+       bool req_immediate_exit;
+
        /* Support for PML */
 #define PML_ENTITY_NUM         512
        struct page *pml_pg;
@@ -2864,6 +2867,8 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
        u16 fs_sel, gs_sel;
        int i;
 
+       vmx->req_immediate_exit = false;
+
        if (vmx->loaded_cpu_state)
                return;
 
@@ -2894,8 +2899,7 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
                vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
        }
 
-       if (is_long_mode(&vmx->vcpu))
-               wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+       wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
 #else
        savesegment(fs, fs_sel);
        savesegment(gs, gs_sel);
@@ -2946,8 +2950,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
        vmx->loaded_cpu_state = NULL;
 
 #ifdef CONFIG_X86_64
-       if (is_long_mode(&vmx->vcpu))
-               rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+       rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
 #endif
        if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
                kvm_load_ldt(host_state->ldt_sel);
@@ -2975,24 +2978,19 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
 #ifdef CONFIG_X86_64
 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
 {
-       if (is_long_mode(&vmx->vcpu)) {
-               preempt_disable();
-               if (vmx->loaded_cpu_state)
-                       rdmsrl(MSR_KERNEL_GS_BASE,
-                              vmx->msr_guest_kernel_gs_base);
-               preempt_enable();
-       }
+       preempt_disable();
+       if (vmx->loaded_cpu_state)
+               rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+       preempt_enable();
        return vmx->msr_guest_kernel_gs_base;
 }
 
 static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
 {
-       if (is_long_mode(&vmx->vcpu)) {
-               preempt_disable();
-               if (vmx->loaded_cpu_state)
-                       wrmsrl(MSR_KERNEL_GS_BASE, data);
-               preempt_enable();
-       }
+       preempt_disable();
+       if (vmx->loaded_cpu_state)
+               wrmsrl(MSR_KERNEL_GS_BASE, data);
+       preempt_enable();
        vmx->msr_guest_kernel_gs_base = data;
 }
 #endif
@@ -3528,9 +3526,6 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
                VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
                VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
 
-       if (kvm_mpx_supported())
-               msrs->exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
-
        /* We support free control of debug control saving. */
        msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
 
@@ -3547,8 +3542,6 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
                VM_ENTRY_LOAD_IA32_PAT;
        msrs->entry_ctls_high |=
                (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
-       if (kvm_mpx_supported())
-               msrs->entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
 
        /* We support free control of debug control loading. */
        msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
@@ -3596,12 +3589,12 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
                msrs->secondary_ctls_high);
        msrs->secondary_ctls_low = 0;
        msrs->secondary_ctls_high &=
-               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
                SECONDARY_EXEC_DESC |
                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
                SECONDARY_EXEC_APIC_REGISTER_VIRT |
                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
                SECONDARY_EXEC_WBINVD_EXITING;
+
        /*
         * We can emulate "VMCS shadowing," even if the hardware
         * doesn't support it.
@@ -3658,6 +3651,10 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
                msrs->secondary_ctls_high |=
                        SECONDARY_EXEC_UNRESTRICTED_GUEST;
 
+       if (flexpriority_enabled)
+               msrs->secondary_ctls_high |=
+                       SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+
        /* miscellaneous data */
        rdmsr(MSR_IA32_VMX_MISC,
                msrs->misc_low,
@@ -5068,19 +5065,6 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
        if (!msr)
                return;
 
-       /*
-        * MSR_KERNEL_GS_BASE is not intercepted when the guest is in
-        * 64-bit mode as a 64-bit kernel may frequently access the
-        * MSR.  This means we need to manually save/restore the MSR
-        * when switching between guest and host state, but only if
-        * the guest is in 64-bit mode.  Sync our cached value if the
-        * guest is transitioning to 32-bit mode and the CPU contains
-        * guest state, i.e. the cache is stale.
-        */
-#ifdef CONFIG_X86_64
-       if (!(efer & EFER_LMA))
-               (void)vmx_read_guest_kernel_gs_base(vmx);
-#endif
        vcpu->arch.efer = efer;
        if (efer & EFER_LMA) {
                vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
@@ -5393,9 +5377,10 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                 * To use VMXON (and later other VMX instructions), a guest
                 * must first be able to turn on cr4.VMXE (see handle_vmon()).
                 * So basically the check on whether to allow nested VMX
-                * is here.
+                * is here.  We operate under the default treatment of SMM,
+                * so VMX cannot be enabled under SMM.
                 */
-               if (!nested_vmx_allowed(vcpu))
+               if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
                        return 1;
        }
 
@@ -6072,9 +6057,6 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
                        mode |= MSR_BITMAP_MODE_X2APIC_APICV;
        }
 
-       if (is_long_mode(vcpu))
-               mode |= MSR_BITMAP_MODE_LM;
-
        return mode;
 }
 
@@ -6115,9 +6097,6 @@ static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
        if (!changed)
                return;
 
-       vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
-                                 !(mode & MSR_BITMAP_MODE_LM));
-
        if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
                vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
 
@@ -6183,6 +6162,32 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
        nested_mark_vmcs12_pages_dirty(vcpu);
 }
 
+static u8 vmx_get_rvi(void)
+{
+       return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
+}
+
+static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       void *vapic_page;
+       u32 vppr;
+       int rvi;
+
+       if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
+               !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
+               WARN_ON_ONCE(!vmx->nested.virtual_apic_page))
+               return false;
+
+       rvi = vmx_get_rvi();
+
+       vapic_page = kmap(vmx->nested.virtual_apic_page);
+       vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
+       kunmap(vmx->nested.virtual_apic_page);
+
+       return ((rvi & 0xf0) > (vppr & 0xf0));
+}
+
 static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
                                                     bool nested)
 {
@@ -7966,6 +7971,9 @@ static __init int hardware_setup(void)
                kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
        }
 
+       if (!cpu_has_vmx_preemption_timer())
+               kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
+
        if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
                u64 vmx_msr;
 
@@ -9208,7 +9216,8 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
 
 static int handle_preemption_timer(struct kvm_vcpu *vcpu)
 {
-       kvm_lapic_expired_hv_timer(vcpu);
+       if (!to_vmx(vcpu)->req_immediate_exit)
+               kvm_lapic_expired_hv_timer(vcpu);
        return 1;
 }
 
@@ -10214,15 +10223,16 @@ static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
        if (!lapic_in_kernel(vcpu))
                return;
 
+       if (!flexpriority_enabled &&
+           !cpu_has_vmx_virtualize_x2apic_mode())
+               return;
+
        /* Postpone execution until vmcs01 is the current VMCS. */
        if (is_guest_mode(vcpu)) {
                to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true;
                return;
        }
 
-       if (!cpu_need_tpr_shadow(vcpu))
-               return;
-
        sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
        sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
                              SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
@@ -10344,6 +10354,14 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
        return max_irr;
 }
 
+static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
+{
+       u8 rvi = vmx_get_rvi();
+       u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
+
+       return ((rvi & 0xf0) > (vppr & 0xf0));
+}
+
 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
 {
        if (!kvm_vcpu_apicv_active(vcpu))
@@ -10595,24 +10613,43 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
                                        msrs[i].host, false);
 }
 
-static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
+static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val)
+{
+       vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val);
+       if (!vmx->loaded_vmcs->hv_timer_armed)
+               vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+                             PIN_BASED_VMX_PREEMPTION_TIMER);
+       vmx->loaded_vmcs->hv_timer_armed = true;
+}
+
+static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u64 tscl;
        u32 delta_tsc;
 
-       if (vmx->hv_deadline_tsc == -1)
+       if (vmx->req_immediate_exit) {
+               vmx_arm_hv_timer(vmx, 0);
                return;
+       }
 
-       tscl = rdtsc();
-       if (vmx->hv_deadline_tsc > tscl)
-               /* sure to be 32 bit only because checked on set_hv_timer */
-               delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
-                       cpu_preemption_timer_multi);
-       else
-               delta_tsc = 0;
+       if (vmx->hv_deadline_tsc != -1) {
+               tscl = rdtsc();
+               if (vmx->hv_deadline_tsc > tscl)
+                       /* set_hv_timer ensures the delta fits in 32-bits */
+                       delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
+                               cpu_preemption_timer_multi);
+               else
+                       delta_tsc = 0;
+
+               vmx_arm_hv_timer(vmx, delta_tsc);
+               return;
+       }
 
-       vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+       if (vmx->loaded_vmcs->hv_timer_armed)
+               vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+                               PIN_BASED_VMX_PREEMPTION_TIMER);
+       vmx->loaded_vmcs->hv_timer_armed = false;
 }
 
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
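
Taken together, vmx_update_hv_timer() resolves one of three states on every entry; a compressed sketch, with arm()/disarm() standing in for the vmcs_write32()/vmcs_set_bits()/vmcs_clear_bits() calls above:

	static void arm(u32 val)   { /* write timer value, set pin-based bit */ }
	static void disarm(void)   { /* clear pin-based bit */ }

	static void sketch_update_timer(bool immediate, u64 deadline,
					u64 now, int shift)
	{
		if (immediate)
			arm(0);		/* exits before any guest instruction */
		else if (deadline != -1ULL)
			arm(deadline > now ? (u32)((deadline - now) >> shift) : 0);
		else
			disarm();	/* nothing pending */
	}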
@@ -10672,7 +10709,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
        atomic_switch_perf_msrs(vmx);
 
-       vmx_arm_hv_timer(vcpu);
+       vmx_update_hv_timer(vcpu);
 
        /*
         * If this vCPU has touched SPEC_CTRL, restore the guest's value if
@@ -11214,6 +11251,23 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
 #undef cr4_fixed1_update
 }
 
+static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (kvm_mpx_supported()) {
+               bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
+
+               if (mpx_enabled) {
+                       vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
+                       vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
+               } else {
+                       vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
+                       vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
+               }
+       }
+}
+
 static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -11230,8 +11284,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
                to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
                        ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
 
-       if (nested_vmx_allowed(vcpu))
+       if (nested_vmx_allowed(vcpu)) {
                nested_vmx_cr_fixed1_bits_update(vcpu);
+               nested_vmx_entry_exit_ctls_update(vcpu);
+       }
 }
 
 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -11427,16 +11483,18 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
        u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-       if (vcpu->arch.virtual_tsc_khz == 0)
-               return;
-
-       /* Make sure short timeouts reliably trigger an immediate vmexit.
-        * hrtimer_start does not guarantee this. */
-       if (preemption_timeout <= 1) {
+       /*
+        * A timer value of zero is architecturally guaranteed to cause
+        * a VMExit prior to executing any instructions in the guest.
+        */
+       if (preemption_timeout == 0) {
                vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
                return;
        }
 
+       if (vcpu->arch.virtual_tsc_khz == 0)
+               return;
+
        preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
        preemption_timeout *= 1000000;
        do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
@@ -11646,11 +11704,15 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
         * bits 15:8 should be zero in posted_intr_nv,
         * the descriptor address has already been checked
         * in nested_get_vmcs12_pages.
+        *
+        * bits 5:0 of posted_intr_desc_addr should be zero.
         */
        if (nested_cpu_has_posted_intr(vmcs12) &&
           (!nested_cpu_has_vid(vmcs12) ||
            !nested_exit_intr_ack_set(vcpu) ||
-           vmcs12->posted_intr_nv & 0xff00))
+           (vmcs12->posted_intr_nv & 0xff00) ||
+           (vmcs12->posted_intr_desc_addr & 0x3f) ||
+           (!page_address_valid(vcpu, vmcs12->posted_intr_desc_addr))))
                return -EINVAL;
 
        /* tpr shadow is needed by all apicv features. */
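
The new 0x3f test enforces the 64-byte alignment of the posted-interrupt descriptor; for example:

	/* Sketch: a valid descriptor address has bits 5:0 clear. */
	static bool pid_misaligned(u64 addr)
	{
		return addr & 0x3f;	/* 0x1010 -> true, 0x1040 -> false */
	}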
@@ -11993,8 +12055,13 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
        set_cr4_guest_host_mask(vmx);
 
-       if (vmx_mpx_supported())
-               vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+       if (kvm_mpx_supported()) {
+               if (vmx->nested.nested_run_pending &&
+                       (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+                       vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+               else
+                       vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
+       }
 
        if (enable_vpid) {
                if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
@@ -12076,11 +12143,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 
        exec_control = vmcs12->pin_based_vm_exec_control;
 
-       /* Preemption timer setting is only taken from vmcs01.  */
-       exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+       /* Preemption timer setting is computed directly in vmx_vcpu_run.  */
        exec_control |= vmcs_config.pin_based_exec_ctrl;
-       if (vmx->hv_deadline_tsc == -1)
-               exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+       exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+       vmx->loaded_vmcs->hv_timer_armed = false;
 
        /* Posted interrupts setting is only taken from vmcs12.  */
        if (nested_cpu_has_posted_intr(vmcs12)) {
@@ -12318,6 +12384,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
            vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
+       if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)
+               return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
        if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12))
                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
@@ -12537,15 +12606,21 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        bool from_vmentry = !!exit_qual;
        u32 dummy_exit_qual;
-       u32 vmcs01_cpu_exec_ctrl;
+       bool evaluate_pending_interrupts;
        int r = 0;
 
-       vmcs01_cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+       evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
+               (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
+       if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
+               evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
 
        enter_guest_mode(vcpu);
 
        if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
                vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+       if (kvm_mpx_supported() &&
+               !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+               vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
 
        vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
        vmx_segment_cache_clear(vmx);
@@ -12585,16 +12660,14 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
         * to L1 or delivered directly to L2 (e.g. in case L1 doesn't
         * intercept EXTERNAL_INTERRUPT).
         *
-        * Usually this would be handled by L0 requesting a
-        * IRQ/NMI window by setting VMCS accordingly. However,
-        * this setting was done on VMCS01 and now VMCS02 is active
-        * instead. Thus, we force L0 to perform pending event
-        * evaluation by requesting a KVM_REQ_EVENT.
-        */
-       if (vmcs01_cpu_exec_ctrl &
-               (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING)) {
+        * Usually this would be handled by the processor noticing an
+        * IRQ/NMI window request, or checking RVI during evaluation of
+        * pending virtual interrupts.  However, this setting was done
+        * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
+        * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
+        */
+       if (unlikely(evaluate_pending_interrupts))
                kvm_make_request(KVM_REQ_EVENT, vcpu);
-       }
 
        /*
         * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
@@ -12863,6 +12936,11 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
        return 0;
 }
 
+static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
+{
+       to_vmx(vcpu)->req_immediate_exit = true;
+}
+
 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
 {
        ktime_t remaining =
@@ -13253,12 +13331,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
        vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
-       if (vmx->hv_deadline_tsc == -1)
-               vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
-                               PIN_BASED_VMX_PREEMPTION_TIMER);
-       else
-               vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
-                             PIN_BASED_VMX_PREEMPTION_TIMER);
+
        if (kvm_has_tsc_control)
                decache_tsc_multiplier(vmx);
 
@@ -13462,18 +13535,12 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
                return -ERANGE;
 
        vmx->hv_deadline_tsc = tscl + delta_tsc;
-       vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
-                       PIN_BASED_VMX_PREEMPTION_TIMER);
-
        return delta_tsc == 0;
 }
 
 static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
 {
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       vmx->hv_deadline_tsc = -1;
-       vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
-                       PIN_BASED_VMX_PREEMPTION_TIMER);
+       to_vmx(vcpu)->hv_deadline_tsc = -1;
 }
 #endif
 
@@ -13954,6 +14021,14 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
            ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
                return -EINVAL;
 
+       /*
+        * SMM temporarily disables VMX, so we cannot be in guest mode,
+        * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
+        * must be zero.
+        */
+       if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags)
+               return -EINVAL;
+
        if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
            !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
                return -EINVAL;
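
The ternary added above compresses a cross-check; spelled out as an equivalent sketch:

	/* Sketch: in SMM no outer nested flags may be set (VMX is off);
	 * outside SMM no SMM flags may be set. */
	if (is_smm(vcpu)) {
		if (kvm_state->flags)			/* guest mode / run pending */
			return -EINVAL;
	} else {
		if (kvm_state->vmx.smm.flags)		/* stale SMM state */
			return -EINVAL;
	}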
@@ -14097,6 +14172,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .apicv_post_state_restore = vmx_apicv_post_state_restore,
        .hwapic_irr_update = vmx_hwapic_irr_update,
        .hwapic_isr_update = vmx_hwapic_isr_update,
+       .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
        .sync_pir_to_irr = vmx_sync_pir_to_irr,
        .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
 
@@ -14130,6 +14206,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .umip_emulated = vmx_umip_emulated,
 
        .check_nested_events = vmx_check_nested_events,
+       .request_immediate_exit = vmx_request_immediate_exit,
 
        .sched_in = vmx_sched_in,
 
index 542f631..ca71773 100644
@@ -628,7 +628,7 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu)
        gfn_t gfn;
        int r;
 
-       if (is_long_mode(vcpu) || !is_pae(vcpu))
+       if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu))
                return false;
 
        if (!test_bit(VCPU_EXREG_PDPTR,
@@ -2537,7 +2537,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                break;
        case MSR_PLATFORM_INFO:
                if (!msr_info->host_initiated ||
-                   data & ~MSR_PLATFORM_INFO_CPUID_FAULT ||
                    (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
                     cpuid_fault_enabled(vcpu)))
                        return 1;
@@ -2780,6 +2779,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                msr_info->data = vcpu->arch.osvw.status;
                break;
        case MSR_PLATFORM_INFO:
+               if (!msr_info->host_initiated &&
+                   !vcpu->kvm->arch.guest_can_read_msr_platform_info)
+                       return 1;
                msr_info->data = vcpu->arch.msr_platform_info;
                break;
        case MSR_MISC_FEATURES_ENABLES:
@@ -2927,6 +2929,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_SPLIT_IRQCHIP:
        case KVM_CAP_IMMEDIATE_EXIT:
        case KVM_CAP_GET_MSR_FEATURES:
+       case KVM_CAP_MSR_PLATFORM_INFO:
                r = 1;
                break;
        case KVM_CAP_SYNC_REGS:
@@ -4007,19 +4010,23 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                        break;
 
                BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
+               r = -EFAULT;
                if (get_user(user_data_size, &user_kvm_nested_state->size))
-                       return -EFAULT;
+                       break;
 
                r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state,
                                                  user_data_size);
                if (r < 0)
-                       return r;
+                       break;
 
                if (r > user_data_size) {
                        if (put_user(r, &user_kvm_nested_state->size))
-                               return -EFAULT;
-                       return -E2BIG;
+                               r = -EFAULT;
+                       else
+                               r = -E2BIG;
+                       break;
                }
+
                r = 0;
                break;
        }
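
The switch from early returns to break follows the usual ioctl pattern: preset r, then break so every exit flows through the common cleanup after the switch. A minimal sketch (val and uptr are illustrative):

	r = -EFAULT;			/* preset the failure code */
	if (get_user(val, uptr))
		break;			/* drop to the common cleanup; return r */
	r = 0;				/* success overwrites the preset */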
@@ -4031,19 +4038,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                if (!kvm_x86_ops->set_nested_state)
                        break;
 
+               r = -EFAULT;
                if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
-                       return -EFAULT;
+                       break;
 
+               r = -EINVAL;
                if (kvm_state.size < sizeof(kvm_state))
-                       return -EINVAL;
+                       break;
 
                if (kvm_state.flags &
                    ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE))
-                       return -EINVAL;
+                       break;
 
                /* nested_run_pending implies guest_mode.  */
                if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING)
-                       return -EINVAL;
+                       break;
 
                r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
                break;
@@ -4350,6 +4359,10 @@ split_irqchip_unlock:
                        kvm->arch.pause_in_guest = true;
                r = 0;
                break;
+       case KVM_CAP_MSR_PLATFORM_INFO:
+               kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
+               r = 0;
+               break;
        default:
                r = -EINVAL;
                break;
@@ -4685,7 +4698,7 @@ static void kvm_init_msr_list(void)
                 */
                switch (msrs_to_save[i]) {
                case MSR_IA32_BNDCFGS:
-                       if (!kvm_x86_ops->mpx_supported())
+                       if (!kvm_mpx_supported())
                                continue;
                        break;
                case MSR_TSC_AUX:
@@ -7361,6 +7374,12 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
 
+void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
+{
+       smp_send_reschedule(vcpu->cpu);
+}
+EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
+
 /*
  * Returns 1 to let vcpu_run() continue the guest execution loop without
  * exiting to the userspace.  Otherwise, the value will be returned to the
@@ -7565,7 +7584,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
        if (req_immediate_exit) {
                kvm_make_request(KVM_REQ_EVENT, vcpu);
-               smp_send_reschedule(vcpu->cpu);
+               kvm_x86_ops->request_immediate_exit(vcpu);
        }
 
        trace_kvm_entry(vcpu->vcpu_id);
@@ -7829,6 +7848,29 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+/* Swap (qemu) user FPU context for the guest FPU context. */
+static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
+{
+       preempt_disable();
+       copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
+       /* PKRU is separately restored in kvm_x86_ops->run.  */
+       __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
+                               ~XFEATURE_MASK_PKRU);
+       preempt_enable();
+       trace_kvm_fpu(1);
+}
+
+/* When vcpu_run ends, restore user space FPU context. */
+static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
+{
+       preempt_disable();
+       copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
+       copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state);
+       preempt_enable();
+       ++vcpu->stat.fpu_reload;
+       trace_kvm_fpu(0);
+}
+
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        int r;
@@ -8177,7 +8219,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
                kvm_update_cpuid(vcpu);
 
        idx = srcu_read_lock(&vcpu->kvm->srcu);
-       if (!is_long_mode(vcpu) && is_pae(vcpu)) {
+       if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) {
                load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
                mmu_reset_needed = 1;
        }
@@ -8406,29 +8448,6 @@ static void fx_init(struct kvm_vcpu *vcpu)
        vcpu->arch.cr0 |= X86_CR0_ET;
 }
 
-/* Swap (qemu) user FPU context for the guest FPU context. */
-void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
-{
-       preempt_disable();
-       copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
-       /* PKRU is separately restored in kvm_x86_ops->run.  */
-       __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
-                               ~XFEATURE_MASK_PKRU);
-       preempt_enable();
-       trace_kvm_fpu(1);
-}
-
-/* When vcpu_run ends, restore user space FPU context. */
-void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
-{
-       preempt_disable();
-       copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
-       copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state);
-       preempt_enable();
-       ++vcpu->stat.fpu_reload;
-       trace_kvm_fpu(0);
-}
-
 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 {
        void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
@@ -8852,6 +8871,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
        pvclock_update_vm_gtod_copy(kvm);
 
+       kvm->arch.guest_can_read_msr_platform_info = true;
+
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
        INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
 
@@ -9200,6 +9221,13 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
        kvm_page_track_flush_slot(kvm, slot);
 }
 
+static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+       return (is_guest_mode(vcpu) &&
+                       kvm_x86_ops->guest_apic_has_interrupt &&
+                       kvm_x86_ops->guest_apic_has_interrupt(vcpu));
+}
+
 static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 {
        if (!list_empty_careful(&vcpu->async_pf.done))
@@ -9224,7 +9252,8 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
                return true;
 
        if (kvm_arch_interrupt_allowed(vcpu) &&
-           kvm_cpu_has_interrupt(vcpu))
+           (kvm_cpu_has_interrupt(vcpu) ||
+           kvm_guest_apic_has_interrupt(vcpu)))
                return true;
 
        if (kvm_hv_has_stimer_pending(vcpu))
index 599e01b..a9dd4ea 100644
@@ -5359,10 +5359,20 @@ void ata_qc_complete(struct ata_queued_cmd *qc)
  */
 int ata_qc_complete_multiple(struct ata_port *ap, u64 qc_active)
 {
+       u64 done_mask, ap_qc_active = ap->qc_active;
        int nr_done = 0;
-       u64 done_mask;
 
-       done_mask = ap->qc_active ^ qc_active;
+       /*
+        * If the internal tag is set on ap->qc_active, then we care about
+        * bit0 on the passed in qc_active mask. Move that bit up to match
+        * bit 0 of the passed-in qc_active mask. Move that bit up to match
+        */
+       if (ap_qc_active & (1ULL << ATA_TAG_INTERNAL)) {
+               qc_active |= (qc_active & 0x01) << ATA_TAG_INTERNAL;
+               qc_active ^= qc_active & 0x01;
+       }
+
+       done_mask = ap_qc_active ^ qc_active;
 
        if (unlikely(done_mask & qc_active)) {
                ata_port_err(ap, "illegal qc_active transition (%08llx->%08llx)\n",
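
A worked example of the bit move, assuming ATA_TAG_INTERNAL is 32:

	/* Sketch: the caller reports the internal command in bit 0; relocate
	 * it to the internal tag's bit and clear bit 0 before diffing. */
	u64 qc_active = 0x1;			/* internal command completed */
	qc_active |= (qc_active & 0x01) << 32;	/* bit 32 now set */
	qc_active ^= qc_active & 0x01;		/* bit 0 cleared: 1ULL << 32 */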
index 48f6227..f2b6f4d 100644
@@ -3467,6 +3467,9 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
                                          (struct floppy_struct **)&outparam);
                if (ret)
                        return ret;
+               memcpy(&inparam.g, outparam,
+                               offsetof(struct floppy_struct, name));
+               outparam = &inparam.g;
                break;
        case FDMSGON:
                UDP->flags |= FTD_MSG;
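
The offsetof() bound is the point of the fix: the copy stops just before the name pointer, so no kernel address reaches the buffer later copied to user space. A sketch (kernel_geometry is illustrative):

	/* Sketch: copy only the geometry fields that precede 'name'; the
	 * kernel pointer in 'name' is never copied out. */
	struct floppy_struct g;
	memcpy(&g, kernel_geometry, offsetof(struct floppy_struct, name));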
index 963bb03..ea6238e 100644
@@ -543,6 +543,8 @@ static void hci_uart_tty_close(struct tty_struct *tty)
        }
        clear_bit(HCI_UART_PROTO_SET, &hu->flags);
 
+       percpu_free_rwsem(&hu->proto_lock);
+
        kfree(hu);
 }
 
index 08ef699..d977193 100644
@@ -55,6 +55,7 @@ struct clk_plt_data {
        u8 nparents;
        struct clk_plt *clks[PMC_CLK_NUM];
        struct clk_lookup *mclk_lookup;
+       struct clk_lookup *ether_clk_lookup;
 };
 
 /* Return an index in parent table */
@@ -186,13 +187,6 @@ static struct clk_plt *plt_clk_register(struct platform_device *pdev, int id,
        pclk->reg = base + PMC_CLK_CTL_OFFSET + id * PMC_CLK_CTL_SIZE;
        spin_lock_init(&pclk->lock);
 
-       /*
-        * If the clock was already enabled by the firmware mark it as critical
-        * to avoid it being gated by the clock framework if no driver owns it.
-        */
-       if (plt_clk_is_enabled(&pclk->hw))
-               init.flags |= CLK_IS_CRITICAL;
-
        ret = devm_clk_hw_register(&pdev->dev, &pclk->hw);
        if (ret) {
                pclk = ERR_PTR(ret);
@@ -351,11 +345,20 @@ static int plt_clk_probe(struct platform_device *pdev)
                goto err_unreg_clk_plt;
        }
 
+       data->ether_clk_lookup = clkdev_hw_create(&data->clks[4]->hw,
+                                                 "ether_clk", NULL);
+       if (!data->ether_clk_lookup) {
+               err = -ENOMEM;
+               goto err_drop_mclk;
+       }
+
        plt_clk_free_parent_names_loop(parent_names, data->nparents);
 
        platform_set_drvdata(pdev, data);
        return 0;
 
+err_drop_mclk:
+       clkdev_drop(data->mclk_lookup);
 err_unreg_clk_plt:
        plt_clk_unregister_loop(data, i);
        plt_clk_unregister_parents(data);
@@ -369,6 +372,7 @@ static int plt_clk_remove(struct platform_device *pdev)
 
        data = platform_get_drvdata(pdev);
 
+       clkdev_drop(data->ether_clk_lookup);
        clkdev_drop(data->mclk_lookup);
        plt_clk_unregister_loop(data, PMC_CLK_NUM);
        plt_clk_unregister_parents(data);
index 218739b..72790d8 100644
@@ -38,6 +38,17 @@ static DEFINE_MUTEX(sev_cmd_mutex);
 static struct sev_misc_dev *misc_dev;
 static struct psp_device *psp_master;
 
+static int psp_cmd_timeout = 100;
+module_param(psp_cmd_timeout, int, 0644);
+MODULE_PARM_DESC(psp_cmd_timeout, " default timeout value, in seconds, for PSP commands");
+
+static int psp_probe_timeout = 5;
+module_param(psp_probe_timeout, int, 0644);
+MODULE_PARM_DESC(psp_probe_timeout, " default timeout value, in seconds, during PSP device probe");
+
+static bool psp_dead;
+static int psp_timeout;
+
 static struct psp_device *psp_alloc_struct(struct sp_device *sp)
 {
        struct device *dev = sp->dev;
@@ -82,10 +93,19 @@ done:
        return IRQ_HANDLED;
 }
 
-static void sev_wait_cmd_ioc(struct psp_device *psp, unsigned int *reg)
+static int sev_wait_cmd_ioc(struct psp_device *psp,
+                           unsigned int *reg, unsigned int timeout)
 {
-       wait_event(psp->sev_int_queue, psp->sev_int_rcvd);
+       int ret;
+
+       ret = wait_event_timeout(psp->sev_int_queue,
+                       psp->sev_int_rcvd, timeout * HZ);
+       if (!ret)
+               return -ETIMEDOUT;
+
        *reg = ioread32(psp->io_regs + psp->vdata->cmdresp_reg);
+
+       return 0;
 }
 
 static int sev_cmd_buffer_len(int cmd)
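
wait_event_timeout() returns 0 on timeout and the remaining jiffies otherwise, which is what the new mapping to -ETIMEDOUT relies on:

	/* Sketch of the pattern used above. */
	long left = wait_event_timeout(psp->sev_int_queue, psp->sev_int_rcvd,
				       timeout * HZ);
	if (!left)
		return -ETIMEDOUT;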
@@ -133,12 +153,15 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
        if (!psp)
                return -ENODEV;
 
+       if (psp_dead)
+               return -EBUSY;
+
        /* Get the physical address of the command buffer */
        phys_lsb = data ? lower_32_bits(__psp_pa(data)) : 0;
        phys_msb = data ? upper_32_bits(__psp_pa(data)) : 0;
 
-       dev_dbg(psp->dev, "sev command id %#x buffer 0x%08x%08x\n",
-               cmd, phys_msb, phys_lsb);
+       dev_dbg(psp->dev, "sev command id %#x buffer 0x%08x%08x timeout %us\n",
+               cmd, phys_msb, phys_lsb, psp_timeout);
 
        print_hex_dump_debug("(in):  ", DUMP_PREFIX_OFFSET, 16, 2, data,
                             sev_cmd_buffer_len(cmd), false);
@@ -154,7 +177,18 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
        iowrite32(reg, psp->io_regs + psp->vdata->cmdresp_reg);
 
        /* wait for command completion */
-       sev_wait_cmd_ioc(psp, &reg);
+       ret = sev_wait_cmd_ioc(psp, &reg, psp_timeout);
+       if (ret) {
+               if (psp_ret)
+                       *psp_ret = 0;
+
+               dev_err(psp->dev, "sev command %#x timed out, disabling PSP\n", cmd);
+               psp_dead = true;
+
+               return ret;
+       }
+
+       psp_timeout = psp_cmd_timeout;
 
        if (psp_ret)
                *psp_ret = reg & PSP_CMDRESP_ERR_MASK;
@@ -888,6 +922,8 @@ void psp_pci_init(void)
 
        psp_master = sp->psp_data;
 
+       psp_timeout = psp_probe_timeout;
+
        if (sev_get_api_version())
                goto err;
 
index f8bbbb3..0c791e3 100644
@@ -272,7 +272,7 @@ void amdgpu_amdkfd_gpu_reset(struct kgd_dev *kgd)
 
 int alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
                        void **mem_obj, uint64_t *gpu_addr,
-                       void **cpu_ptr)
+                       void **cpu_ptr, bool mqd_gfx9)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
        struct amdgpu_bo *bo = NULL;
@@ -287,6 +287,10 @@ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
        bp.flags = AMDGPU_GEM_CREATE_CPU_GTT_USWC;
        bp.type = ttm_bo_type_kernel;
        bp.resv = NULL;
+
+       if (mqd_gfx9)
+               bp.flags |= AMDGPU_GEM_CREATE_MQD_GFX9;
+
        r = amdgpu_bo_create(adev, &bp, &bo);
        if (r) {
                dev_err(adev->dev,
index 2f379c1..cc9aeab 100644
@@ -136,7 +136,7 @@ void amdgpu_amdkfd_gpu_reset(struct kgd_dev *kgd);
 /* Shared API */
 int alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
                        void **mem_obj, uint64_t *gpu_addr,
-                       void **cpu_ptr);
+                       void **cpu_ptr, bool mqd_gfx9);
 void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj);
 void get_local_mem_info(struct kgd_dev *kgd,
                        struct kfd_local_mem_info *mem_info);
index ea3f698..9803b91 100644
@@ -685,7 +685,7 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
 
        while (true) {
                temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS);
-               if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT)
+               if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
                        break;
                if (time_after(jiffies, end_jiffies))
                        return -ETIME;
index 693ec5e..8816c69 100644
@@ -367,12 +367,14 @@ static int amdgpu_cgs_get_firmware_info(struct cgs_device *cgs_device,
                                break;
                        case CHIP_POLARIS10:
                                if (type == CGS_UCODE_ID_SMU) {
-                                       if ((adev->pdev->device == 0x67df) &&
-                                           ((adev->pdev->revision == 0xe0) ||
-                                            (adev->pdev->revision == 0xe3) ||
-                                            (adev->pdev->revision == 0xe4) ||
-                                            (adev->pdev->revision == 0xe5) ||
-                                            (adev->pdev->revision == 0xe7) ||
+                                       if (((adev->pdev->device == 0x67df) &&
+                                            ((adev->pdev->revision == 0xe0) ||
+                                             (adev->pdev->revision == 0xe3) ||
+                                             (adev->pdev->revision == 0xe4) ||
+                                             (adev->pdev->revision == 0xe5) ||
+                                             (adev->pdev->revision == 0xe7) ||
+                                             (adev->pdev->revision == 0xef))) ||
+                                           ((adev->pdev->device == 0x6fdf) &&
                                             (adev->pdev->revision == 0xef))) {
                                                info->is_kicker = true;
                                                strcpy(fw_name, "amdgpu/polaris10_k_smc.bin");
index 8843a06..0f41d86 100644
@@ -740,6 +740,7 @@ static const struct pci_device_id pciidlist[] = {
        {0x1002, 0x67CA, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10},
        {0x1002, 0x67CC, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10},
        {0x1002, 0x67CF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10},
+       {0x1002, 0x6FDF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS10},
        /* Polaris12 */
        {0x1002, 0x6980, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12},
        {0x1002, 0x6981, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12},
index 1b04871..29ac74f 100644
@@ -457,7 +457,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 
        if (kfd->kfd2kgd->init_gtt_mem_allocation(
                        kfd->kgd, size, &kfd->gtt_mem,
-                       &kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr)){
+                       &kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr,
+                       false)) {
                dev_err(kfd_device, "Could not allocate %d bytes\n", size);
                goto out;
        }
index 7a61f38..0149475 100644
@@ -62,9 +62,20 @@ int kfd_iommu_device_init(struct kfd_dev *kfd)
        struct amd_iommu_device_info iommu_info;
        unsigned int pasid_limit;
        int err;
+       struct kfd_topology_device *top_dev;
 
-       if (!kfd->device_info->needs_iommu_device)
+       top_dev = kfd_topology_device_by_id(kfd->id);
+
+       /*
+        * Overwrite the ATS capability based on needs_iommu_device, to fix
+        * a potentially missing ATS bit in the CRAT provided by the BIOS.
+        */
+       if (!kfd->device_info->needs_iommu_device) {
+               top_dev->node_props.capability &= ~HSA_CAP_ATS_PRESENT;
                return 0;
+       }
+
+       top_dev->node_props.capability |= HSA_CAP_ATS_PRESENT;
 
        iommu_info.flags = 0;
        err = amd_iommu_device_info(kfd->pdev, &iommu_info);
index f5fc367..0cedb37 100644
@@ -88,7 +88,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
                                ALIGN(sizeof(struct v9_mqd), PAGE_SIZE),
                        &((*mqd_mem_obj)->gtt_mem),
                        &((*mqd_mem_obj)->gpu_addr),
-                       (void *)&((*mqd_mem_obj)->cpu_ptr));
+                       (void *)&((*mqd_mem_obj)->cpu_ptr), true);
        } else
                retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd),
                                mqd_mem_obj);
index f971710..92b285c 100644
@@ -806,6 +806,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu);
 int kfd_topology_remove_device(struct kfd_dev *gpu);
 struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
                                                uint32_t proximity_domain);
+struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id);
 struct kfd_dev *kfd_device_by_id(uint32_t gpu_id);
 struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev);
 int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev);
index bc95d4d..80f5db4 100644
@@ -63,22 +63,33 @@ struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
        return device;
 }
 
-struct kfd_dev *kfd_device_by_id(uint32_t gpu_id)
+struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id)
 {
-       struct kfd_topology_device *top_dev;
-       struct kfd_dev *device = NULL;
+       struct kfd_topology_device *top_dev = NULL;
+       struct kfd_topology_device *ret = NULL;
 
        down_read(&topology_lock);
 
        list_for_each_entry(top_dev, &topology_device_list, list)
                if (top_dev->gpu_id == gpu_id) {
-                       device = top_dev->gpu;
+                       ret = top_dev;
                        break;
                }
 
        up_read(&topology_lock);
 
-       return device;
+       return ret;
+}
+
+struct kfd_dev *kfd_device_by_id(uint32_t gpu_id)
+{
+       struct kfd_topology_device *top_dev;
+
+       top_dev = kfd_topology_device_by_id(gpu_id);
+       if (!top_dev)
+               return NULL;
+
+       return top_dev->gpu;
 }
 
 struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev)
index 14391b0..43b82e1 100644
@@ -292,7 +292,7 @@ struct tile_config {
 struct kfd2kgd_calls {
        int (*init_gtt_mem_allocation)(struct kgd_dev *kgd, size_t size,
                                        void **mem_obj, uint64_t *gpu_addr,
-                                       void **cpu_ptr);
+                                       void **cpu_ptr, bool mqd_gfx9);
 
        void (*free_gtt_mem)(struct kgd_dev *kgd, void *mem_obj);
 
index 3eb061e..018fcdb 100644
@@ -2067,7 +2067,7 @@ static void __drm_state_dump(struct drm_device *dev, struct drm_printer *p,
        struct drm_connector *connector;
        struct drm_connector_list_iter conn_iter;
 
-       if (!drm_core_check_feature(dev, DRIVER_ATOMIC))
+       if (!drm_drv_uses_atomic_modeset(dev))
                return;
 
        list_for_each_entry(plane, &config->plane_list, head) {
index 6f28fe5..373bd4c 100644
@@ -151,7 +151,7 @@ int drm_debugfs_init(struct drm_minor *minor, int minor_id,
                return ret;
        }
 
-       if (drm_core_check_feature(dev, DRIVER_ATOMIC)) {
+       if (drm_drv_uses_atomic_modeset(dev)) {
                ret = drm_atomic_debugfs_init(minor);
                if (ret) {
                        DRM_ERROR("Failed to create atomic debugfs files\n");
index 4b0dd20..16ec93b 100644
@@ -2370,7 +2370,6 @@ static int drm_pick_crtcs(struct drm_fb_helper *fb_helper,
 {
        int c, o;
        struct drm_connector *connector;
-       const struct drm_connector_helper_funcs *connector_funcs;
        int my_score, best_score, score;
        struct drm_fb_helper_crtc **crtcs, *crtc;
        struct drm_fb_helper_connector *fb_helper_conn;
@@ -2399,8 +2398,6 @@ static int drm_pick_crtcs(struct drm_fb_helper *fb_helper,
        if (drm_has_preferred_mode(fb_helper_conn, width, height))
                my_score++;
 
-       connector_funcs = connector->helper_private;
-
        /*
         * select a crtc for this connector and then attempt to configure
         * remaining connectors
index 72afa51..94c1089 100644
@@ -3210,6 +3210,7 @@ static int init_bxt_mmio_info(struct intel_gvt *gvt)
        MMIO_D(BXT_DSI_PLL_ENABLE, D_BXT);
 
        MMIO_D(GEN9_CLKGATE_DIS_0, D_BXT);
+       MMIO_D(GEN9_CLKGATE_DIS_4, D_BXT);
 
        MMIO_D(HSW_TVIDEO_DIP_GCP(TRANSCODER_A), D_BXT);
        MMIO_D(HSW_TVIDEO_DIP_GCP(TRANSCODER_B), D_BXT);
index c7afee3..9ad89e3 100644
@@ -1833,6 +1833,8 @@ static bool kvmgt_is_valid_gfn(unsigned long handle, unsigned long gfn)
 {
        struct kvmgt_guest_info *info;
        struct kvm *kvm;
+       int idx;
+       bool ret;
 
        if (!handle_valid(handle))
                return false;
@@ -1840,8 +1842,11 @@ static bool kvmgt_is_valid_gfn(unsigned long handle, unsigned long gfn)
        info = (struct kvmgt_guest_info *)handle;
        kvm = info->kvm;
 
-       return kvm_is_visible_gfn(kvm, gfn);
+       idx = srcu_read_lock(&kvm->srcu);
+       ret = kvm_is_visible_gfn(kvm, gfn);
+       srcu_read_unlock(&kvm->srcu, idx);
 
+       return ret;
 }
 
 struct intel_gvt_mpt kvmgt_mpt = {
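
kvm_is_visible_gfn() walks the SRCU-protected memslots, hence the new bracketing; the general read-side pattern is:

	/* Sketch: bracket any memslot access with kvm->srcu. */
	int idx = srcu_read_lock(&kvm->srcu);
	/* ... look up the gfn in kvm->memslots ... */
	srcu_read_unlock(&kvm->srcu, idx);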
index 9943660..9bb9a85 100644
@@ -244,6 +244,34 @@ void intel_vgpu_reset_mmio(struct intel_vgpu *vgpu, bool dmlr)
 
                /* set bits 0:2 (Core C-State) to C0 */
                vgpu_vreg_t(vgpu, GEN6_GT_CORE_STATUS) = 0;
+
+               if (IS_BROXTON(vgpu->gvt->dev_priv)) {
+                       vgpu_vreg_t(vgpu, BXT_P_CR_GT_DISP_PWRON) &=
+                                   ~(BIT(0) | BIT(1));
+                       vgpu_vreg_t(vgpu, BXT_PORT_CL1CM_DW0(DPIO_PHY0)) &=
+                                   ~PHY_POWER_GOOD;
+                       vgpu_vreg_t(vgpu, BXT_PORT_CL1CM_DW0(DPIO_PHY1)) &=
+                                   ~PHY_POWER_GOOD;
+                       vgpu_vreg_t(vgpu, BXT_PHY_CTL_FAMILY(DPIO_PHY0)) &=
+                                   ~BIT(30);
+                       vgpu_vreg_t(vgpu, BXT_PHY_CTL_FAMILY(DPIO_PHY1)) &=
+                                   ~BIT(30);
+                       vgpu_vreg_t(vgpu, BXT_PHY_CTL(PORT_A)) &=
+                                   ~BXT_PHY_LANE_ENABLED;
+                       vgpu_vreg_t(vgpu, BXT_PHY_CTL(PORT_A)) |=
+                                   BXT_PHY_CMNLANE_POWERDOWN_ACK |
+                                   BXT_PHY_LANE_POWERDOWN_ACK;
+                       vgpu_vreg_t(vgpu, BXT_PHY_CTL(PORT_B)) &=
+                                   ~BXT_PHY_LANE_ENABLED;
+                       vgpu_vreg_t(vgpu, BXT_PHY_CTL(PORT_B)) |=
+                                   BXT_PHY_CMNLANE_POWERDOWN_ACK |
+                                   BXT_PHY_LANE_POWERDOWN_ACK;
+                       vgpu_vreg_t(vgpu, BXT_PHY_CTL(PORT_C)) &=
+                                   ~BXT_PHY_LANE_ENABLED;
+                       vgpu_vreg_t(vgpu, BXT_PHY_CTL(PORT_C)) |=
+                                   BXT_PHY_CMNLANE_POWERDOWN_ACK |
+                                   BXT_PHY_LANE_POWERDOWN_ACK;
+               }
        } else {
 #define GVT_GEN8_MMIO_RESET_OFFSET             (0x44200)
                /* only reset the engine related, so starting with 0x44200
index a4e8e3c..c628be0 100644
@@ -281,6 +281,7 @@ void intel_gvt_destroy_vgpu(struct intel_vgpu *vgpu)
        intel_vgpu_clean_submission(vgpu);
        intel_vgpu_clean_display(vgpu);
        intel_vgpu_clean_opregion(vgpu);
+       intel_vgpu_reset_ggtt(vgpu, true);
        intel_vgpu_clean_gtt(vgpu);
        intel_gvt_hypervisor_detach_vgpu(vgpu);
        intel_vgpu_free_resource(vgpu);
index a534b22..5fa0441 100644
@@ -111,7 +111,8 @@ static int vexpress_muxfpga_probe(struct platform_device *pdev)
 }
 
 static const struct of_device_id vexpress_muxfpga_match[] = {
-       { .compatible = "arm,vexpress-muxfpga", }
+       { .compatible = "arm,vexpress-muxfpga", },
+       {}
 };
 
 static struct platform_driver vexpress_muxfpga_driver = {
index dd19d67..8b0cd08 100644
@@ -418,7 +418,6 @@ static const struct of_device_id sun4i_drv_of_table[] = {
        { .compatible = "allwinner,sun8i-a33-display-engine" },
        { .compatible = "allwinner,sun8i-a83t-display-engine" },
        { .compatible = "allwinner,sun8i-h3-display-engine" },
-       { .compatible = "allwinner,sun8i-r40-display-engine" },
        { .compatible = "allwinner,sun8i-v3s-display-engine" },
        { .compatible = "allwinner,sun9i-a80-display-engine" },
        { }
index 82502b3..a564b5d 100644
@@ -398,7 +398,6 @@ static struct regmap_config sun8i_hdmi_phy_regmap_config = {
 
 static const struct sun8i_hdmi_phy_variant sun50i_a64_hdmi_phy = {
        .has_phy_clk = true,
-       .has_second_pll = true,
        .phy_init = &sun8i_hdmi_phy_init_h3,
        .phy_disable = &sun8i_hdmi_phy_disable_h3,
        .phy_config = &sun8i_hdmi_phy_config_h3,
index fc37136..cb65b0e 100644
@@ -545,22 +545,6 @@ static const struct sun8i_mixer_cfg sun8i_h3_mixer0_cfg = {
        .vi_num         = 1,
 };
 
-static const struct sun8i_mixer_cfg sun8i_r40_mixer0_cfg = {
-       .ccsc           = 0,
-       .mod_rate       = 297000000,
-       .scaler_mask    = 0xf,
-       .ui_num         = 3,
-       .vi_num         = 1,
-};
-
-static const struct sun8i_mixer_cfg sun8i_r40_mixer1_cfg = {
-       .ccsc           = 1,
-       .mod_rate       = 297000000,
-       .scaler_mask    = 0x3,
-       .ui_num         = 1,
-       .vi_num         = 1,
-};
-
 static const struct sun8i_mixer_cfg sun8i_v3s_mixer_cfg = {
        .vi_num = 2,
        .ui_num = 1,
@@ -583,14 +567,6 @@ static const struct of_device_id sun8i_mixer_of_table[] = {
                .data = &sun8i_h3_mixer0_cfg,
        },
        {
-               .compatible = "allwinner,sun8i-r40-de2-mixer-0",
-               .data = &sun8i_r40_mixer0_cfg,
-       },
-       {
-               .compatible = "allwinner,sun8i-r40-de2-mixer-1",
-               .data = &sun8i_r40_mixer1_cfg,
-       },
-       {
                .compatible = "allwinner,sun8i-v3s-de2-mixer",
                .data = &sun8i_v3s_mixer_cfg,
        },
index 55fe398..d5240b7 100644
@@ -253,7 +253,6 @@ static int sun8i_tcon_top_remove(struct platform_device *pdev)
 
 /* sun4i_drv uses this list to check if a device node is a TCON TOP */
 const struct of_device_id sun8i_tcon_top_of_table[] = {
-       { .compatible = "allwinner,sun8i-r40-tcon-top" },
        { /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(of, sun8i_tcon_top_of_table);
index dbb62f6..dd9ffde 100644
@@ -432,9 +432,11 @@ static void udl_fbdev_destroy(struct drm_device *dev,
 {
        drm_fb_helper_unregister_fbi(&ufbdev->helper);
        drm_fb_helper_fini(&ufbdev->helper);
-       drm_framebuffer_unregister_private(&ufbdev->ufb.base);
-       drm_framebuffer_cleanup(&ufbdev->ufb.base);
-       drm_gem_object_put_unlocked(&ufbdev->ufb.obj->base);
+       if (ufbdev->ufb.obj) {
+               drm_framebuffer_unregister_private(&ufbdev->ufb.base);
+               drm_framebuffer_cleanup(&ufbdev->ufb.base);
+               drm_gem_object_put_unlocked(&ufbdev->ufb.obj->base);
+       }
 }
 
 int udl_fbdev_init(struct drm_device *dev)
index cfb50fe..a3275fa 100644
@@ -297,6 +297,9 @@ static int vc4_plane_setup_clipping_and_scaling(struct drm_plane_state *state)
        vc4_state->y_scaling[0] = vc4_get_scaling_mode(vc4_state->src_h[0],
                                                       vc4_state->crtc_h);
 
+       vc4_state->is_unity = (vc4_state->x_scaling[0] == VC4_SCALING_NONE &&
+                              vc4_state->y_scaling[0] == VC4_SCALING_NONE);
+
        if (num_planes > 1) {
                vc4_state->is_yuv = true;
 
@@ -312,24 +315,17 @@ static int vc4_plane_setup_clipping_and_scaling(struct drm_plane_state *state)
                        vc4_get_scaling_mode(vc4_state->src_h[1],
                                             vc4_state->crtc_h);
 
-               /* YUV conversion requires that scaling be enabled,
-                * even on a plane that's otherwise 1:1.  Choose TPZ
-                * for simplicity.
+               /* YUV conversion requires that horizontal scaling be enabled,
+                * even on a plane that's otherwise 1:1. Looks like only PPF
+                * works in that case, so let's pick that one.
                 */
-               if (vc4_state->x_scaling[0] == VC4_SCALING_NONE)
-                       vc4_state->x_scaling[0] = VC4_SCALING_TPZ;
-               if (vc4_state->y_scaling[0] == VC4_SCALING_NONE)
-                       vc4_state->y_scaling[0] = VC4_SCALING_TPZ;
+               if (vc4_state->is_unity)
+                       vc4_state->x_scaling[0] = VC4_SCALING_PPF;
        } else {
                vc4_state->x_scaling[1] = VC4_SCALING_NONE;
                vc4_state->y_scaling[1] = VC4_SCALING_NONE;
        }
 
-       vc4_state->is_unity = (vc4_state->x_scaling[0] == VC4_SCALING_NONE &&
-                              vc4_state->y_scaling[0] == VC4_SCALING_NONE &&
-                              vc4_state->x_scaling[1] == VC4_SCALING_NONE &&
-                              vc4_state->y_scaling[1] == VC4_SCALING_NONE);
-
        /* No configuring scaling on the cursor plane, since it gets
           non-vblank-synced updates, and scaling requires
           LBM changes which have to be vblank-synced.
@@ -672,7 +668,10 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
                vc4_dlist_write(vc4_state, SCALER_CSC2_ITR_R_601_5);
        }
 
-       if (!vc4_state->is_unity) {
+       if (vc4_state->x_scaling[0] != VC4_SCALING_NONE ||
+           vc4_state->x_scaling[1] != VC4_SCALING_NONE ||
+           vc4_state->y_scaling[0] != VC4_SCALING_NONE ||
+           vc4_state->y_scaling[1] != VC4_SCALING_NONE) {
                /* LBM Base Address. */
                if (vc4_state->y_scaling[0] != VC4_SCALING_NONE ||
                    vc4_state->y_scaling[1] != VC4_SCALING_NONE) {
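
A minimal sketch of the LBM condition change in the vc4 hunk above: is_unity now describes only plane 0, so the LBM allocation is keyed on whether any of the four scalers is active instead. The enum and helper below are illustrative stand-ins, not the driver's own types.

#include <stdbool.h>
#include <stdio.h>

enum scaling_mode { SCALING_NONE, SCALING_TPZ, SCALING_PPF };

static bool needs_lbm(enum scaling_mode x[2], enum scaling_mode y[2])
{
        /* LBM is needed as soon as one scaler on either plane is active */
        return x[0] != SCALING_NONE || x[1] != SCALING_NONE ||
               y[0] != SCALING_NONE || y[1] != SCALING_NONE;
}

int main(void)
{
        enum scaling_mode x[2] = { SCALING_NONE, SCALING_PPF };
        enum scaling_mode y[2] = { SCALING_NONE, SCALING_NONE };

        /* YUV chroma plane scaled -> LBM required even though luma is 1:1 */
        printf("needs LBM: %d\n", needs_lbm(x, y));
        return 0;
}
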
index 1f13457..f0ab6b2 100644 (file)
@@ -3729,7 +3729,7 @@ int vmw_validate_single_buffer(struct vmw_private *dev_priv,
 {
        struct vmw_buffer_object *vbo =
                container_of(bo, struct vmw_buffer_object, base);
-       struct ttm_operation_ctx ctx = { interruptible, true };
+       struct ttm_operation_ctx ctx = { interruptible, false };
        int ret;
 
        if (vbo->pin_count > 0)
index 23beff5..6a712a8 100644 (file)
@@ -1512,21 +1512,19 @@ static int vmw_kms_check_display_memory(struct drm_device *dev,
                                        struct drm_rect *rects)
 {
        struct vmw_private *dev_priv = vmw_priv(dev);
-       struct drm_mode_config *mode_config = &dev->mode_config;
        struct drm_rect bounding_box = {0};
        u64 total_pixels = 0, pixel_mem, bb_mem;
        int i;
 
        for (i = 0; i < num_rects; i++) {
                /*
-                * Currently this check is limiting the topology within max
-                * texture/screentarget size. This should change in future when
-                * user-space support multiple fb with topology.
+                * For STDU, only an individual screen (screen target) is
+                * limited by the SCREENTARGET_MAX_WIDTH/HEIGHT registers.
                 */
-               if (rects[i].x1 < 0 ||  rects[i].y1 < 0 ||
-                   rects[i].x2 > mode_config->max_width ||
-                   rects[i].y2 > mode_config->max_height) {
-                       DRM_ERROR("Invalid GUI layout.\n");
+               if (dev_priv->active_display_unit == vmw_du_screen_target &&
+                   (drm_rect_width(&rects[i]) > dev_priv->stdu_max_width ||
+                    drm_rect_height(&rects[i]) > dev_priv->stdu_max_height)) {
+                       DRM_ERROR("Screen size not supported.\n");
                        return -EINVAL;
                }
 
@@ -1615,7 +1613,7 @@ static int vmw_kms_check_topology(struct drm_device *dev,
                struct drm_connector_state *conn_state;
                struct vmw_connector_state *vmw_conn_state;
 
-               if (!new_crtc_state->enable && old_crtc_state->enable) {
+               if (!new_crtc_state->enable) {
                        rects[i].x1 = 0;
                        rects[i].y1 = 0;
                        rects[i].x2 = 0;
@@ -2216,12 +2214,16 @@ int vmw_du_connector_fill_modes(struct drm_connector *connector,
        if (dev_priv->assume_16bpp)
                assumed_bpp = 2;
 
+       max_width  = min(max_width,  dev_priv->texture_max_width);
+       max_height = min(max_height, dev_priv->texture_max_height);
+
+       /*
+        * For STDU there is an extra per-mode limit, set by the
+        * SVGA_REG_SCREENTARGET_MAX_WIDTH/HEIGHT registers.
+        */
        if (dev_priv->active_display_unit == vmw_du_screen_target) {
                max_width  = min(max_width,  dev_priv->stdu_max_width);
-               max_width  = min(max_width,  dev_priv->texture_max_width);
-
                max_height = min(max_height, dev_priv->stdu_max_height);
-               max_height = min(max_height, dev_priv->texture_max_height);
        }
 
        /* Add preferred mode */
@@ -2376,6 +2378,7 @@ int vmw_kms_update_layout_ioctl(struct drm_device *dev, void *data,
                                struct drm_file *file_priv)
 {
        struct vmw_private *dev_priv = vmw_priv(dev);
+       struct drm_mode_config *mode_config = &dev->mode_config;
        struct drm_vmw_update_layout_arg *arg =
                (struct drm_vmw_update_layout_arg *)data;
        void __user *user_rects;
@@ -2421,6 +2424,21 @@ int vmw_kms_update_layout_ioctl(struct drm_device *dev, void *data,
                drm_rects[i].y1 = curr_rect.y;
                drm_rects[i].x2 = curr_rect.x + curr_rect.w;
                drm_rects[i].y2 = curr_rect.y + curr_rect.h;
+
+               /*
+                * Currently this check limits the topology to
+                * mode_config->max (which is actually the maximum texture
+                * size supported by the virtual device). The limit is here
+                * to cope with window managers that create one big
+                * framebuffer for the whole topology.
+                */
+               if (drm_rects[i].x1 < 0 ||  drm_rects[i].y1 < 0 ||
+                   drm_rects[i].x2 > mode_config->max_width ||
+                   drm_rects[i].y2 > mode_config->max_height) {
+                       DRM_ERROR("Invalid GUI layout.\n");
+                       ret = -EINVAL;
+                       goto out_free;
+               }
        }
 
        ret = vmw_kms_check_display_memory(dev, arg->num_outputs, drm_rects);
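
The hunk above moves the per-rect bounds check into vmw_kms_update_layout_ioctl. A standalone sketch of that validation, assuming a simplified rect type in place of struct drm_rect:

#include <stdbool.h>
#include <stdio.h>

struct rect { int x1, y1, x2, y2; };

static bool rect_fits(const struct rect *r, int max_w, int max_h)
{
        /* reject rects poking outside [0, max_w] x [0, max_h] */
        return r->x1 >= 0 && r->y1 >= 0 && r->x2 <= max_w && r->y2 <= max_h;
}

int main(void)
{
        struct rect gui = { .x1 = 0, .y1 = 0, .x2 = 8192, .y2 = 4096 };

        printf("layout valid: %d\n", rect_fits(&gui, 8192, 8192));
        return 0;
}
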
index 93f6b96..f30e839 100644 (file)
@@ -1600,31 +1600,6 @@ int vmw_kms_stdu_init_display(struct vmw_private *dev_priv)
 
        dev_priv->active_display_unit = vmw_du_screen_target;
 
-       if (dev_priv->capabilities & SVGA_CAP_3D) {
-               /*
-                * For 3D VMs, display (scanout) buffer size is the smaller of
-                * max texture and max STDU
-                */
-               uint32_t max_width, max_height;
-
-               max_width = min(dev_priv->texture_max_width,
-                               dev_priv->stdu_max_width);
-               max_height = min(dev_priv->texture_max_height,
-                                dev_priv->stdu_max_height);
-
-               dev->mode_config.max_width = max_width;
-               dev->mode_config.max_height = max_height;
-       } else {
-               /*
-                * Given various display aspect ratios, there's no way to
-                * estimate these using prim_bb_mem.  So just set these to
-                * something arbitrarily large and we will reject any layout
-                * that doesn't fit prim_bb_mem later
-                */
-               dev->mode_config.max_width = 8192;
-               dev->mode_config.max_height = 8192;
-       }
-
        vmw_kms_create_implicit_placement_property(dev_priv, false);
 
        for (i = 0; i < VMWGFX_NUM_DISPLAY_UNITS; ++i) {
index e125233..80a01cd 100644 (file)
@@ -1404,22 +1404,17 @@ int vmw_surface_gb_priv_define(struct drm_device *dev,
        *srf_out = NULL;
 
        if (for_scanout) {
-               uint32_t max_width, max_height;
-
                if (!svga3dsurface_is_screen_target_format(format)) {
                        DRM_ERROR("Invalid Screen Target surface format.");
                        return -EINVAL;
                }
 
-               max_width = min(dev_priv->texture_max_width,
-                               dev_priv->stdu_max_width);
-               max_height = min(dev_priv->texture_max_height,
-                                dev_priv->stdu_max_height);
-
-               if (size.width > max_width || size.height > max_height) {
+               if (size.width > dev_priv->texture_max_width ||
+                   size.height > dev_priv->texture_max_height) {
                        DRM_ERROR("%ux%u\n, exceeds max surface size %ux%u",
                                  size.width, size.height,
-                                 max_width, max_height);
+                                 dev_priv->texture_max_width,
+                                 dev_priv->texture_max_height);
                        return -EINVAL;
                }
        } else {
@@ -1495,8 +1490,17 @@ int vmw_surface_gb_priv_define(struct drm_device *dev,
        if (srf->flags & SVGA3D_SURFACE_BIND_STREAM_OUTPUT)
                srf->res.backup_size += sizeof(SVGA3dDXSOState);
 
+       /*
+        * Don't set the SVGA3D_SURFACE_SCREENTARGET flag for a scanout
+        * surface whose size exceeds the STDU max width/height. This is
+        * really a workaround to support the creation of a big framebuffer,
+        * requested by some user-space for the whole topology. That big
+        * framebuffer won't really be bound to a screen target: during
+        * prepare_fb a separate surface is created, so it's safe to omit
+        * the SVGA3D_SURFACE_SCREENTARGET flag here.
+        */
        if (dev_priv->active_display_unit == vmw_du_screen_target &&
-           for_scanout)
+           for_scanout && size.width <= dev_priv->stdu_max_width &&
+           size.height <= dev_priv->stdu_max_height)
                srf->flags |= SVGA3D_SURFACE_SCREENTARGET;
 
        /*
index a96bf46..cf2a185 100644 (file)
@@ -215,6 +215,8 @@ static void vga_switcheroo_enable(void)
                        return;
 
                client->id = ret | ID_BIT_AUDIO;
+               if (client->ops->gpu_bound)
+                       client->ops->gpu_bound(client->pdev, ret);
        }
 
        vga_switcheroo_debugfs_init(&vgasr_priv);
index 944f5b6..78603b7 100644 (file)
@@ -207,8 +207,6 @@ superio_exit(int ioreg)
 
 #define NUM_FAN                7
 
-#define TEMP_SOURCE_VIRTUAL    0x1f
-
 /* Common and NCT6775 specific data */
 
 /* Voltage min/max registers for nr=7..14 are in bank 5 */
@@ -299,8 +297,9 @@ static const u16 NCT6775_REG_PWM_READ[] = {
 
 static const u16 NCT6775_REG_FAN[] = { 0x630, 0x632, 0x634, 0x636, 0x638 };
 static const u16 NCT6775_REG_FAN_MIN[] = { 0x3b, 0x3c, 0x3d };
-static const u16 NCT6775_REG_FAN_PULSES[] = { 0x641, 0x642, 0x643, 0x644, 0 };
-static const u16 NCT6775_FAN_PULSE_SHIFT[] = { 0, 0, 0, 0, 0, 0 };
+static const u16 NCT6775_REG_FAN_PULSES[NUM_FAN] = {
+       0x641, 0x642, 0x643, 0x644 };
+static const u16 NCT6775_FAN_PULSE_SHIFT[NUM_FAN] = { };
 
 static const u16 NCT6775_REG_TEMP[] = {
        0x27, 0x150, 0x250, 0x62b, 0x62c, 0x62d };
@@ -373,6 +372,7 @@ static const char *const nct6775_temp_label[] = {
 };
 
 #define NCT6775_TEMP_MASK      0x001ffffe
+#define NCT6775_VIRT_TEMP_MASK 0x00000000
 
 static const u16 NCT6775_REG_TEMP_ALTERNATE[32] = {
        [13] = 0x661,
@@ -425,8 +425,8 @@ static const u8 NCT6776_PWM_MODE_MASK[] = { 0x01, 0, 0, 0, 0, 0 };
 
 static const u16 NCT6776_REG_FAN_MIN[] = {
        0x63a, 0x63c, 0x63e, 0x640, 0x642, 0x64a, 0x64c };
-static const u16 NCT6776_REG_FAN_PULSES[] = {
-       0x644, 0x645, 0x646, 0x647, 0x648, 0x649, 0 };
+static const u16 NCT6776_REG_FAN_PULSES[NUM_FAN] = {
+       0x644, 0x645, 0x646, 0x647, 0x648, 0x649 };
 
 static const u16 NCT6776_REG_WEIGHT_DUTY_BASE[] = {
        0x13e, 0x23e, 0x33e, 0x83e, 0x93e, 0xa3e };
@@ -461,6 +461,7 @@ static const char *const nct6776_temp_label[] = {
 };
 
 #define NCT6776_TEMP_MASK      0x007ffffe
+#define NCT6776_VIRT_TEMP_MASK 0x00000000
 
 static const u16 NCT6776_REG_TEMP_ALTERNATE[32] = {
        [14] = 0x401,
@@ -501,9 +502,9 @@ static const s8 NCT6779_BEEP_BITS[] = {
        30, 31 };                       /* intrusion0, intrusion1 */
 
 static const u16 NCT6779_REG_FAN[] = {
-       0x4b0, 0x4b2, 0x4b4, 0x4b6, 0x4b8, 0x4ba, 0x660 };
-static const u16 NCT6779_REG_FAN_PULSES[] = {
-       0x644, 0x645, 0x646, 0x647, 0x648, 0x649, 0 };
+       0x4c0, 0x4c2, 0x4c4, 0x4c6, 0x4c8, 0x4ca, 0x4ce };
+static const u16 NCT6779_REG_FAN_PULSES[NUM_FAN] = {
+       0x644, 0x645, 0x646, 0x647, 0x648, 0x649 };
 
 static const u16 NCT6779_REG_CRITICAL_PWM_ENABLE[] = {
        0x136, 0x236, 0x336, 0x836, 0x936, 0xa36, 0xb36 };
@@ -559,7 +560,9 @@ static const char *const nct6779_temp_label[] = {
 };
 
 #define NCT6779_TEMP_MASK      0x07ffff7e
+#define NCT6779_VIRT_TEMP_MASK 0x00000000
 #define NCT6791_TEMP_MASK      0x87ffff7e
+#define NCT6791_VIRT_TEMP_MASK 0x80000000
 
 static const u16 NCT6779_REG_TEMP_ALTERNATE[32]
        = { 0x490, 0x491, 0x492, 0x493, 0x494, 0x495, 0, 0,
@@ -638,6 +641,7 @@ static const char *const nct6792_temp_label[] = {
 };
 
 #define NCT6792_TEMP_MASK      0x9fffff7e
+#define NCT6792_VIRT_TEMP_MASK 0x80000000
 
 static const char *const nct6793_temp_label[] = {
        "",
@@ -675,6 +679,7 @@ static const char *const nct6793_temp_label[] = {
 };
 
 #define NCT6793_TEMP_MASK      0xbfff037e
+#define NCT6793_VIRT_TEMP_MASK 0x80000000
 
 static const char *const nct6795_temp_label[] = {
        "",
@@ -712,6 +717,7 @@ static const char *const nct6795_temp_label[] = {
 };
 
 #define NCT6795_TEMP_MASK      0xbfffff7e
+#define NCT6795_VIRT_TEMP_MASK 0x80000000
 
 static const char *const nct6796_temp_label[] = {
        "",
@@ -724,8 +730,8 @@ static const char *const nct6796_temp_label[] = {
        "AUXTIN4",
        "SMBUSMASTER 0",
        "SMBUSMASTER 1",
-       "",
-       "",
+       "Virtual_TEMP",
+       "Virtual_TEMP",
        "",
        "",
        "",
@@ -748,7 +754,8 @@ static const char *const nct6796_temp_label[] = {
        "Virtual_TEMP"
 };
 
-#define NCT6796_TEMP_MASK      0xbfff03fe
+#define NCT6796_TEMP_MASK      0xbfff0ffe
+#define NCT6796_VIRT_TEMP_MASK 0x80000c00
 
 /* NCT6102D/NCT6106D specific data */
 
@@ -779,8 +786,8 @@ static const u16 NCT6106_REG_TEMP_CONFIG[] = {
 
 static const u16 NCT6106_REG_FAN[] = { 0x20, 0x22, 0x24 };
 static const u16 NCT6106_REG_FAN_MIN[] = { 0xe0, 0xe2, 0xe4 };
-static const u16 NCT6106_REG_FAN_PULSES[] = { 0xf6, 0xf6, 0xf6, 0, 0 };
-static const u16 NCT6106_FAN_PULSE_SHIFT[] = { 0, 2, 4, 0, 0 };
+static const u16 NCT6106_REG_FAN_PULSES[] = { 0xf6, 0xf6, 0xf6 };
+static const u16 NCT6106_FAN_PULSE_SHIFT[] = { 0, 2, 4 };
 
 static const u8 NCT6106_REG_PWM_MODE[] = { 0xf3, 0xf3, 0xf3 };
 static const u8 NCT6106_PWM_MODE_MASK[] = { 0x01, 0x02, 0x04 };
@@ -917,6 +924,11 @@ static unsigned int fan_from_reg16(u16 reg, unsigned int divreg)
        return 1350000U / (reg << divreg);
 }
 
+static unsigned int fan_from_reg_rpm(u16 reg, unsigned int divreg)
+{
+       /* The register already reports RPM directly; divreg is unused but
+        * kept so the function matches the fan_from_reg callback signature.
+        */
+       return reg;
+}
+
 static u16 fan_to_reg(u32 fan, unsigned int divreg)
 {
        if (!fan)
@@ -969,6 +981,7 @@ struct nct6775_data {
        u16 reg_temp_config[NUM_TEMP];
        const char * const *temp_label;
        u32 temp_mask;
+       u32 virt_temp_mask;
 
        u16 REG_CONFIG;
        u16 REG_VBAT;
@@ -1276,11 +1289,11 @@ static bool is_word_sized(struct nct6775_data *data, u16 reg)
        case nct6795:
        case nct6796:
                return reg == 0x150 || reg == 0x153 || reg == 0x155 ||
-                 ((reg & 0xfff0) == 0x4b0 && (reg & 0x000f) < 0x0b) ||
+                 (reg & 0xfff0) == 0x4c0 ||
                  reg == 0x402 ||
                  reg == 0x63a || reg == 0x63c || reg == 0x63e ||
                  reg == 0x640 || reg == 0x642 || reg == 0x64a ||
-                 reg == 0x64c || reg == 0x660 ||
+                 reg == 0x64c ||
                  reg == 0x73 || reg == 0x75 || reg == 0x77 || reg == 0x79 ||
                  reg == 0x7b || reg == 0x7d;
        }
@@ -1558,7 +1571,7 @@ static void nct6775_update_pwm(struct device *dev)
                reg = nct6775_read_value(data, data->REG_WEIGHT_TEMP_SEL[i]);
                data->pwm_weight_temp_sel[i] = reg & 0x1f;
                /* If weight is disabled, report weight source as 0 */
-               if (j == 1 && !(reg & 0x80))
+               if (!(reg & 0x80))
                        data->pwm_weight_temp_sel[i] = 0;
 
                /* Weight temp data */
@@ -1682,9 +1695,13 @@ static struct nct6775_data *nct6775_update_device(struct device *dev)
                        if (data->has_fan_min & BIT(i))
                                data->fan_min[i] = nct6775_read_value(data,
                                           data->REG_FAN_MIN[i]);
-                       data->fan_pulses[i] =
-                         (nct6775_read_value(data, data->REG_FAN_PULSES[i])
-                               >> data->FAN_PULSE_SHIFT[i]) & 0x03;
+
+                       if (data->REG_FAN_PULSES[i]) {
+                               data->fan_pulses[i] =
+                                 (nct6775_read_value(data,
+                                                     data->REG_FAN_PULSES[i])
+                                  >> data->FAN_PULSE_SHIFT[i]) & 0x03;
+                       }
 
                        nct6775_select_fan_div(dev, data, i, reg);
                }
@@ -3639,6 +3656,7 @@ static int nct6775_probe(struct platform_device *pdev)
 
                data->temp_label = nct6776_temp_label;
                data->temp_mask = NCT6776_TEMP_MASK;
+               data->virt_temp_mask = NCT6776_VIRT_TEMP_MASK;
 
                data->REG_VBAT = NCT6106_REG_VBAT;
                data->REG_DIODE = NCT6106_REG_DIODE;
@@ -3717,6 +3735,7 @@ static int nct6775_probe(struct platform_device *pdev)
 
                data->temp_label = nct6775_temp_label;
                data->temp_mask = NCT6775_TEMP_MASK;
+               data->virt_temp_mask = NCT6775_VIRT_TEMP_MASK;
 
                data->REG_CONFIG = NCT6775_REG_CONFIG;
                data->REG_VBAT = NCT6775_REG_VBAT;
@@ -3789,6 +3808,7 @@ static int nct6775_probe(struct platform_device *pdev)
 
                data->temp_label = nct6776_temp_label;
                data->temp_mask = NCT6776_TEMP_MASK;
+               data->virt_temp_mask = NCT6776_VIRT_TEMP_MASK;
 
                data->REG_CONFIG = NCT6775_REG_CONFIG;
                data->REG_VBAT = NCT6775_REG_VBAT;
@@ -3853,7 +3873,7 @@ static int nct6775_probe(struct platform_device *pdev)
                data->ALARM_BITS = NCT6779_ALARM_BITS;
                data->BEEP_BITS = NCT6779_BEEP_BITS;
 
-               data->fan_from_reg = fan_from_reg13;
+               data->fan_from_reg = fan_from_reg_rpm;
                data->fan_from_reg_min = fan_from_reg13;
                data->target_temp_mask = 0xff;
                data->tolerance_mask = 0x07;
@@ -3861,6 +3881,7 @@ static int nct6775_probe(struct platform_device *pdev)
 
                data->temp_label = nct6779_temp_label;
                data->temp_mask = NCT6779_TEMP_MASK;
+               data->virt_temp_mask = NCT6779_VIRT_TEMP_MASK;
 
                data->REG_CONFIG = NCT6775_REG_CONFIG;
                data->REG_VBAT = NCT6775_REG_VBAT;
@@ -3933,7 +3954,7 @@ static int nct6775_probe(struct platform_device *pdev)
                data->ALARM_BITS = NCT6791_ALARM_BITS;
                data->BEEP_BITS = NCT6779_BEEP_BITS;
 
-               data->fan_from_reg = fan_from_reg13;
+               data->fan_from_reg = fan_from_reg_rpm;
                data->fan_from_reg_min = fan_from_reg13;
                data->target_temp_mask = 0xff;
                data->tolerance_mask = 0x07;
@@ -3944,22 +3965,27 @@ static int nct6775_probe(struct platform_device *pdev)
                case nct6791:
                        data->temp_label = nct6779_temp_label;
                        data->temp_mask = NCT6791_TEMP_MASK;
+                       data->virt_temp_mask = NCT6791_VIRT_TEMP_MASK;
                        break;
                case nct6792:
                        data->temp_label = nct6792_temp_label;
                        data->temp_mask = NCT6792_TEMP_MASK;
+                       data->virt_temp_mask = NCT6792_VIRT_TEMP_MASK;
                        break;
                case nct6793:
                        data->temp_label = nct6793_temp_label;
                        data->temp_mask = NCT6793_TEMP_MASK;
+                       data->virt_temp_mask = NCT6793_VIRT_TEMP_MASK;
                        break;
                case nct6795:
                        data->temp_label = nct6795_temp_label;
                        data->temp_mask = NCT6795_TEMP_MASK;
+                       data->virt_temp_mask = NCT6795_VIRT_TEMP_MASK;
                        break;
                case nct6796:
                        data->temp_label = nct6796_temp_label;
                        data->temp_mask = NCT6796_TEMP_MASK;
+                       data->virt_temp_mask = NCT6796_VIRT_TEMP_MASK;
                        break;
                }
 
@@ -4143,7 +4169,7 @@ static int nct6775_probe(struct platform_device *pdev)
                 * for each fan reflects a different temperature, and there
                 * are no duplicates.
                 */
-               if (src != TEMP_SOURCE_VIRTUAL) {
+               if (!(data->virt_temp_mask & BIT(src))) {
                        if (mask & BIT(src))
                                continue;
                        mask |= BIT(src);
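
The probe hunk above replaces the single TEMP_SOURCE_VIRTUAL value with a per-chip virt_temp_mask, so several fans may share a virtual temperature source while real sources are still deduplicated. A small sketch using the NCT6791 masks from this diff; the source list is hypothetical:

#include <stdint.h>
#include <stdio.h>

#define BIT(n) (1U << (n))

/* per-chip masks, values taken from the hunk above */
#define NCT6791_TEMP_MASK      0x87ffff7e
#define NCT6791_VIRT_TEMP_MASK 0x80000000

int main(void)
{
        uint32_t seen = 0;
        unsigned int sources[] = { 1, 2, 2, 31, 31 };  /* hypothetical */

        for (unsigned int i = 0; i < 5; i++) {
                unsigned int src = sources[i];

                /* virtual sources may repeat; only real ones deduplicate */
                if (!(NCT6791_VIRT_TEMP_MASK & BIT(src))) {
                        if (seen & BIT(src))
                                continue;       /* duplicate real source */
                        seen |= BIT(src);
                }
                printf("using source %u\n", src);
        }
        return 0;
}
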
index cbfafc4..270d3c9 100644 (file)
@@ -39,13 +39,23 @@ static int m25p80_read_reg(struct spi_nor *nor, u8 code, u8 *val, int len)
        struct spi_mem_op op = SPI_MEM_OP(SPI_MEM_OP_CMD(code, 1),
                                          SPI_MEM_OP_NO_ADDR,
                                          SPI_MEM_OP_NO_DUMMY,
-                                         SPI_MEM_OP_DATA_IN(len, val, 1));
+                                         SPI_MEM_OP_DATA_IN(len, NULL, 1));
+       void *scratchbuf;
        int ret;
 
+       scratchbuf = kmalloc(len, GFP_KERNEL);
+       if (!scratchbuf)
+               return -ENOMEM;
+
+       op.data.buf.in = scratchbuf;
        ret = spi_mem_exec_op(flash->spimem, &op);
        if (ret < 0)
                dev_err(&flash->spimem->spi->dev, "error %d reading %x\n", ret,
                        code);
+       else
+               memcpy(val, scratchbuf, len);
+
+       kfree(scratchbuf);
 
        return ret;
 }
@@ -56,9 +66,19 @@ static int m25p80_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len)
        struct spi_mem_op op = SPI_MEM_OP(SPI_MEM_OP_CMD(opcode, 1),
                                          SPI_MEM_OP_NO_ADDR,
                                          SPI_MEM_OP_NO_DUMMY,
-                                         SPI_MEM_OP_DATA_OUT(len, buf, 1));
+                                         SPI_MEM_OP_DATA_OUT(len, NULL, 1));
+       void *scratchbuf;
+       int ret;
 
-       return spi_mem_exec_op(flash->spimem, &op);
+       scratchbuf = kmemdup(buf, len, GFP_KERNEL);
+       if (!scratchbuf)
+               return -ENOMEM;
+
+       op.data.buf.out = scratchbuf;
+       ret = spi_mem_exec_op(flash->spimem, &op);
+       kfree(scratchbuf);
+
+       return ret;
 }
 
 static ssize_t m25p80_write(struct spi_nor *nor, loff_t to, size_t len,
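
Both register accessors above gain a heap bounce buffer because spi-mem may DMA to or from the data buffer and callers often pass stack variables, which are not DMA-safe. A hedged userspace analog of the read path, with dma_xfer_in() as a stand-in for spi_mem_exec_op():

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* stub standing in for a DMA-capable transfer like spi_mem_exec_op() */
static int dma_xfer_in(void *buf, size_t len)
{
        memset(buf, 0xa5, len);  /* pretend the controller DMA'd bytes in */
        return 0;
}

static int read_reg(uint8_t *val, size_t len)
{
        void *scratch = malloc(len);  /* heap bounce buffer, DMA-safe */
        int ret;

        if (!scratch)
                return -1;

        ret = dma_xfer_in(scratch, len);
        if (ret == 0)
                memcpy(val, scratch, len);  /* copy back to the caller */

        free(scratch);
        return ret;
}

int main(void)
{
        uint8_t id[3];  /* stack buffer: must not be handed to DMA directly */

        if (read_reg(id, sizeof(id)) == 0)
                printf("%02x %02x %02x\n", id[0], id[1], id[2]);
        return 0;
}
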
index 52e2cb3..99c460f 100644 (file)
@@ -873,8 +873,11 @@ static int mtd_part_of_parse(struct mtd_info *master,
        int ret, err = 0;
 
        np = mtd_get_of_node(master);
-       if (!mtd_is_partition(master))
+       if (mtd_is_partition(master))
+               of_node_get(np);
+       else
                np = of_get_child_by_name(np, "partitions");
+
        of_property_for_each_string(np, "compatible", prop, compat) {
                parser = mtd_part_get_compatible_parser(compat);
                if (!parser)
index 67b2065..b864b93 100644 (file)
@@ -596,6 +596,12 @@ static int denali_dma_xfer(struct denali_nand_info *denali, void *buf,
        }
 
        iowrite32(DMA_ENABLE__FLAG, denali->reg + DMA_ENABLE);
+       /*
+        * The ->setup_dma() hook kicks DMA by using the data/command
+        * interface, which belongs to a different AXI port from the
+        * register interface.  Read back the register to avoid a race.
+        */
+       ioread32(denali->reg + DMA_ENABLE);
 
        denali_reset_irq(denali);
        denali->setup_dma(denali, dma_addr, page, write);
index 7af4d62..bc2ef52 100644 (file)
@@ -1547,7 +1547,7 @@ static void marvell_nfc_parse_instructions(struct nand_chip *chip,
        for (op_id = 0; op_id < subop->ninstrs; op_id++) {
                unsigned int offset, naddrs;
                const u8 *addrs;
-               int len = nand_subop_get_data_len(subop, op_id);
+               int len;
 
                instr = &subop->instrs[op_id];
 
@@ -1593,6 +1593,7 @@ static void marvell_nfc_parse_instructions(struct nand_chip *chip,
                                nfc_op->ndcb[0] |=
                                        NDCB0_CMD_XTYPE(XTYPE_MONOLITHIC_RW) |
                                        NDCB0_LEN_OVRD;
+                               len = nand_subop_get_data_len(subop, op_id);
                                nfc_op->ndcb[3] |= round_up(len, FIFO_DEPTH);
                        }
                        nfc_op->data_delay_ns = instr->delay_ns;
@@ -1606,6 +1607,7 @@ static void marvell_nfc_parse_instructions(struct nand_chip *chip,
                                nfc_op->ndcb[0] |=
                                        NDCB0_CMD_XTYPE(XTYPE_MONOLITHIC_RW) |
                                        NDCB0_LEN_OVRD;
+                               len = nand_subop_get_data_len(subop, op_id);
                                nfc_op->ndcb[3] |= round_up(len, FIFO_DEPTH);
                        }
                        nfc_op->data_delay_ns = instr->delay_ns;
index 9375cef..3d27616 100644 (file)
@@ -283,8 +283,12 @@ static int ipddp_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
                 case SIOCFINDIPDDPRT:
                        spin_lock_bh(&ipddp_route_lock);
                        rp = __ipddp_find_route(&rcp);
-                       if (rp)
-                               memcpy(&rcp2, rp, sizeof(rcp2));
+                       if (rp) {
+                               memset(&rcp2, 0, sizeof(rcp2));
+                               rcp2.ip    = rp->ip;
+                               rcp2.at    = rp->at;
+                               rcp2.flags = rp->flags;
+                       }
                        spin_unlock_bh(&ipddp_route_lock);
 
                        if (rp) {
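
The ipddp hunk above replaces a whole-struct memcpy with a memset plus per-field copies, so compiler-inserted padding bytes are never copied out to user space. A userspace demonstration of the pattern; the struct layout is illustrative, not the real ipddp route type:

#include <stdio.h>
#include <string.h>

/* layout chosen so the compiler inserts padding between the fields */
struct route_copy {
        char  at;       /* 1 byte, then padding up to the long */
        long  ip;
        short flags;    /* trailing padding after this, too */
};

int main(void)
{
        struct route_copy leaky, safe;
        struct route_copy src = { .at = 1, .ip = 0x0a000001, .flags = 2 };

        /* memcpy() of the whole struct also copies the padding bytes,
         * which in the kernel may hold stale stack data */
        memcpy(&leaky, &src, sizeof(leaky));

        /* the fix: zero everything, then copy the fields one by one */
        memset(&safe, 0, sizeof(safe));
        safe.ip    = src.ip;
        safe.at    = src.at;
        safe.flags = src.flags;

        printf("payload fields: %zu bytes, struct: %zu bytes\n",
               sizeof(char) + sizeof(long) + sizeof(short), sizeof(src));
        return 0;
}
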
index 7c791c1..bef0133 100644 (file)
 #define MV88E6XXX_G1_ATU_OP_GET_CLR_VIOLATION          0x7000
 #define MV88E6XXX_G1_ATU_OP_AGE_OUT_VIOLATION          BIT(7)
 #define MV88E6XXX_G1_ATU_OP_MEMBER_VIOLATION           BIT(6)
-#define MV88E6XXX_G1_ATU_OP_MISS_VIOLTATION            BIT(5)
+#define MV88E6XXX_G1_ATU_OP_MISS_VIOLATION             BIT(5)
 #define MV88E6XXX_G1_ATU_OP_FULL_VIOLATION             BIT(4)
 
 /* Offset 0x0C: ATU Data Register */
index 3074108..5200e4b 100644 (file)
@@ -349,7 +349,7 @@ static irqreturn_t mv88e6xxx_g1_atu_prob_irq_thread_fn(int irq, void *dev_id)
                chip->ports[entry.portvec].atu_member_violation++;
        }
 
-       if (val & MV88E6XXX_G1_ATU_OP_MEMBER_VIOLATION) {
+       if (val & MV88E6XXX_G1_ATU_OP_MISS_VIOLATION) {
                dev_err_ratelimited(chip->dev,
                                    "ATU miss violation for %pM portvec %x\n",
                                    entry.mac, entry.portvec);
index cecbb1d..177587f 100644 (file)
@@ -8027,7 +8027,7 @@ static int bnxt_change_mac_addr(struct net_device *dev, void *p)
        if (ether_addr_equal(addr->sa_data, dev->dev_addr))
                return 0;
 
-       rc = bnxt_approve_mac(bp, addr->sa_data);
+       rc = bnxt_approve_mac(bp, addr->sa_data, true);
        if (rc)
                return rc;
 
@@ -8827,14 +8827,19 @@ static int bnxt_init_mac_addr(struct bnxt *bp)
        } else {
 #ifdef CONFIG_BNXT_SRIOV
                struct bnxt_vf_info *vf = &bp->vf;
+               bool strict_approval = true;
 
                if (is_valid_ether_addr(vf->mac_addr)) {
                        /* overwrite netdev dev_addr with admin VF MAC */
                        memcpy(bp->dev->dev_addr, vf->mac_addr, ETH_ALEN);
+                       /* Older PF driver or firmware may not approve this
+                        * correctly.
+                        */
+                       strict_approval = false;
                } else {
                        eth_hw_addr_random(bp->dev);
                }
-               rc = bnxt_approve_mac(bp, bp->dev->dev_addr);
+               rc = bnxt_approve_mac(bp, bp->dev->dev_addr, strict_approval);
 #endif
        }
        return rc;
index fcd085a..3962f6f 100644 (file)
@@ -1104,7 +1104,7 @@ update_vf_mac_exit:
        mutex_unlock(&bp->hwrm_cmd_lock);
 }
 
-int bnxt_approve_mac(struct bnxt *bp, u8 *mac)
+int bnxt_approve_mac(struct bnxt *bp, u8 *mac, bool strict)
 {
        struct hwrm_func_vf_cfg_input req = {0};
        int rc = 0;
@@ -1122,12 +1122,13 @@ int bnxt_approve_mac(struct bnxt *bp, u8 *mac)
        memcpy(req.dflt_mac_addr, mac, ETH_ALEN);
        rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
 mac_done:
-       if (rc) {
+       if (rc && strict) {
                rc = -EADDRNOTAVAIL;
                netdev_warn(bp->dev, "VF MAC address %pM not approved by the PF\n",
                            mac);
+               return rc;
        }
-       return rc;
+       return 0;
 }
 #else
 
@@ -1144,7 +1145,7 @@ void bnxt_update_vf_mac(struct bnxt *bp)
 {
 }
 
-int bnxt_approve_mac(struct bnxt *bp, u8 *mac)
+int bnxt_approve_mac(struct bnxt *bp, u8 *mac, bool strict)
 {
        return 0;
 }
index e9b20cd..2eed9ed 100644 (file)
@@ -39,5 +39,5 @@ int bnxt_sriov_configure(struct pci_dev *pdev, int num_vfs);
 void bnxt_sriov_disable(struct bnxt *);
 void bnxt_hwrm_exec_fwd_req(struct bnxt *);
 void bnxt_update_vf_mac(struct bnxt *);
-int bnxt_approve_mac(struct bnxt *, u8 *);
+int bnxt_approve_mac(struct bnxt *, u8 *, bool);
 #endif
index 16e4ef7..f1a86b4 100644 (file)
@@ -3837,6 +3837,13 @@ static const struct macb_config at91sam9260_config = {
        .init = macb_init,
 };
 
+static const struct macb_config sama5d3macb_config = {
+       .caps = MACB_CAPS_SG_DISABLED
+             | MACB_CAPS_USRIO_HAS_CLKEN | MACB_CAPS_USRIO_DEFAULT_IS_MII_GMII,
+       .clk_init = macb_clk_init,
+       .init = macb_init,
+};
+
 static const struct macb_config pc302gem_config = {
        .caps = MACB_CAPS_SG_DISABLED | MACB_CAPS_GIGABIT_MODE_AVAILABLE,
        .dma_burst_length = 16,
@@ -3904,6 +3911,7 @@ static const struct of_device_id macb_dt_ids[] = {
        { .compatible = "cdns,gem", .data = &pc302gem_config },
        { .compatible = "atmel,sama5d2-gem", .data = &sama5d2_config },
        { .compatible = "atmel,sama5d3-gem", .data = &sama5d3_config },
+       { .compatible = "atmel,sama5d3-macb", .data = &sama5d3macb_config },
        { .compatible = "atmel,sama5d4-gem", .data = &sama5d4_config },
        { .compatible = "cdns,at91rm9200-emac", .data = &emac_config },
        { .compatible = "cdns,emac", .data = &emac_config },
index c8c7ad2..9b5a68b 100644 (file)
@@ -2634,7 +2634,7 @@ static int hp100_login_to_vg_hub(struct net_device *dev, u_short force_relogin)
                /* Wait for link to drop */
                time = jiffies + (HZ / 10);
                do {
-                       if (~(hp100_inb(VG_LAN_CFG_1) & HP100_LINK_UP_ST))
+                       if (!(hp100_inb(VG_LAN_CFG_1) & HP100_LINK_UP_ST))
                                break;
                        if (!in_interrupt())
                                schedule_timeout_interruptible(1);
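
The hp100 fix above swaps bitwise NOT for logical NOT: ~(reg & bit) is an int that is practically never zero, so the old condition was always true and the loop never actually waited for the link to drop. A small demonstration, with a hypothetical status bit:

#include <stdio.h>

#define LINK_UP_ST 0x40  /* hypothetical bit, like HP100_LINK_UP_ST */

static void check(unsigned char reg)
{
        /* bitwise NOT of an int is virtually never zero, so the old
         * condition fires whether the bit is set or not */
        printf("reg=0x%02x  ~(reg & bit) -> %s  !(reg & bit) -> %s\n",
               reg,
               ~(reg & LINK_UP_ST) ? "true" : "false",
               !(reg & LINK_UP_ST) ? "true" : "false");
}

int main(void)
{
        check(0x40);  /* link up: only "!" correctly reports false */
        check(0x00);  /* link down: both report true */
        return 0;
}
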
index 2850041..702fec8 100644 (file)
@@ -58,6 +58,8 @@ static struct {
  */
 static void mvpp2_mac_config(struct net_device *dev, unsigned int mode,
                             const struct phylink_link_state *state);
+static void mvpp2_mac_link_up(struct net_device *dev, unsigned int mode,
+                             phy_interface_t interface, struct phy_device *phy);
 
 /* Queue modes */
 #define MVPP2_QDIST_SINGLE_MODE        0
@@ -3142,6 +3144,7 @@ static void mvpp2_start_dev(struct mvpp2_port *port)
                mvpp22_mode_reconfigure(port);
 
        if (port->phylink) {
+               netif_carrier_off(port->dev);
                phylink_start(port->phylink);
        } else {
                /* Phylink isn't used as of now for ACPI, so the MAC has to be
@@ -3150,9 +3153,10 @@ static void mvpp2_start_dev(struct mvpp2_port *port)
                 */
                struct phylink_link_state state = {
                        .interface = port->phy_interface,
-                       .link = 1,
                };
                mvpp2_mac_config(port->dev, MLO_AN_INBAND, &state);
+               mvpp2_mac_link_up(port->dev, MLO_AN_INBAND, port->phy_interface,
+                                 NULL);
        }
 
        netif_tx_start_all_queues(port->dev);
@@ -4495,10 +4499,6 @@ static void mvpp2_mac_config(struct net_device *dev, unsigned int mode,
                return;
        }
 
-       netif_tx_stop_all_queues(port->dev);
-       if (!port->has_phy)
-               netif_carrier_off(port->dev);
-
        /* Make sure the port is disabled when reconfiguring the mode */
        mvpp2_port_disable(port);
 
@@ -4523,16 +4523,7 @@ static void mvpp2_mac_config(struct net_device *dev, unsigned int mode,
        if (port->priv->hw_version == MVPP21 && port->flags & MVPP2_F_LOOPBACK)
                mvpp2_port_loopback_set(port, state);
 
-       /* If the port already was up, make sure it's still in the same state */
-       if (state->link || !port->has_phy) {
-               mvpp2_port_enable(port);
-
-               mvpp2_egress_enable(port);
-               mvpp2_ingress_enable(port);
-               if (!port->has_phy)
-                       netif_carrier_on(dev);
-               netif_tx_wake_all_queues(dev);
-       }
+       mvpp2_port_enable(port);
 }
 
 static void mvpp2_mac_link_up(struct net_device *dev, unsigned int mode,
index e7dce79..001b5f7 100644 (file)
@@ -2850,7 +2850,7 @@ static void lan743x_pcidev_shutdown(struct pci_dev *pdev)
        lan743x_hardware_cleanup(adapter);
 }
 
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
 static u16 lan743x_pm_wakeframe_crc16(const u8 *buf, int len)
 {
        return bitrev16(crc16(0xFFFF, buf, len));
@@ -3016,7 +3016,7 @@ static int lan743x_pm_resume(struct device *dev)
 static const struct dev_pm_ops lan743x_pm_ops = {
        SET_SYSTEM_SLEEP_PM_OPS(lan743x_pm_suspend, lan743x_pm_resume)
 };
-#endif /*CONFIG_PM */
+#endif /* CONFIG_PM_SLEEP */
 
 static const struct pci_device_id lan743x_pcidev_tbl[] = {
        { PCI_DEVICE(PCI_VENDOR_ID_SMSC, PCI_DEVICE_ID_SMSC_LAN7430) },
@@ -3028,7 +3028,7 @@ static struct pci_driver lan743x_pcidev_driver = {
        .id_table = lan743x_pcidev_tbl,
        .probe    = lan743x_pcidev_probe,
        .remove   = lan743x_pcidev_remove,
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
        .driver.pm = &lan743x_pm_ops,
 #endif
        .shutdown = lan743x_pcidev_shutdown,
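
For reference, the wakeframe checksum above is a bit-reversed CRC-16 seeded with 0xFFFF. A self-contained sketch assuming the semantics of the kernel's crc16() (reflected polynomial 0xA001, as in lib/crc16.c) and bitrev16():

#include <stdint.h>
#include <stdio.h>

/* reflected CRC-16 with polynomial 0xA001, matching lib/crc16.c */
static uint16_t crc16(uint16_t crc, const uint8_t *buf, int len)
{
        while (len--) {
                crc ^= *buf++;
                for (int i = 0; i < 8; i++)
                        crc = (crc >> 1) ^ ((crc & 1) ? 0xA001 : 0);
        }
        return crc;
}

static uint16_t bitrev16(uint16_t v)
{
        uint16_t r = 0;

        for (int i = 0; i < 16; i++, v >>= 1)
                r = (r << 1) | (v & 1);
        return r;
}

int main(void)
{
        const uint8_t frame[] = { 0x01, 0x02, 0x03 };

        /* same shape as lan743x_pm_wakeframe_crc16() in the hunk above */
        printf("0x%04x\n", bitrev16(crc16(0xFFFF, frame, sizeof(frame))));
        return 0;
}
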
index 1d86313..bb529ff 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/pci.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
+#include <linux/clk.h>
 #include <linux/delay.h>
 #include <linux/ethtool.h>
 #include <linux/phy.h>
@@ -665,6 +666,7 @@ struct rtl8169_private {
 
        u16 event_slow;
        const struct rtl_coalesce_info *coalesce_info;
+       struct clk *clk;
 
        struct mdio_ops {
                void (*write)(struct rtl8169_private *, int, int);
@@ -4775,12 +4777,14 @@ static void rtl_pcie_state_l2l3_enable(struct rtl8169_private *tp, bool enable)
 static void rtl_hw_aspm_clkreq_enable(struct rtl8169_private *tp, bool enable)
 {
        if (enable) {
-               RTL_W8(tp, Config2, RTL_R8(tp, Config2) | ClkReqEn);
                RTL_W8(tp, Config5, RTL_R8(tp, Config5) | ASPM_en);
+               RTL_W8(tp, Config2, RTL_R8(tp, Config2) | ClkReqEn);
        } else {
                RTL_W8(tp, Config2, RTL_R8(tp, Config2) & ~ClkReqEn);
                RTL_W8(tp, Config5, RTL_R8(tp, Config5) & ~ASPM_en);
        }
+
+       udelay(10);
 }
 
 static void rtl_hw_start_8168bb(struct rtl8169_private *tp)
@@ -5625,6 +5629,8 @@ static void rtl_hw_start_8402(struct rtl8169_private *tp)
 
 static void rtl_hw_start_8106(struct rtl8169_private *tp)
 {
+       rtl_hw_aspm_clkreq_enable(tp, false);
+
        /* Force LAN exit from ASPM if Rx/Tx are not idle */
        RTL_W32(tp, FuncEvent, RTL_R32(tp, FuncEvent) | 0x002800);
 
@@ -5633,6 +5639,7 @@ static void rtl_hw_start_8106(struct rtl8169_private *tp)
        RTL_W8(tp, DLLPR, RTL_R8(tp, DLLPR) & ~PFM_EN);
 
        rtl_pcie_state_l2l3_enable(tp, false);
+       rtl_hw_aspm_clkreq_enable(tp, true);
 }
 
 static void rtl_hw_start_8101(struct rtl8169_private *tp)
@@ -7257,6 +7264,11 @@ static int rtl_jumbo_max(struct rtl8169_private *tp)
        }
 }
 
+static void rtl_disable_clk(void *data)
+{
+       clk_disable_unprepare(data);
+}
+
 static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
        const struct rtl_cfg_info *cfg = rtl_cfg_infos + ent->driver_data;
@@ -7277,6 +7289,32 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
        tp->msg_enable = netif_msg_init(debug.msg_enable, R8169_MSG_DEFAULT);
        tp->supports_gmii = cfg->has_gmii;
 
+       /* Get the *optional* external "ether_clk" used on some boards */
+       tp->clk = devm_clk_get(&pdev->dev, "ether_clk");
+       if (IS_ERR(tp->clk)) {
+               rc = PTR_ERR(tp->clk);
+               if (rc == -ENOENT) {
+                       /* clk-core allows NULL (for suspend / resume) */
+                       tp->clk = NULL;
+               } else if (rc == -EPROBE_DEFER) {
+                       return rc;
+               } else {
+                       dev_err(&pdev->dev, "failed to get clk: %d\n", rc);
+                       return rc;
+               }
+       } else {
+               rc = clk_prepare_enable(tp->clk);
+               if (rc) {
+                       dev_err(&pdev->dev, "failed to enable clk: %d\n", rc);
+                       return rc;
+               }
+
+               rc = devm_add_action_or_reset(&pdev->dev, rtl_disable_clk,
+                                             tp->clk);
+               if (rc)
+                       return rc;
+       }
+
        /* enable device (incl. PCI PM wakeup and hotplug setup) */
        rc = pcim_enable_device(pdev);
        if (rc < 0) {
index 3609c7b..2b800ce 100644 (file)
@@ -67,7 +67,7 @@ static int dwmac1000_validate_mcast_bins(int mcast_bins)
  * Description:
  * This function validates the number of Unicast address entries supported
  * by a particular Synopsys 10/100/1000 controller. The Synopsys controller
- * supports 132, 64, or 128 Unicast filter entries for it's Unicast filter
+ * supports 1..32, 64, or 128 Unicast filter entries for its Unicast filter
  * logic. This function checks that a valid, supported configuration is
  * selected, and defaults to 1 Unicast address if an unsupported
  * configuration is selected.
@@ -77,8 +77,7 @@ static int dwmac1000_validate_ucast_entries(int ucast_entries)
        int x = ucast_entries;
 
        switch (x) {
-       case 1:
-       case 32:
+       case 1 ... 32:
        case 64:
        case 128:
                break;
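
The dwmac1000 fix above relies on the GNU C case-range extension, which the kernel uses freely: "case 1 ... 32" matches every value from 1 to 32. A compilable illustration (gcc/clang):

#include <stdio.h>

static int validate_ucast_entries(int n)
{
        switch (n) {
        case 1 ... 32:  /* GNU C case-range extension */
        case 64:
        case 128:
                return n;
        default:
                return 1;  /* fall back to a single entry */
        }
}

int main(void)
{
        printf("%d %d %d\n", validate_ucast_entries(7),
               validate_ucast_entries(64), validate_ucast_entries(100));
        return 0;
}
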
index 9263d63..f932923 100644 (file)
@@ -41,6 +41,7 @@ config TI_DAVINCI_MDIO
 config TI_DAVINCI_CPDMA
        tristate "TI DaVinci CPDMA Support"
        depends on ARCH_DAVINCI || ARCH_OMAP2PLUS || COMPILE_TEST
+       select GENERIC_ALLOCATOR
        ---help---
          This driver supports TI's DaVinci CPDMA dma engine.
 
index 31c3d77..fe01e14 100644 (file)
@@ -1203,6 +1203,9 @@ static void netvsc_send_vf(struct net_device *ndev,
 
        net_device_ctx->vf_alloc = nvmsg->msg.v4_msg.vf_assoc.allocated;
        net_device_ctx->vf_serial = nvmsg->msg.v4_msg.vf_assoc.serial;
+       netdev_info(ndev, "VF slot %u %s\n",
+                   net_device_ctx->vf_serial,
+                   net_device_ctx->vf_alloc ? "added" : "removed");
 }
 
 static  void netvsc_receive_inband(struct net_device *ndev,
index 70921bb..3af6d8d 100644 (file)
@@ -1894,20 +1894,6 @@ out_unlock:
        rtnl_unlock();
 }
 
-static struct net_device *get_netvsc_bymac(const u8 *mac)
-{
-       struct net_device_context *ndev_ctx;
-
-       list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) {
-               struct net_device *dev = hv_get_drvdata(ndev_ctx->device_ctx);
-
-               if (ether_addr_equal(mac, dev->perm_addr))
-                       return dev;
-       }
-
-       return NULL;
-}
-
 static struct net_device *get_netvsc_byref(struct net_device *vf_netdev)
 {
        struct net_device_context *net_device_ctx;
@@ -2036,26 +2022,48 @@ static void netvsc_vf_setup(struct work_struct *w)
        rtnl_unlock();
 }
 
+/* Find netvsc by VMBus serial number.
+ * The PCI hyperv controller records the serial number as the slot.
+ */
+static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev)
+{
+       struct device *parent = vf_netdev->dev.parent;
+       struct net_device_context *ndev_ctx;
+       struct pci_dev *pdev;
+
+       if (!parent || !dev_is_pci(parent))
+               return NULL; /* not a PCI device */
+
+       pdev = to_pci_dev(parent);
+       if (!pdev->slot) {
+               netdev_notice(vf_netdev, "no PCI slot information\n");
+               return NULL;
+       }
+
+       list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) {
+               if (!ndev_ctx->vf_alloc)
+                       continue;
+
+               if (ndev_ctx->vf_serial == pdev->slot->number)
+                       return hv_get_drvdata(ndev_ctx->device_ctx);
+       }
+
+       netdev_notice(vf_netdev,
+                     "no netdev found for slot %u\n", pdev->slot->number);
+       return NULL;
+}
+
 static int netvsc_register_vf(struct net_device *vf_netdev)
 {
-       struct net_device *ndev;
        struct net_device_context *net_device_ctx;
-       struct device *pdev = vf_netdev->dev.parent;
        struct netvsc_device *netvsc_dev;
+       struct net_device *ndev;
        int ret;
 
        if (vf_netdev->addr_len != ETH_ALEN)
                return NOTIFY_DONE;
 
-       if (!pdev || !dev_is_pci(pdev) || dev_is_pf(pdev))
-               return NOTIFY_DONE;
-
-       /*
-        * We will use the MAC address to locate the synthetic interface to
-        * associate with the VF interface. If we don't find a matching
-        * synthetic interface, move on.
-        */
-       ndev = get_netvsc_bymac(vf_netdev->perm_addr);
+       ndev = get_netvsc_byslot(vf_netdev);
        if (!ndev)
                return NOTIFY_DONE;
 
@@ -2272,17 +2280,15 @@ static int netvsc_remove(struct hv_device *dev)
 
        cancel_delayed_work_sync(&ndev_ctx->dwork);
 
-       rcu_read_lock();
-       nvdev = rcu_dereference(ndev_ctx->nvdev);
-
-       if  (nvdev)
+       rtnl_lock();
+       nvdev = rtnl_dereference(ndev_ctx->nvdev);
+       if (nvdev)
                cancel_work_sync(&nvdev->subchan_work);
 
        /*
         * Call to the vsc driver to let it know that the device is being
         * removed. Also blocks mtu and channel changes.
         */
-       rtnl_lock();
        vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
        if (vf_netdev)
                netvsc_unregister_vf(vf_netdev);
@@ -2294,7 +2300,6 @@ static int netvsc_remove(struct hv_device *dev)
        list_del(&ndev_ctx->list);
 
        rtnl_unlock();
-       rcu_read_unlock();
 
        hv_set_drvdata(dev, NULL);
 
index ce61231..62dc564 100644 (file)
@@ -429,6 +429,9 @@ static int pppoe_rcv(struct sk_buff *skb, struct net_device *dev,
        if (!skb)
                goto out;
 
+       if (skb_mac_header_len(skb) < ETH_HLEN)
+               goto drop;
+
        if (!pskb_may_pull(skb, sizeof(struct pppoe_hdr)))
                goto drop;
 
index e3270de..533b6fb 100644 (file)
@@ -1213,13 +1213,13 @@ static const struct usb_device_id products[] = {
        {QMI_FIXED_INTF(0x1199, 0x9061, 8)},    /* Sierra Wireless Modem */
        {QMI_FIXED_INTF(0x1199, 0x9063, 8)},    /* Sierra Wireless EM7305 */
        {QMI_FIXED_INTF(0x1199, 0x9063, 10)},   /* Sierra Wireless EM7305 */
-       {QMI_FIXED_INTF(0x1199, 0x9071, 8)},    /* Sierra Wireless MC74xx */
-       {QMI_FIXED_INTF(0x1199, 0x9071, 10)},   /* Sierra Wireless MC74xx */
-       {QMI_FIXED_INTF(0x1199, 0x9079, 8)},    /* Sierra Wireless EM74xx */
-       {QMI_FIXED_INTF(0x1199, 0x9079, 10)},   /* Sierra Wireless EM74xx */
-       {QMI_FIXED_INTF(0x1199, 0x907b, 8)},    /* Sierra Wireless EM74xx */
-       {QMI_FIXED_INTF(0x1199, 0x907b, 10)},   /* Sierra Wireless EM74xx */
-       {QMI_FIXED_INTF(0x1199, 0x9091, 8)},    /* Sierra Wireless EM7565 */
+       {QMI_QUIRK_SET_DTR(0x1199, 0x9071, 8)}, /* Sierra Wireless MC74xx */
+       {QMI_QUIRK_SET_DTR(0x1199, 0x9071, 10)},/* Sierra Wireless MC74xx */
+       {QMI_QUIRK_SET_DTR(0x1199, 0x9079, 8)}, /* Sierra Wireless EM74xx */
+       {QMI_QUIRK_SET_DTR(0x1199, 0x9079, 10)},/* Sierra Wireless EM74xx */
+       {QMI_QUIRK_SET_DTR(0x1199, 0x907b, 8)}, /* Sierra Wireless EM74xx */
+       {QMI_QUIRK_SET_DTR(0x1199, 0x907b, 10)},/* Sierra Wireless EM74xx */
+       {QMI_QUIRK_SET_DTR(0x1199, 0x9091, 8)}, /* Sierra Wireless EM7565 */
        {QMI_FIXED_INTF(0x1bbb, 0x011e, 4)},    /* Telekom Speedstick LTE II (Alcatel One Touch L100V LTE) */
        {QMI_FIXED_INTF(0x1bbb, 0x0203, 2)},    /* Alcatel L800MA */
        {QMI_FIXED_INTF(0x2357, 0x0201, 4)},    /* TP-LINK HSUPA Modem MA180 */
index 8d679c8..41a00cd 100644 (file)
@@ -463,6 +463,8 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb,
        int mac_len, delta, off;
        struct xdp_buff xdp;
 
+       skb_orphan(skb);
+
        rcu_read_lock();
        xdp_prog = rcu_dereference(rq->xdp_prog);
        if (unlikely(!xdp_prog)) {
@@ -508,8 +510,6 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb,
                skb_copy_header(nskb, skb);
                head_off = skb_headroom(nskb) - skb_headroom(skb);
                skb_headers_offset_update(nskb, head_off);
-               if (skb->sk)
-                       skb_set_owner_w(nskb, skb->sk);
                consume_skb(skb);
                skb = nskb;
        }
index 9407acb..f17f602 100644 (file)
@@ -908,7 +908,11 @@ static RING_IDX xennet_fill_frags(struct netfront_queue *queue,
                        BUG_ON(pull_to <= skb_headlen(skb));
                        __pskb_pull_tail(skb, pull_to - skb_headlen(skb));
                }
-               BUG_ON(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS);
+               if (unlikely(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)) {
+                       queue->rx.rsp_cons = ++cons;
+                       kfree_skb(nskb);
+                       return ~0U;
+               }
 
                skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
                                skb_frag_page(nfrag),
@@ -1045,6 +1049,8 @@ err:
                skb->len += rx->status;
 
                i = xennet_fill_frags(queue, skb, &tmpq);
+               if (unlikely(i == ~0U))
+                       goto err;
 
                if (rx->flags & XEN_NETRXF_csum_blank)
                        skb->ip_summed = CHECKSUM_PARTIAL;
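
Instead of a BUG_ON, xennet_fill_frags now reports overflow through an all-ones sentinel, which can never be a valid ring index, and the caller drops the packet. A simplified stand-in showing the convention:

#include <stdio.h>

typedef unsigned int ring_idx_t;  /* stand-in for RING_IDX */

static ring_idx_t fill_frags(int nr_frags, int max_frags, ring_idx_t cons)
{
        if (nr_frags >= max_frags)
                return ~0U;     /* error sentinel: all-ones is never valid */
        return cons + nr_frags;
}

int main(void)
{
        ring_idx_t i = fill_frags(17, 17, 100);

        if (i == ~0U)
                printf("overflow: drop the packet instead of BUG()\n");
        return 0;
}
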
index a21caea..2008fa6 100644 (file)
@@ -245,6 +245,10 @@ static void nvmet_execute_get_log_page_ana(struct nvmet_req *req)
                offset += len;
                ngrps++;
        }
+       for ( ; grpid <= NVMET_MAX_ANAGRPS; grpid++) {
+               if (nvmet_ana_group_enabled[grpid])
+                       ngrps++;
+       }
 
        hdr.chgcnt = cpu_to_le64(nvmet_ana_chgcnt);
        hdr.ngrps = cpu_to_le16(ngrps);
index c00f82c..ee80e79 100644 (file)
@@ -89,6 +89,9 @@ static enum pci_protocol_version_t pci_protocol_version;
 
 #define STATUS_REVISION_MISMATCH 0xC0000059
 
+/* space for a 32-bit serial number rendered as a string */
+#define SLOT_NAME_SIZE 11
+
 /*
  * Message Types
  */
@@ -494,6 +497,7 @@ struct hv_pci_dev {
        struct list_head list_entry;
        refcount_t refs;
        enum hv_pcichild_state state;
+       struct pci_slot *pci_slot;
        struct pci_function_description desc;
        bool reported_missing;
        struct hv_pcibus_device *hbus;
@@ -1457,6 +1461,34 @@ static void prepopulate_bars(struct hv_pcibus_device *hbus)
        spin_unlock_irqrestore(&hbus->device_list_lock, flags);
 }
 
+/*
+ * Assign entries in sysfs pci slot directory.
+ *
+ * Note that this function does not need to lock the children list
+ * because it is called from pci_devices_present_work, which is
+ * serialized with hv_eject_device_work since both run on the same
+ * ordered workqueue. Therefore the hbus->children list will not
+ * change even when pci_create_slot sleeps.
+ */
+static void hv_pci_assign_slots(struct hv_pcibus_device *hbus)
+{
+       struct hv_pci_dev *hpdev;
+       char name[SLOT_NAME_SIZE];
+       int slot_nr;
+
+       list_for_each_entry(hpdev, &hbus->children, list_entry) {
+               if (hpdev->pci_slot)
+                       continue;
+
+               slot_nr = PCI_SLOT(wslot_to_devfn(hpdev->desc.win_slot.slot));
+               snprintf(name, SLOT_NAME_SIZE, "%u", hpdev->desc.ser);
+               hpdev->pci_slot = pci_create_slot(hbus->pci_bus, slot_nr,
+                                         name, NULL);
+               if (!hpdev->pci_slot)
+                       pr_warn("pci_create slot %s failed\n", name);
+       }
+}
+
 /**
  * create_root_hv_pci_bus() - Expose a new root PCI bus
  * @hbus:      Root PCI bus, as understood by this driver
@@ -1480,6 +1512,7 @@ static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus)
        pci_lock_rescan_remove();
        pci_scan_child_bus(hbus->pci_bus);
        pci_bus_assign_resources(hbus->pci_bus);
+       hv_pci_assign_slots(hbus);
        pci_bus_add_devices(hbus->pci_bus);
        pci_unlock_rescan_remove();
        hbus->state = hv_pcibus_installed;
@@ -1742,6 +1775,7 @@ static void pci_devices_present_work(struct work_struct *work)
                 */
                pci_lock_rescan_remove();
                pci_scan_child_bus(hbus->pci_bus);
+               hv_pci_assign_slots(hbus);
                pci_unlock_rescan_remove();
                break;
 
@@ -1858,6 +1892,9 @@ static void hv_eject_device_work(struct work_struct *work)
        list_del(&hpdev->list_entry);
        spin_unlock_irqrestore(&hpdev->hbus->device_list_lock, flags);
 
+       if (hpdev->pci_slot)
+               pci_destroy_slot(hpdev->pci_slot);
+
        memset(&ctxt, 0, sizeof(ctxt));
        ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message;
        ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE;
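
SLOT_NAME_SIZE above is 11 because the slot name is the device serial printed in decimal: a u32 needs at most 10 digits plus the terminating NUL. A quick check of the worst case:

#include <stdint.h>
#include <stdio.h>

#define SLOT_NAME_SIZE 11  /* 10 decimal digits of a u32 plus the NUL */

int main(void)
{
        char name[SLOT_NAME_SIZE];
        uint32_t ser = UINT32_MAX;  /* worst case: 4294967295, 10 digits */

        snprintf(name, SLOT_NAME_SIZE, "%u", ser);
        printf("slot name '%s' fits exactly\n", name);
        return 0;
}
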
index d975462..f10af5c 100644 (file)
@@ -536,6 +536,7 @@ static acpi_status alienware_wmax_command(struct wmax_basic_args *in_args,
                if (obj && obj->type == ACPI_TYPE_INTEGER)
                        *out_data = (u32) obj->integer.value;
        }
+       kfree(output.pointer);
        return status;
 
 }
index 88afe56..cf2229e 100644 (file)
@@ -78,6 +78,7 @@ static int run_smbios_call(struct wmi_device *wdev)
        dev_dbg(&wdev->dev, "result: [%08x,%08x,%08x,%08x]\n",
                priv->buf->std.output[0], priv->buf->std.output[1],
                priv->buf->std.output[2], priv->buf->std.output[3]);
+       kfree(output.pointer);
 
        return 0;
 }
index fecf96f..199d3ba 100644 (file)
@@ -374,8 +374,8 @@ struct atio_from_isp {
 static inline int fcpcmd_is_corrupted(struct atio *atio)
 {
        if (atio->entry_type == ATIO_TYPE7 &&
-           (le16_to_cpu(atio->attr_n_length & FCP_CMD_LENGTH_MASK) <
-           FCP_CMD_LENGTH_MIN))
+           ((le16_to_cpu(atio->attr_n_length) & FCP_CMD_LENGTH_MASK) <
+            FCP_CMD_LENGTH_MIN))
                return 1;
        else
                return 0;
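
The qla2xxx fix above moves the mask outside le16_to_cpu(): masking raw little-endian bytes with a host-order constant and swapping afterwards yields garbage on big-endian machines (on little-endian hosts le16_to_cpu is a no-op, which is how the bug stayed hidden). A demonstration simulating a big-endian host; the 0x0fff mask is illustrative, standing in for FCP_CMD_LENGTH_MASK:

#include <stdint.h>
#include <stdio.h>

#define LENGTH_MASK 0x0fff  /* illustrative mask */

static uint16_t swab16(uint16_t v)
{
        return (uint16_t)((v >> 8) | (v << 8));
}

int main(void)
{
        /* a little-endian wire value 0x1234, loaded raw on a BE CPU */
        uint16_t raw = swab16(0x1234);          /* CPU sees 0x3412 */

        /* buggy: mask the raw LE bytes, then swap -- garbage */
        uint16_t wrong = swab16(raw & LENGTH_MASK);

        /* fixed: swap to host order first, then mask */
        uint16_t right = swab16(raw) & LENGTH_MASK;

        printf("wrong=0x%04x right=0x%04x\n", wrong, right);
        return 0;
}
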
index 7cb3ab0..3082e72 100644 (file)
 
 #define DRIVER_NAME "fsl-dspi"
 
+#ifdef CONFIG_M5441x
+#define DSPI_FIFO_SIZE                 16
+#else
 #define DSPI_FIFO_SIZE                 4
+#endif
 #define DSPI_DMA_BUFSIZE               (DSPI_FIFO_SIZE * 1024)
 
 #define SPI_MCR                0x00
@@ -623,9 +627,11 @@ static void dspi_tcfq_read(struct fsl_dspi *dspi)
 static void dspi_eoq_write(struct fsl_dspi *dspi)
 {
        int fifo_size = DSPI_FIFO_SIZE;
+       u16 xfer_cmd = dspi->tx_cmd;
 
        /* Fill TX FIFO with as many transfers as possible */
        while (dspi->len && fifo_size--) {
+               dspi->tx_cmd = xfer_cmd;
                /* Request EOQF for last transfer in FIFO */
                if (dspi->len == dspi->bytes_per_word || fifo_size == 0)
                        dspi->tx_cmd |= SPI_PUSHR_CMD_EOQ;
index ec395a6..9da0bc5 100644 (file)
@@ -2143,8 +2143,17 @@ int spi_register_controller(struct spi_controller *ctlr)
         */
        if (ctlr->num_chipselect == 0)
                return -EINVAL;
-       /* allocate dynamic bus number using Linux idr */
-       if ((ctlr->bus_num < 0) && ctlr->dev.of_node) {
+       if (ctlr->bus_num >= 0) {
+               /* devices with a fixed bus num must check-in with the num */
+               mutex_lock(&board_lock);
+               id = idr_alloc(&spi_master_idr, ctlr, ctlr->bus_num,
+                       ctlr->bus_num + 1, GFP_KERNEL);
+               mutex_unlock(&board_lock);
+               if (WARN(id < 0, "couldn't get idr"))
+                       return id == -ENOSPC ? -EBUSY : id;
+               ctlr->bus_num = id;
+       } else if (ctlr->dev.of_node) {
+               /* allocate dynamic bus number using Linux idr */
                id = of_alias_get_id(ctlr->dev.of_node, "spi");
                if (id >= 0) {
                        ctlr->bus_num = id;
index 9cdfccb..cc756a1 100644 (file)
@@ -1416,7 +1416,8 @@ static void iscsit_do_crypto_hash_buf(struct ahash_request *hash,
 
        sg_init_table(sg, ARRAY_SIZE(sg));
        sg_set_buf(sg, buf, payload_length);
-       sg_set_buf(sg + 1, pad_bytes, padding);
+       if (padding)
+               sg_set_buf(sg + 1, pad_bytes, padding);
 
        ahash_request_set_crypt(hash, sg, data_crc, payload_length + padding);
 
@@ -3910,10 +3911,14 @@ static bool iscsi_target_check_conn_state(struct iscsi_conn *conn)
 static void iscsit_get_rx_pdu(struct iscsi_conn *conn)
 {
        int ret;
-       u8 buffer[ISCSI_HDR_LEN], opcode;
+       u8 *buffer, opcode;
        u32 checksum = 0, digest = 0;
        struct kvec iov;
 
+       buffer = kcalloc(ISCSI_HDR_LEN, sizeof(*buffer), GFP_KERNEL);
+       if (!buffer)
+               return;
+
        while (!kthread_should_stop()) {
                /*
                 * Ensure that both TX and RX per connection kthreads
@@ -3921,7 +3926,6 @@ static void iscsit_get_rx_pdu(struct iscsi_conn *conn)
                 */
                iscsit_thread_check_cpumask(conn, current, 0);
 
-               memset(buffer, 0, ISCSI_HDR_LEN);
                memset(&iov, 0, sizeof(struct kvec));
 
                iov.iov_base    = buffer;
@@ -3930,7 +3934,7 @@ static void iscsit_get_rx_pdu(struct iscsi_conn *conn)
                ret = rx_data(conn, &iov, 1, ISCSI_HDR_LEN);
                if (ret != ISCSI_HDR_LEN) {
                        iscsit_rx_thread_wait_for_tcp(conn);
-                       return;
+                       break;
                }
 
                if (conn->conn_ops->HeaderDigest) {
@@ -3940,7 +3944,7 @@ static void iscsit_get_rx_pdu(struct iscsi_conn *conn)
                        ret = rx_data(conn, &iov, 1, ISCSI_CRC_LEN);
                        if (ret != ISCSI_CRC_LEN) {
                                iscsit_rx_thread_wait_for_tcp(conn);
-                               return;
+                               break;
                        }
 
                        iscsit_do_crypto_hash_buf(conn->conn_rx_hash, buffer,
@@ -3964,7 +3968,7 @@ static void iscsit_get_rx_pdu(struct iscsi_conn *conn)
                }
 
                if (conn->conn_state == TARG_CONN_STATE_IN_LOGOUT)
-                       return;
+                       break;
 
                opcode = buffer[0] & ISCSI_OPCODE_MASK;
 
@@ -3975,13 +3979,15 @@ static void iscsit_get_rx_pdu(struct iscsi_conn *conn)
                        " while in Discovery Session, rejecting.\n", opcode);
                        iscsit_add_reject(conn, ISCSI_REASON_PROTOCOL_ERROR,
                                          buffer);
-                       return;
+                       break;
                }
 
                ret = iscsi_target_rx_opcode(conn, buffer);
                if (ret < 0)
-                       return;
+                       break;
        }
+
+       kfree(buffer);
 }
 
 int iscsi_target_rx_thread(void *arg)
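
Two things change together in this hunk: the header buffer moves from the kthread's stack to the heap, and every early return inside the receive loop becomes a break, so the single kfree(buffer) at the end always runs (the per-iteration memset is dropped because a successful rx_data() overwrites all ISCSI_HDR_LEN bytes anyway). A hedged userspace sketch of the shape, with calloc/free standing in for kcalloc/kfree and hypothetical read_header()/handle_pdu() helpers:

    #include <stdlib.h>

    #define HDR_LEN 48

    int read_header(unsigned char *buf, unsigned int len); /* hypothetical */
    int handle_pdu(const unsigned char *buf);              /* hypothetical */

    /* Sketch: heap-allocate a loop buffer and funnel every exit path
     * through one cleanup, instead of returning from mid-loop.
     */
    static void rx_loop(void)
    {
            unsigned char *buffer = calloc(HDR_LEN, 1);

            if (!buffer)
                    return;

            for (;;) {
                    if (read_header(buffer, HDR_LEN) < 0)
                            break;  /* a "return" here would leak buffer */
                    if (handle_pdu(buffer) < 0)
                            break;
            }

            free(buffer);           /* single exit point */
    }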
index e2902d3..f93f988 100644 (file)
@@ -76,7 +76,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
        else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
                error_msg = "rec_len is too small for name_len";
        else if (unlikely(((char *) de - buf) + rlen > size))
-               error_msg = "directory entry across range";
+               error_msg = "directory entry overrun";
        else if (unlikely(le32_to_cpu(de->inode) >
                        le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
                error_msg = "inode out of bounds";
@@ -85,18 +85,16 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
 
        if (filp)
                ext4_error_file(filp, function, line, bh->b_blocknr,
-                               "bad entry in directory: %s - offset=%u(%u), "
-                               "inode=%u, rec_len=%d, name_len=%d",
-                               error_msg, (unsigned) (offset % size),
-                               offset, le32_to_cpu(de->inode),
-                               rlen, de->name_len);
+                               "bad entry in directory: %s - offset=%u, "
+                               "inode=%u, rec_len=%d, name_len=%d, size=%d",
+                               error_msg, offset, le32_to_cpu(de->inode),
+                               rlen, de->name_len, size);
        else
                ext4_error_inode(dir, function, line, bh->b_blocknr,
-                               "bad entry in directory: %s - offset=%u(%u), "
-                               "inode=%u, rec_len=%d, name_len=%d",
-                               error_msg, (unsigned) (offset % size),
-                               offset, le32_to_cpu(de->inode),
-                               rlen, de->name_len);
+                               "bad entry in directory: %s - offset=%u, "
+                               "inode=%u, rec_len=%d, name_len=%d, size=%d",
+                                error_msg, offset, le32_to_cpu(de->inode),
+                                rlen, de->name_len, size);
 
        return 1;
 }
index 0f0edd1..caff935 100644 (file)
 #define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_EXT4_FS_ENCRYPTION)
 #include <linux/fscrypt.h>
 
+#include <linux/compiler.h>
+
+/* Until this gets included into linux/compiler-gcc.h */
+#ifndef __nonstring
+#if defined(GCC_VERSION) && (GCC_VERSION >= 80000)
+#define __nonstring __attribute__((nonstring))
+#else
+#define __nonstring
+#endif
+#endif
+
 /*
  * The fourth extended filesystem constants/structures
  */
@@ -675,6 +686,9 @@ enum {
 /* Max physical block we can address w/o extents */
 #define EXT4_MAX_BLOCK_FILE_PHYS       0xFFFFFFFF
 
+/* Max logical block we can support */
+#define EXT4_MAX_LOGICAL_BLOCK         0xFFFFFFFF
+
 /*
  * Structure of an inode on the disk
  */
@@ -1226,7 +1240,7 @@ struct ext4_super_block {
        __le32  s_feature_ro_compat;    /* readonly-compatible feature set */
 /*68*/ __u8    s_uuid[16];             /* 128-bit uuid for volume */
 /*78*/ char    s_volume_name[16];      /* volume name */
-/*88*/ char    s_last_mounted[64];     /* directory where last mounted */
+/*88*/ char    s_last_mounted[64] __nonstring; /* directory where last mounted */
 /*C8*/ __le32  s_algorithm_usage_bitmap; /* For compression */
        /*
         * Performance hints.  Directory preallocation should only
@@ -1277,13 +1291,13 @@ struct ext4_super_block {
        __le32  s_first_error_time;     /* first time an error happened */
        __le32  s_first_error_ino;      /* inode involved in first error */
        __le64  s_first_error_block;    /* block involved of first error */
-       __u8    s_first_error_func[32]; /* function where the error happened */
+       __u8    s_first_error_func[32] __nonstring;     /* function where the error happened */
        __le32  s_first_error_line;     /* line number where error happened */
        __le32  s_last_error_time;      /* most recent time of an error */
        __le32  s_last_error_ino;       /* inode involved in last error */
        __le32  s_last_error_line;      /* line number where error happened */
        __le64  s_last_error_block;     /* block involved of last error */
-       __u8    s_last_error_func[32];  /* function where the error happened */
+       __u8    s_last_error_func[32] __nonstring;      /* function where the error happened */
 #define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
        __u8    s_mount_opts[64];
        __le32  s_usr_quota_inum;       /* inode for tracking user quota */
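
__attribute__((nonstring)) (GCC 8+) tells the compiler that a char array holds a fixed-size byte sequence rather than a NUL-terminated string, which is exactly what s_volume_name, s_last_mounted and the error-function fields are; it silences -Wstringop-truncation on strncpy() calls that legitimately fill such fields to capacity. A small standalone illustration (struct and names invented for the example):

    #include <string.h>

    /* GCC 8+: mark a char array as "not a C string" so filling it
     * exactly to capacity with strncpy() does not warn.
     */
    struct label {
            char name[8] __attribute__((nonstring));
    };

    void set_name(struct label *l, const char *src)
    {
            /* May leave name[] without a terminating NUL - by design. */
            strncpy(l->name, src, sizeof(l->name));
    }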
index 3543fe8..7b47360 100644 (file)
@@ -1753,6 +1753,7 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
 {
        int err, inline_size;
        struct ext4_iloc iloc;
+       size_t inline_len;
        void *inline_pos;
        unsigned int offset;
        struct ext4_dir_entry_2 *de;
@@ -1780,8 +1781,9 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
                goto out;
        }
 
+       inline_len = ext4_get_inline_size(dir);
        offset = EXT4_INLINE_DOTDOT_SIZE;
-       while (offset < dir->i_size) {
+       while (offset < inline_len) {
                de = ext4_get_inline_entry(dir, &iloc, offset,
                                           &inline_pos, &inline_size);
                if (ext4_check_dir_entry(dir, NULL, de,
index d0dd585..d767e99 100644 (file)
@@ -3413,12 +3413,16 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        unsigned int blkbits = inode->i_blkbits;
-       unsigned long first_block = offset >> blkbits;
-       unsigned long last_block = (offset + length - 1) >> blkbits;
+       unsigned long first_block, last_block;
        struct ext4_map_blocks map;
        bool delalloc = false;
        int ret;
 
+       if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+               return -EINVAL;
+       first_block = offset >> blkbits;
+       last_block = min_t(loff_t, (offset + length - 1) >> blkbits,
+                          EXT4_MAX_LOGICAL_BLOCK);
 
        if (flags & IOMAP_REPORT) {
                if (ext4_has_inline_data(inode)) {
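
The clamp exists because ext4 logical block numbers are 32-bit while offset >> blkbits is computed in a wider type: an offset that lands beyond block 2^32 - 1 must be rejected, and last_block clamped, before the values are used. A standalone sketch of the arithmetic, with EXT4_MAX_LOGICAL_BLOCK as defined above:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_LOGICAL_BLOCK 0xFFFFFFFFull

    int main(void)
    {
            unsigned int blkbits = 12;                   /* 4 KiB blocks */
            uint64_t offset = MAX_LOGICAL_BLOCK << 12;   /* last valid block */
            uint64_t length = 1ull << 20;

            if ((offset >> blkbits) > MAX_LOGICAL_BLOCK)
                    return 1;                            /* would be -EINVAL */

            uint64_t first = offset >> blkbits;
            uint64_t last  = (offset + length - 1) >> blkbits;
            if (last > MAX_LOGICAL_BLOCK)
                    last = MAX_LOGICAL_BLOCK;            /* the min_t() clamp */

            printf("blocks %llu..%llu\n",
                   (unsigned long long)first, (unsigned long long)last);
            return 0;
    }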
@@ -3948,6 +3952,7 @@ static const struct address_space_operations ext4_dax_aops = {
        .writepages             = ext4_dax_writepages,
        .direct_IO              = noop_direct_IO,
        .set_page_dirty         = noop_set_page_dirty,
+       .bmap                   = ext4_bmap,
        .invalidatepage         = noop_invalidatepage,
 };
 
@@ -4192,9 +4197,8 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
        return 0;
 }
 
-static void ext4_wait_dax_page(struct ext4_inode_info *ei, bool *did_unlock)
+static void ext4_wait_dax_page(struct ext4_inode_info *ei)
 {
-       *did_unlock = true;
        up_write(&ei->i_mmap_sem);
        schedule();
        down_write(&ei->i_mmap_sem);
@@ -4204,14 +4208,12 @@ int ext4_break_layouts(struct inode *inode)
 {
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct page *page;
-       bool retry;
        int error;
 
        if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem)))
                return -EINVAL;
 
        do {
-               retry = false;
                page = dax_layout_busy_page(inode->i_mapping);
                if (!page)
                        return 0;
@@ -4219,8 +4221,8 @@ int ext4_break_layouts(struct inode *inode)
                error = ___wait_var_event(&page->_refcount,
                                atomic_read(&page->_refcount) == 1,
                                TASK_INTERRUPTIBLE, 0, 0,
-                               ext4_wait_dax_page(ei, &retry));
-       } while (error == 0 && retry);
+                               ext4_wait_dax_page(ei));
+       } while (error == 0);
 
        return error;
 }
@@ -4895,6 +4897,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                 * not initialized on a new filesystem. */
        }
        ei->i_flags = le32_to_cpu(raw_inode->i_flags);
+       ext4_set_inode_flags(inode);
        inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
        ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
        if (ext4_has_feature_64bit(sb))
@@ -5041,7 +5044,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                goto bad_inode;
        }
        brelse(iloc.bh);
-       ext4_set_inode_flags(inode);
 
        unlock_new_inode(inode);
        return inode;
index 39b07c2..2305b43 100644 (file)
@@ -49,7 +49,6 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
         */
        sb_start_write(sb);
        ext4_mmp_csum_set(sb, mmp);
-       mark_buffer_dirty(bh);
        lock_buffer(bh);
        bh->b_end_io = end_buffer_write_sync;
        get_bh(bh);
index 116ff68..377d516 100644 (file)
@@ -3478,6 +3478,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
        int credits;
        u8 old_file_type;
 
+       if (new.inode && new.inode->i_nlink == 0) {
+               EXT4_ERROR_INODE(new.inode,
+                                "target of rename is already freed");
+               return -EFSCORRUPTED;
+       }
+
        if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) &&
            (!projid_eq(EXT4_I(new_dir)->i_projid,
                        EXT4_I(old_dentry->d_inode)->i_projid)))
index e5fb384..ebbc663 100644 (file)
@@ -19,6 +19,7 @@
 
 int ext4_resize_begin(struct super_block *sb)
 {
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
        int ret = 0;
 
        if (!capable(CAP_SYS_RESOURCE))
@@ -29,7 +30,7 @@ int ext4_resize_begin(struct super_block *sb)
          * because the user tools have no way of handling this.  Probably a
          * bad time to do it anyways.
          */
-       if (EXT4_SB(sb)->s_sbh->b_blocknr !=
+       if (EXT4_B2C(sbi, sbi->s_sbh->b_blocknr) !=
            le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
                ext4_warning(sb, "won't resize using backup superblock at %llu",
                        (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
@@ -1986,6 +1987,26 @@ retry:
                }
        }
 
+       /*
+        * Make sure the last group has enough space so that it's
+        * guaranteed to have enough space for all metadata blocks
+        * that it might need to hold.  (We might not need to store
+        * the inode table blocks in the last block group, but there
+        * will be cases where this might be needed.)
+        */
+       if ((ext4_group_first_block_no(sb, n_group) +
+            ext4_group_overhead_blocks(sb, n_group) + 2 +
+            sbi->s_itb_per_group + sbi->s_cluster_ratio) >= n_blocks_count) {
+               n_blocks_count = ext4_group_first_block_no(sb, n_group);
+               n_group--;
+               n_blocks_count_retry = 0;
+               if (resize_inode) {
+                       iput(resize_inode);
+                       resize_inode = NULL;
+               }
+               goto retry;
+       }
+
        /* extend the last group */
        if (n_group == o_group)
                add = n_blocks_count - o_blocks_count;
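
The new check refuses a resize whose last group would be too small to hold its own metadata (descriptors, bitmaps, inode table, cluster padding); when it trips, the request is rounded down to the start of that group and retried without the partial group. A worked example with illustrative numbers only (real values depend on filesystem geometry):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long first = 163840;  /* first block of new last group */
            unsigned long long overhead = 2;    /* group descriptor blocks etc. */
            unsigned long long itb = 512;       /* inode table blocks per group */
            unsigned long long ratio = 1;       /* cluster ratio */
            unsigned long long n_blocks = 164000;

            /* 163840 + 2 + 2 + 512 + 1 = 164357 >= 164000: too small */
            if (first + overhead + 2 + itb + ratio >= n_blocks)
                    n_blocks = first;           /* drop the partial group */

            printf("resize to %llu blocks\n", n_blocks);
            return 0;
    }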
index 5863fd2..1145109 100644 (file)
@@ -2145,6 +2145,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
                SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
        if (test_opt(sb, DATA_ERR_ABORT))
                SEQ_OPTS_PUTS("data_err=abort");
+       if (DUMMY_ENCRYPTION_ENABLED(sbi))
+               SEQ_OPTS_PUTS("test_dummy_encryption");
 
        ext4_show_quota_options(seq, sb);
        return 0;
@@ -4378,11 +4380,13 @@ no_journal:
        block = ext4_count_free_clusters(sb);
        ext4_free_blocks_count_set(sbi->s_es, 
                                   EXT4_C2B(sbi, block));
+       ext4_superblock_csum_set(sb);
        err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
                                  GFP_KERNEL);
        if (!err) {
                unsigned long freei = ext4_count_free_inodes(sb);
                sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
+               ext4_superblock_csum_set(sb);
                err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
                                          GFP_KERNEL);
        }
index d9ebe11..1d098c3 100644 (file)
@@ -342,6 +342,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
                                 * for this bh as it's not marked locally
                                 * uptodate. */
                                status = -EIO;
+                               clear_buffer_needs_validate(bh);
                                put_bh(bh);
                                bhs[i] = NULL;
                                continue;
index ad72261..d297fe4 100644 (file)
@@ -464,6 +464,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
                                ret = -EFAULT;
                                goto out;
                        }
+                       m = NULL;       /* skip the list anchor */
                } else if (m->type == KCORE_VMALLOC) {
                        vread(buf, (char *)start, tsz);
                        /* we have to zero-fill user buffer even if no read */
index 23e7042..bf000c8 100644 (file)
@@ -1912,7 +1912,9 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
                mutex_unlock(&c->bu_mutex);
        }
 
-       ubifs_assert(c, c->lst.taken_empty_lebs > 0);
+       if (!c->need_recovery)
+               ubifs_assert(c, c->lst.taken_empty_lebs > 0);
+
        return 0;
 }
 
@@ -1954,6 +1956,9 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode)
        int dev, vol;
        char *endptr;
 
+       if (!name || !*name)
+               return ERR_PTR(-EINVAL);
+
        /* First, try to open using the device node path method */
        ubi = ubi_open_volume_path(name, mode);
        if (!IS_ERR(ubi))
index 61afdfe..f5ad1ed 100644 (file)
@@ -152,12 +152,6 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
        ui->data_len = size;
 
        mutex_lock(&host_ui->ui_mutex);
-
-       if (!host->i_nlink) {
-               err = -ENOENT;
-               goto out_noent;
-       }
-
        host->i_ctime = current_time(host);
        host_ui->xattr_cnt += 1;
        host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm));
@@ -190,7 +184,6 @@ out_cancel:
        host_ui->xattr_size -= CALC_XATTR_BYTES(size);
        host_ui->xattr_names -= fname_len(nm);
        host_ui->flags &= ~UBIFS_CRYPT_FL;
-out_noent:
        mutex_unlock(&host_ui->ui_mutex);
 out_free:
        make_bad_inode(inode);
@@ -242,12 +235,6 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
        mutex_unlock(&ui->ui_mutex);
 
        mutex_lock(&host_ui->ui_mutex);
-
-       if (!host->i_nlink) {
-               err = -ENOENT;
-               goto out_noent;
-       }
-
        host->i_ctime = current_time(host);
        host_ui->xattr_size -= CALC_XATTR_BYTES(old_size);
        host_ui->xattr_size += CALC_XATTR_BYTES(size);
@@ -269,7 +256,6 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
 out_cancel:
        host_ui->xattr_size -= CALC_XATTR_BYTES(size);
        host_ui->xattr_size += CALC_XATTR_BYTES(old_size);
-out_noent:
        mutex_unlock(&host_ui->ui_mutex);
        make_bad_inode(inode);
 out_free:
@@ -496,12 +482,6 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host,
                return err;
 
        mutex_lock(&host_ui->ui_mutex);
-
-       if (!host->i_nlink) {
-               err = -ENOENT;
-               goto out_noent;
-       }
-
        host->i_ctime = current_time(host);
        host_ui->xattr_cnt -= 1;
        host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm));
@@ -521,7 +501,6 @@ out_cancel:
        host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm));
        host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len);
        host_ui->xattr_names += fname_len(nm);
-out_noent:
        mutex_unlock(&host_ui->ui_mutex);
        ubifs_release_budget(c, &req);
        make_bad_inode(inode);
@@ -561,9 +540,6 @@ static int ubifs_xattr_remove(struct inode *host, const char *name)
 
        ubifs_assert(c, inode_is_locked(host));
 
-       if (!host->i_nlink)
-               return -ENOENT;
-
        if (fname_len(&nm) > UBIFS_MAX_NLEN)
                return -ENAMETOOLONG;
 
index 46a8009..152b305 100644 (file)
@@ -675,7 +675,7 @@ static inline bool drm_core_check_feature(struct drm_device *dev, int feature)
 static inline bool drm_drv_uses_atomic_modeset(struct drm_device *dev)
 {
        return drm_core_check_feature(dev, DRIVER_ATOMIC) ||
-               dev->mode_config.funcs->atomic_commit != NULL;
+               (dev->mode_config.funcs && dev->mode_config.funcs->atomic_commit != NULL);
 }
 
 
index 763bbad..4d36b27 100644 (file)
 #define __noretpoline __attribute__((indirect_branch("keep")))
 #endif
 
-/*
- * it doesn't make sense on ARM (currently the only user of __naked)
- * to trace naked functions because then mcount is called without
- * stack and frame pointer being set up and there is no chance to
- * restore the lr register to the value before mcount was called.
- *
- * The asm() bodies of naked functions often depend on standard calling
- * conventions, therefore they must be noinline and noclone.
- *
- * GCC 4.[56] currently fail to enforce this, so we must do so ourselves.
- * See GCC PR44290.
- */
-#define __naked                __attribute__((naked)) noinline __noclone notrace
-
 #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
 
 #define __optimize(level)      __attribute__((__optimize__(level)))
index 3525c17..db192be 100644 (file)
@@ -226,6 +226,14 @@ struct ftrace_likely_data {
 #define notrace                        __attribute__((no_instrument_function))
 #endif
 
+/*
+ * it doesn't make sense on ARM (currently the only user of __naked)
+ * to trace naked functions because then mcount is called without
+ * stack and frame pointer being set up and there is no chance to
+ * restore the lr register to the value before mcount was called.
+ */
+#define __naked                        __attribute__((naked)) notrace
+
 #define __compiler_offsetof(a, b)      __builtin_offsetof(a, b)
 
 /*
index 0205aee..c926698 100644 (file)
@@ -733,8 +733,6 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
 int kvm_vcpu_yield_to(struct kvm_vcpu *target);
 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool usermode_vcpu_not_eligible);
-void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
-void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
 
 void kvm_flush_remote_tlbs(struct kvm *kvm);
 void kvm_reload_remote_mmus(struct kvm *kvm);
index a34539b..7e6ac01 100644 (file)
@@ -133,15 +133,18 @@ struct vga_switcheroo_handler {
  * @can_switch: check if the device is in a position to switch now.
  *     Mandatory. The client should return false if a user space process
  *     has one of its device files open
+ * @gpu_bound: notify the audio client of the client id once the GPU is bound.
  *
  * Client callbacks. A client can be either a GPU or an audio device on a GPU.
  * The @set_gpu_state and @can_switch methods are mandatory, @reprobe may be
  * set to NULL. For audio clients, the @reprobe member is bogus.
+ * Conversely, @gpu_bound is only for audio clients and is unused for GPU clients.
  */
 struct vga_switcheroo_client_ops {
        void (*set_gpu_state)(struct pci_dev *dev, enum vga_switcheroo_state);
        void (*reprobe)(struct pci_dev *dev);
        bool (*can_switch)(struct pci_dev *dev);
+       void (*gpu_bound)(struct pci_dev *dev, enum vga_switcheroo_client_id);
 };
 
 #if defined(CONFIG_VGA_SWITCHEROO)
index d5c683e..0a769cf 100644 (file)
@@ -171,15 +171,14 @@ struct cipher_context {
        char *rec_seq;
 };
 
+union tls_crypto_context {
+       struct tls_crypto_info info;
+       struct tls12_crypto_info_aes_gcm_128 aes_gcm_128;
+};
+
 struct tls_context {
-       union {
-               struct tls_crypto_info crypto_send;
-               struct tls12_crypto_info_aes_gcm_128 crypto_send_aes_gcm_128;
-       };
-       union {
-               struct tls_crypto_info crypto_recv;
-               struct tls12_crypto_info_aes_gcm_128 crypto_recv_aes_gcm_128;
-       };
+       union tls_crypto_context crypto_send;
+       union tls_crypto_context crypto_recv;
 
        struct list_head list;
        struct net_device *netdev;
@@ -367,8 +366,8 @@ static inline void tls_fill_prepend(struct tls_context *ctx,
         * size KTLS_DTLS_HEADER_SIZE + KTLS_DTLS_NONCE_EXPLICIT_SIZE
         */
        buf[0] = record_type;
-       buf[1] = TLS_VERSION_MINOR(ctx->crypto_send.version);
-       buf[2] = TLS_VERSION_MAJOR(ctx->crypto_send.version);
+       buf[1] = TLS_VERSION_MINOR(ctx->crypto_send.info.version);
+       buf[2] = TLS_VERSION_MAJOR(ctx->crypto_send.info.version);
        /* we can use IV for nonce explicit according to spec */
        buf[3] = pkt_len >> 8;
        buf[4] = pkt_len & 0xFF;
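
Replacing the two anonymous unions with a named union tls_crypto_context keeps the same aliasing (the generic tls_crypto_info header overlays the cipher-specific layout) but gives the object a name, so the whole thing can be wiped in one call and the header reached uniformly via .info, as the follow-up hunks below do. A standalone sketch of the pattern (types simplified for the example):

    #include <stdint.h>
    #include <string.h>

    /* Generic header shared by all cipher-specific layouts. */
    struct crypto_info {
            uint16_t version;
            uint16_t cipher_type;
    };

    struct aes_gcm_128 {
            struct crypto_info info;    /* header must come first */
            uint8_t iv[8];
            uint8_t key[16];
    };

    /* Named union: the header and the full layout alias one storage. */
    union crypto_context {
            struct crypto_info info;
            struct aes_gcm_128 aes_gcm_128;
    };

    void wipe(union crypto_context *ctx)
    {
            /* One call clears header, IV and key together. */
            memset(ctx, 0, sizeof(*ctx));
    }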
index 6f1e1f3..cd1773d 100644 (file)
@@ -412,6 +412,7 @@ void snd_hdac_bus_init_cmd_io(struct hdac_bus *bus);
 void snd_hdac_bus_stop_cmd_io(struct hdac_bus *bus);
 void snd_hdac_bus_enter_link_reset(struct hdac_bus *bus);
 void snd_hdac_bus_exit_link_reset(struct hdac_bus *bus);
+int snd_hdac_bus_reset_link(struct hdac_bus *bus, bool full_reset);
 
 void snd_hdac_bus_update_rirb(struct hdac_bus *bus);
 int snd_hdac_bus_handle_stream_irq(struct hdac_bus *bus, unsigned int status,
index af9ef16..fdaaafd 100644 (file)
@@ -407,6 +407,7 @@ int snd_soc_dapm_new_dai_widgets(struct snd_soc_dapm_context *dapm,
 int snd_soc_dapm_link_dai_widgets(struct snd_soc_card *card);
 void snd_soc_dapm_connect_dai_link_widgets(struct snd_soc_card *card);
 int snd_soc_dapm_new_pcm(struct snd_soc_card *card,
+                        struct snd_soc_pcm_runtime *rtd,
                         const struct snd_soc_pcm_stream *params,
                         unsigned int num_params,
                         struct snd_soc_dapm_widget *source,
index 07548de..7f2ff3a 100644 (file)
@@ -719,6 +719,7 @@ struct kvm_ppc_one_seg_page_size {
 
 #define KVM_PPC_PAGE_SIZES_REAL                0x00000001
 #define KVM_PPC_1T_SEGMENTS            0x00000002
+#define KVM_PPC_NO_HASH                        0x00000004
 
 struct kvm_ppc_smmu_info {
        __u64 flags;
@@ -952,6 +953,8 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_S390_HPAGE_1M 156
 #define KVM_CAP_NESTED_STATE 157
 #define KVM_CAP_ARM_INJECT_SERROR_ESR 158
+#define KVM_CAP_MSR_PLATFORM_INFO 159
+#define KVM_CAP_PPC_NESTED_HV 160
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
index f58cafa..f39352c 100644 (file)
@@ -10,6 +10,8 @@
 #ifndef __HDA_TPLG_INTERFACE_H__
 #define __HDA_TPLG_INTERFACE_H__
 
+#include <linux/types.h>
+
 /*
  * Default types range from 0~12. type can range from 0 to 0xff
  * SST types start at higher to avoid any overlapping in future
@@ -143,10 +145,10 @@ enum skl_module_param_type {
 };
 
 struct skl_dfw_algo_data {
-       u32 set_params:2;
-       u32 rsvd:30;
-       u32 param_id;
-       u32 max;
+       __u32 set_params:2;
+       __u32 rsvd:30;
+       __u32 param_id;
+       __u32 max;
        char params[0];
 } __packed;
 
@@ -163,68 +165,68 @@ enum skl_tuple_type {
 /* v4 configuration data */
 
 struct skl_dfw_v4_module_pin {
-       u16 module_id;
-       u16 instance_id;
+       __u16 module_id;
+       __u16 instance_id;
 } __packed;
 
 struct skl_dfw_v4_module_fmt {
-       u32 channels;
-       u32 freq;
-       u32 bit_depth;
-       u32 valid_bit_depth;
-       u32 ch_cfg;
-       u32 interleaving_style;
-       u32 sample_type;
-       u32 ch_map;
+       __u32 channels;
+       __u32 freq;
+       __u32 bit_depth;
+       __u32 valid_bit_depth;
+       __u32 ch_cfg;
+       __u32 interleaving_style;
+       __u32 sample_type;
+       __u32 ch_map;
 } __packed;
 
 struct skl_dfw_v4_module_caps {
-       u32 set_params:2;
-       u32 rsvd:30;
-       u32 param_id;
-       u32 caps_size;
-       u32 caps[HDA_SST_CFG_MAX];
+       __u32 set_params:2;
+       __u32 rsvd:30;
+       __u32 param_id;
+       __u32 caps_size;
+       __u32 caps[HDA_SST_CFG_MAX];
 } __packed;
 
 struct skl_dfw_v4_pipe {
-       u8 pipe_id;
-       u8 pipe_priority;
-       u16 conn_type:4;
-       u16 rsvd:4;
-       u16 memory_pages:8;
+       __u8 pipe_id;
+       __u8 pipe_priority;
+       __u16 conn_type:4;
+       __u16 rsvd:4;
+       __u16 memory_pages:8;
 } __packed;
 
 struct skl_dfw_v4_module {
        char uuid[SKL_UUID_STR_SZ];
 
-       u16 module_id;
-       u16 instance_id;
-       u32 max_mcps;
-       u32 mem_pages;
-       u32 obs;
-       u32 ibs;
-       u32 vbus_id;
-
-       u32 max_in_queue:8;
-       u32 max_out_queue:8;
-       u32 time_slot:8;
-       u32 core_id:4;
-       u32 rsvd1:4;
-
-       u32 module_type:8;
-       u32 conn_type:4;
-       u32 dev_type:4;
-       u32 hw_conn_type:4;
-       u32 rsvd2:12;
-
-       u32 params_fixup:8;
-       u32 converter:8;
-       u32 input_pin_type:1;
-       u32 output_pin_type:1;
-       u32 is_dynamic_in_pin:1;
-       u32 is_dynamic_out_pin:1;
-       u32 is_loadable:1;
-       u32 rsvd3:11;
+       __u16 module_id;
+       __u16 instance_id;
+       __u32 max_mcps;
+       __u32 mem_pages;
+       __u32 obs;
+       __u32 ibs;
+       __u32 vbus_id;
+
+       __u32 max_in_queue:8;
+       __u32 max_out_queue:8;
+       __u32 time_slot:8;
+       __u32 core_id:4;
+       __u32 rsvd1:4;
+
+       __u32 module_type:8;
+       __u32 conn_type:4;
+       __u32 dev_type:4;
+       __u32 hw_conn_type:4;
+       __u32 rsvd2:12;
+
+       __u32 params_fixup:8;
+       __u32 converter:8;
+       __u32 input_pin_type:1;
+       __u32 output_pin_type:1;
+       __u32 is_dynamic_in_pin:1;
+       __u32 is_dynamic_out_pin:1;
+       __u32 is_loadable:1;
+       __u32 rsvd3:11;
 
        struct skl_dfw_v4_pipe pipe;
        struct skl_dfw_v4_module_fmt in_fmt[MAX_IN_QUEUE];
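
This header is exported to userspace, where the kernel-internal u8/u16/u32 typedefs do not exist; UAPI structs must use the __u8/__u16/__u32 types from <linux/types.h>, hence the new include at the top and the wholesale type substitution through these structs. Roughly what a userspace consumer relies on (struct invented for the example):

    /* Userspace view: <linux/types.h> provides __u16/__u32, while plain
     * u16/u32 would fail to compile outside the kernel tree.
     */
    #include <linux/types.h>

    struct example_pin {
            __u16 module_id;
            __u16 instance_id;
    } __attribute__((packed));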
index 2590700..138f030 100644 (file)
@@ -1844,7 +1844,7 @@ static int btf_check_all_metas(struct btf_verifier_env *env)
 
        hdr = &btf->hdr;
        cur = btf->nohdr_data + hdr->type_off;
-       end = btf->nohdr_data + hdr->type_len;
+       end = cur + hdr->type_len;
 
        env->log_type_id = 1;
        while (cur < end) {
index 9224611..bb07e74 100644 (file)
@@ -3163,7 +3163,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
                                 * an arbitrary scalar. Disallow all math except
                                 * pointer subtraction
                                 */
-                               if (opcode == BPF_SUB){
+                               if (opcode == BPF_SUB && env->allow_ptr_leaks) {
                                        mark_reg_unknown(env, regs, insn->dst_reg);
                                        return 0;
                                }
index de1cfc4..cdf63e5 100644 (file)
@@ -195,7 +195,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
                idr_preload_end();
 
                if (nr < 0) {
-                       retval = nr;
+                       retval = (nr == -ENOSPC) ? -EAGAIN : nr;
                        goto out_free;
                }
 
index cf5c675..123bd73 100644 (file)
@@ -71,9 +71,6 @@
 #include <asm/io.h>
 #include <asm/unistd.h>
 
-/* Hardening for Spectre-v1 */
-#include <linux/nospec.h>
-
 #include "uid16.h"
 
 #ifndef SET_UNALIGN_CTL
index 1d92d4a..65bd461 100644 (file)
@@ -1546,6 +1546,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
        tmp_iter_page = first_page;
 
        do {
+               cond_resched();
+
                to_remove_page = tmp_iter_page;
                rb_inc_page(cpu_buffer, &tmp_iter_page);
 
index a550635..de64ea6 100644 (file)
@@ -637,6 +637,7 @@ config DEFERRED_STRUCT_PAGE_INIT
        depends on NO_BOOTMEM
        depends on SPARSEMEM
        depends on !NEED_PER_CPU_KM
+       depends on 64BIT
        help
          Ordinarily all struct pages are initialised during early boot in a
          single thread. On very large machines this can take a considerable
index 0376c12..4469426 100644 (file)
@@ -2227,6 +2227,8 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
                        mpol_shared_policy_init(&info->policy, NULL);
                        break;
                }
+
+               lockdep_annotate_inode_mutex_key(inode);
        } else
                shmem_free_inode(sb);
        return inode;
index 7e7d255..c7ce2c1 100644 (file)
@@ -476,6 +476,17 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
        delta = freeable >> priority;
        delta *= 4;
        do_div(delta, shrinker->seeks);
+
+       /*
+        * Make sure we apply some minimal pressure on default priority
+        * even on small cgroups. Stale objects are not only consuming memory
+        * by themselves, but can also hold a reference to a dying cgroup,
+        * preventing it from being reclaimed. A dying cgroup with all
+        * corresponding structures like per-cpu stats and kmem caches
+        * can be really big, so it may lead to a significant waste of memory.
+        */
+       delta = max_t(unsigned long long, delta, min(freeable, batch_size));
+
        total_scan += delta;
        if (total_scan < 0) {
                pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
index ae91e2d..3a7b077 100644 (file)
@@ -83,6 +83,7 @@ enum {
 
 struct smp_dev {
        /* Secure Connections OOB data */
+       bool                    local_oob;
        u8                      local_pk[64];
        u8                      local_rand[16];
        bool                    debug_key;
@@ -599,6 +600,8 @@ int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16])
 
        memcpy(rand, smp->local_rand, 16);
 
+       smp->local_oob = true;
+
        return 0;
 }
 
@@ -1785,7 +1788,7 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb)
         * successfully received our local OOB data - therefore set the
         * flag to indicate that local OOB is in use.
         */
-       if (req->oob_flag == SMP_OOB_PRESENT)
+       if (req->oob_flag == SMP_OOB_PRESENT && SMP_DEV(hdev)->local_oob)
                set_bit(SMP_FLAG_LOCAL_OOB, &smp->flags);
 
        /* SMP over BR/EDR requires special treatment */
@@ -1967,7 +1970,7 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb)
         * successfully received our local OOB data - therefore set the
         * flag to indicate that local OOB is in use.
         */
-       if (rsp->oob_flag == SMP_OOB_PRESENT)
+       if (rsp->oob_flag == SMP_OOB_PRESENT && SMP_DEV(hdev)->local_oob)
                set_bit(SMP_FLAG_LOCAL_OOB, &smp->flags);
 
        smp->prsp[0] = SMP_CMD_PAIRING_RSP;
@@ -2697,7 +2700,13 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb)
         * key was set/generated.
         */
        if (test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) {
-               struct smp_dev *smp_dev = chan->data;
+               struct l2cap_chan *hchan = hdev->smp_data;
+               struct smp_dev *smp_dev;
+
+               if (!hchan || !hchan->data)
+                       return SMP_UNSPECIFIED;
+
+               smp_dev = hchan->data;
 
                tfm_ecdh = smp_dev->tfm_ecdh;
        } else {
@@ -3230,6 +3239,7 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
                return ERR_CAST(tfm_ecdh);
        }
 
+       smp->local_oob = false;
        smp->tfm_aes = tfm_aes;
        smp->tfm_cmac = tfm_cmac;
        smp->tfm_ecdh = tfm_ecdh;
index aecdeba..5e00f2b 100644 (file)
@@ -2344,7 +2344,8 @@ BPF_CALL_4(bpf_msg_pull_data,
        if (unlikely(bytes_sg_total > copy))
                return -EINVAL;
 
-       page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy));
+       page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
+                          get_order(copy));
        if (unlikely(!page))
                return -ENOMEM;
        p = page_address(page);
index aa19d86..91592fc 100644 (file)
@@ -1180,6 +1180,12 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
                lladdr = neigh->ha;
        }
 
+       /* Update the confirmed timestamp for the neighbour entry after we
+        * receive an ARP packet, even if the IP-to-MAC binding is unchanged.
+        */
+       if (new & NUD_CONNECTED)
+               neigh->confirmed = jiffies;
+
        /* If entry was valid and address is not changed,
           do not change entry state, if new one is STALE.
         */
@@ -1201,15 +1207,12 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
                }
        }
 
-       /* Update timestamps only once we know we will make a change to the
+       /* Update timestamp only once we know we will make a change to the
         * neighbour entry. Otherwise we risk to move the locktime window with
         * noop updates and ignore relevant ARP updates.
         */
-       if (new != old || lladdr != neigh->ha) {
-               if (new & NUD_CONNECTED)
-                       neigh->confirmed = jiffies;
+       if (new != old || lladdr != neigh->ha)
                neigh->updated = jiffies;
-       }
 
        if (new != old) {
                neigh_del_timer(neigh);
index 60c9288..63ce228 100644 (file)
@@ -2810,7 +2810,7 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
        }
 
        if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) {
-               __dev_notify_flags(dev, old_flags, 0U);
+               __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags));
        } else {
                dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
                __dev_notify_flags(dev, old_flags, ~0U);
index 20fda8f..1fbe2f8 100644 (file)
@@ -1377,6 +1377,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
                if (encap)
                        skb_reset_inner_headers(skb);
                skb->network_header = (u8 *)iph - skb->head;
+               skb_reset_mac_len(skb);
        } while ((skb = skb->next));
 
 out:
index f4e35b2..7d69dd6 100644 (file)
@@ -2124,6 +2124,28 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
                                                         inet_compute_pseudo);
 }
 
+/* wrapper for udp_queue_rcv_skb taking care of csum conversion and
+ * return-code conversion for IP-layer consumption
+ */
+static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
+                              struct udphdr *uh)
+{
+       int ret;
+
+       if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
+               skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
+                                        inet_compute_pseudo);
+
+       ret = udp_queue_rcv_skb(sk, skb);
+
+       /* a return value > 0 means to resubmit the input, but
+        * the caller wants the return to be -protocol, or 0
+        */
+       if (ret > 0)
+               return -ret;
+       return 0;
+}
+
 /*
  *     All we need to do is get the socket, and then do a checksum.
  */
@@ -2170,14 +2192,9 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
                if (unlikely(sk->sk_rx_dst != dst))
                        udp_sk_rx_dst_set(sk, dst);
 
-               ret = udp_queue_rcv_skb(sk, skb);
+               ret = udp_unicast_rcv_skb(sk, skb, uh);
                sock_put(sk);
-               /* a return value > 0 means to resubmit the input, but
-                * it wants the return to be -protocol, or 0
-                */
-               if (ret > 0)
-                       return -ret;
-               return 0;
+               return ret;
        }
 
        if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
@@ -2185,22 +2202,8 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
                                                saddr, daddr, udptable, proto);
 
        sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
-       if (sk) {
-               int ret;
-
-               if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
-                       skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
-                                                inet_compute_pseudo);
-
-               ret = udp_queue_rcv_skb(sk, skb);
-
-               /* a return value > 0 means to resubmit the input, but
-                * it wants the return to be -protocol, or 0
-                */
-               if (ret > 0)
-                       return -ret;
-               return 0;
-       }
+       if (sk)
+               return udp_unicast_rcv_skb(sk, skb, uh);
 
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
                goto drop;
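
The wrapper exists because udp_queue_rcv_skb() signals "resubmit" with a positive protocol number while the IP input path expects 0 or -protocol; folding that negation and the checksum conversion into udp_unicast_rcv_skb() lets the formerly duplicated blocks collapse to one call each. In isolation, the return convention is just the following (a hedged sketch of the convention only, not kernel API):

    /* A positive return from the queueing path means "resubmit as
     * protocol N"; the IP input path expects -N, or 0 when consumed.
     */
    static int convert_rcv_ret(int ret)
    {
            return ret > 0 ? -ret : 0;
    }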
index 37ff480..c7e495f 100644 (file)
@@ -115,6 +115,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
                        payload_len = skb->len - nhoff - sizeof(*ipv6h);
                ipv6h->payload_len = htons(payload_len);
                skb->network_header = (u8 *)ipv6h - skb->head;
+               skb_reset_mac_len(skb);
 
                if (udpfrag) {
                        int err = ip6_find_1stfragopt(skb, &prevhdr);
index 16f200f..f9f8f55 100644 (file)
@@ -219,12 +219,10 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
                                kfree_skb(skb);
                                return -ENOBUFS;
                        }
+                       if (skb->sk)
+                               skb_set_owner_w(skb2, skb->sk);
                        consume_skb(skb);
                        skb = skb2;
-                       /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
-                        * it is safe to call in our context (socket lock not held)
-                        */
-                       skb_set_owner_w(skb, (struct sock *)sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
index 18e00ce..480a79f 100644 (file)
@@ -946,8 +946,6 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
 
 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
 {
-       rt->dst.flags |= fib6_info_dst_flags(ort);
-
        if (ort->fib6_flags & RTF_REJECT) {
                ip6_rt_init_dst_reject(rt, ort);
                return;
@@ -4670,20 +4668,31 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
                         int iif, int type, u32 portid, u32 seq,
                         unsigned int flags)
 {
-       struct rtmsg *rtm;
+       struct rt6_info *rt6 = (struct rt6_info *)dst;
+       struct rt6key *rt6_dst, *rt6_src;
+       u32 *pmetrics, table, rt6_flags;
        struct nlmsghdr *nlh;
+       struct rtmsg *rtm;
        long expires = 0;
-       u32 *pmetrics;
-       u32 table;
 
        nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
        if (!nlh)
                return -EMSGSIZE;
 
+       if (rt6) {
+               rt6_dst = &rt6->rt6i_dst;
+               rt6_src = &rt6->rt6i_src;
+               rt6_flags = rt6->rt6i_flags;
+       } else {
+               rt6_dst = &rt->fib6_dst;
+               rt6_src = &rt->fib6_src;
+               rt6_flags = rt->fib6_flags;
+       }
+
        rtm = nlmsg_data(nlh);
        rtm->rtm_family = AF_INET6;
-       rtm->rtm_dst_len = rt->fib6_dst.plen;
-       rtm->rtm_src_len = rt->fib6_src.plen;
+       rtm->rtm_dst_len = rt6_dst->plen;
+       rtm->rtm_src_len = rt6_src->plen;
        rtm->rtm_tos = 0;
        if (rt->fib6_table)
                table = rt->fib6_table->tb6_id;
@@ -4698,7 +4707,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
        rtm->rtm_protocol = rt->fib6_protocol;
 
-       if (rt->fib6_flags & RTF_CACHE)
+       if (rt6_flags & RTF_CACHE)
                rtm->rtm_flags |= RTM_F_CLONED;
 
        if (dest) {
@@ -4706,7 +4715,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
                        goto nla_put_failure;
                rtm->rtm_dst_len = 128;
        } else if (rtm->rtm_dst_len)
-               if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
+               if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
                        goto nla_put_failure;
 #ifdef CONFIG_IPV6_SUBTREES
        if (src) {
@@ -4714,12 +4723,12 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
                        goto nla_put_failure;
                rtm->rtm_src_len = 128;
        } else if (rtm->rtm_src_len &&
-                  nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
+                  nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
                goto nla_put_failure;
 #endif
        if (iif) {
 #ifdef CONFIG_IPV6_MROUTE
-               if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
+               if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
                        int err = ip6mr_get_route(net, skb, rtm, portid);
 
                        if (err == 0)
@@ -4754,7 +4763,14 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
        /* For multipath routes, walk the siblings list and add
         * each as a nexthop within RTA_MULTIPATH.
         */
-       if (rt->fib6_nsiblings) {
+       if (rt6) {
+               if (rt6_flags & RTF_GATEWAY &&
+                   nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
+                       goto nla_put_failure;
+
+               if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
+                       goto nla_put_failure;
+       } else if (rt->fib6_nsiblings) {
                struct fib6_info *sibling, *next_sibling;
                struct nlattr *mp;
 
@@ -4777,7 +4793,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
                        goto nla_put_failure;
        }
 
-       if (rt->fib6_flags & RTF_EXPIRES) {
+       if (rt6_flags & RTF_EXPIRES) {
                expires = dst ? dst->expires : rt->expires;
                expires -= jiffies;
        }
@@ -4785,7 +4801,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
        if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
                goto nla_put_failure;
 
-       if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
+       if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
                goto nla_put_failure;
 
 
index 83f4c77..28c4aa5 100644 (file)
@@ -752,6 +752,28 @@ static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
        }
 }
 
+/* wrapper for udp_queue_rcv_skb taking care of csum conversion and
+ * return-code conversion for IP-layer consumption
+ */
+static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
+                               struct udphdr *uh)
+{
+       int ret;
+
+       if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
+               skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
+                                        ip6_compute_pseudo);
+
+       ret = udpv6_queue_rcv_skb(sk, skb);
+
+       /* a return value > 0 means to resubmit the input, but
+        * the caller wants the return to be -protocol, or 0
+        */
+       if (ret > 0)
+               return -ret;
+       return 0;
+}
+
 int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
                   int proto)
 {
@@ -803,13 +825,14 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
                if (unlikely(sk->sk_rx_dst != dst))
                        udp6_sk_rx_dst_set(sk, dst);
 
-               ret = udpv6_queue_rcv_skb(sk, skb);
-               sock_put(sk);
+               if (!uh->check && !udp_sk(sk)->no_check6_rx) {
+                       sock_put(sk);
+                       goto report_csum_error;
+               }
 
-               /* a return value > 0 means to resubmit the input */
-               if (ret > 0)
-                       return ret;
-               return 0;
+               ret = udp6_unicast_rcv_skb(sk, skb, uh);
+               sock_put(sk);
+               return ret;
        }
 
        /*
@@ -822,30 +845,13 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
        /* Unicast */
        sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
        if (sk) {
-               int ret;
-
-               if (!uh->check && !udp_sk(sk)->no_check6_rx) {
-                       udp6_csum_zero_error(skb);
-                       goto csum_error;
-               }
-
-               if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
-                       skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
-                                                ip6_compute_pseudo);
-
-               ret = udpv6_queue_rcv_skb(sk, skb);
-
-               /* a return value > 0 means to resubmit the input */
-               if (ret > 0)
-                       return ret;
-
-               return 0;
+               if (!uh->check && !udp_sk(sk)->no_check6_rx)
+                       goto report_csum_error;
+               return udp6_unicast_rcv_skb(sk, skb, uh);
        }
 
-       if (!uh->check) {
-               udp6_csum_zero_error(skb);
-               goto csum_error;
-       }
+       if (!uh->check)
+               goto report_csum_error;
 
        if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
                goto discard;
@@ -866,6 +872,9 @@ short_packet:
                            ulen, skb->len,
                            daddr, ntohs(uh->dest));
        goto discard;
+
+report_csum_error:
+       udp6_csum_zero_error(skb);
 csum_error:
        __UDP6_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
 discard:
index 44e9c00..6b67aa1 100644 (file)
@@ -69,7 +69,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
 
        if (!exists) {
                ret = tcf_idr_create(tn, parm->index, est, a,
-                                    &act_sample_ops, bind, false);
+                                    &act_sample_ops, bind, true);
                if (ret) {
                        tcf_idr_cleanup(tn, parm->index);
                        return ret;
index 1a67af8..0a75cb2 100644 (file)
@@ -1902,6 +1902,8 @@ replay:
                                RTM_NEWCHAIN, false);
                break;
        case RTM_DELCHAIN:
+               tfilter_notify_chain(net, skb, block, q, parent, n,
+                                    chain, RTM_DELTFILTER);
                /* Flush the chain first as the user requested chain removal. */
                tcf_chain_flush(chain);
                /* In case the chain was successfully deleted, put a reference
index e6945e3..01f3f8f 100644 (file)
@@ -941,7 +941,8 @@ void dlci_ioctl_set(int (*hook) (unsigned int, void __user *))
 EXPORT_SYMBOL(dlci_ioctl_set);
 
 static long sock_do_ioctl(struct net *net, struct socket *sock,
-                                unsigned int cmd, unsigned long arg)
+                         unsigned int cmd, unsigned long arg,
+                         unsigned int ifreq_size)
 {
        int err;
        void __user *argp = (void __user *)arg;
@@ -967,11 +968,11 @@ static long sock_do_ioctl(struct net *net, struct socket *sock,
        } else {
                struct ifreq ifr;
                bool need_copyout;
-               if (copy_from_user(&ifr, argp, sizeof(struct ifreq)))
+               if (copy_from_user(&ifr, argp, ifreq_size))
                        return -EFAULT;
                err = dev_ioctl(net, cmd, &ifr, &need_copyout);
                if (!err && need_copyout)
-                       if (copy_to_user(argp, &ifr, sizeof(struct ifreq)))
+                       if (copy_to_user(argp, &ifr, ifreq_size))
                                return -EFAULT;
        }
        return err;
@@ -1070,7 +1071,8 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
                        err = open_related_ns(&net->ns, get_net_ns);
                        break;
                default:
-                       err = sock_do_ioctl(net, sock, cmd, arg);
+                       err = sock_do_ioctl(net, sock, cmd, arg,
+                                           sizeof(struct ifreq));
                        break;
                }
        return err;
@@ -2750,7 +2752,8 @@ static int do_siocgstamp(struct net *net, struct socket *sock,
        int err;
 
        set_fs(KERNEL_DS);
-       err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv);
+       err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv,
+                           sizeof(struct compat_ifreq));
        set_fs(old_fs);
        if (!err)
                err = compat_put_timeval(&ktv, up);
@@ -2766,7 +2769,8 @@ static int do_siocgstampns(struct net *net, struct socket *sock,
        int err;
 
        set_fs(KERNEL_DS);
-       err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts);
+       err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts,
+                           sizeof(struct compat_ifreq));
        set_fs(old_fs);
        if (!err)
                err = compat_put_timespec(&kts, up);
@@ -3072,7 +3076,8 @@ static int routing_ioctl(struct net *net, struct socket *sock,
        }
 
        set_fs(KERNEL_DS);
-       ret = sock_do_ioctl(net, sock, cmd, (unsigned long) r);
+       ret = sock_do_ioctl(net, sock, cmd, (unsigned long) r,
+                           sizeof(struct compat_ifreq));
        set_fs(old_fs);
 
 out:
@@ -3185,7 +3190,8 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
        case SIOCBONDSETHWADDR:
        case SIOCBONDCHANGEACTIVE:
        case SIOCGIFNAME:
-               return sock_do_ioctl(net, sock, cmd, arg);
+               return sock_do_ioctl(net, sock, cmd, arg,
+                                    sizeof(struct compat_ifreq));
        }
 
        return -ENOIOCTLCMD;
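
sock_do_ioctl() now takes the ifreq size explicitly because the compat (32-bit task on a 64-bit kernel) entry points hand it a struct compat_ifreq, which is smaller than the native struct ifreq; copying sizeof(struct ifreq) from a compat caller would read and write past the user buffer. Roughly the layout difference, as a hedged sketch with field shapes simplified (the real unions carry sockaddrs and more):

    #include <stdint.h>

    /* Simplified: the payload union is pointer-sized on the native ABI
     * but fixed 32-bit in the compat layout, so the two structs differ
     * in size and the copy length must match the caller's ABI.
     */
    struct ifreq_native {
            char name[16];
            union { void *data; long flags; } u;       /* 8 bytes on 64-bit */
    };

    struct ifreq_compat {
            char name[16];
            union { uint32_t data; int32_t flags; } u; /* always 4 bytes */
    };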
index 292742e..961b07d 100644 (file)
@@ -686,7 +686,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
                goto free_marker_record;
        }
 
-       crypto_info = &ctx->crypto_send;
+       crypto_info = &ctx->crypto_send.info;
        switch (crypto_info->cipher_type) {
        case TLS_CIPHER_AES_GCM_128:
                nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE;
@@ -780,7 +780,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
 
        ctx->priv_ctx_tx = offload_ctx;
        rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX,
-                                            &ctx->crypto_send,
+                                            &ctx->crypto_send.info,
                                             tcp_sk(sk)->write_seq);
        if (rc)
                goto release_netdev;
@@ -862,7 +862,7 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
                goto release_ctx;
 
        rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX,
-                                            &ctx->crypto_recv,
+                                            &ctx->crypto_recv.info,
                                             tcp_sk(sk)->copied_seq);
        if (rc) {
                pr_err_ratelimited("%s: The netdev has refused to offload this socket\n",
index 6102169..450a6db 100644 (file)
@@ -320,7 +320,7 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx,
                goto free_req;
 
        iv = buf;
-       memcpy(iv, tls_ctx->crypto_send_aes_gcm_128.salt,
+       memcpy(iv, tls_ctx->crypto_send.aes_gcm_128.salt,
               TLS_CIPHER_AES_GCM_128_SALT_SIZE);
        aad = buf + TLS_CIPHER_AES_GCM_128_SALT_SIZE +
              TLS_CIPHER_AES_GCM_128_IV_SIZE;
index 180b664..523622d 100644 (file)
@@ -241,6 +241,16 @@ static void tls_write_space(struct sock *sk)
        ctx->sk_write_space(sk);
 }
 
+static void tls_ctx_free(struct tls_context *ctx)
+{
+       if (!ctx)
+               return;
+
+       memzero_explicit(&ctx->crypto_send, sizeof(ctx->crypto_send));
+       memzero_explicit(&ctx->crypto_recv, sizeof(ctx->crypto_recv));
+       kfree(ctx);
+}
+
 static void tls_sk_proto_close(struct sock *sk, long timeout)
 {
        struct tls_context *ctx = tls_get_ctx(sk);
@@ -294,7 +304,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
 #else
        {
 #endif
-               kfree(ctx);
+               tls_ctx_free(ctx);
                ctx = NULL;
        }
 
@@ -305,7 +315,7 @@ skip_tx_cleanup:
         * for sk->sk_prot->unhash [tls_hw_unhash]
         */
        if (free_ctx)
-               kfree(ctx);
+               tls_ctx_free(ctx);
 }
 
 static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval,
@@ -330,7 +340,7 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval,
        }
 
        /* get user crypto info */
-       crypto_info = &ctx->crypto_send;
+       crypto_info = &ctx->crypto_send.info;
 
        if (!TLS_CRYPTO_INFO_READY(crypto_info)) {
                rc = -EBUSY;
@@ -417,9 +427,9 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
        }
 
        if (tx)
-               crypto_info = &ctx->crypto_send;
+               crypto_info = &ctx->crypto_send.info;
        else
-               crypto_info = &ctx->crypto_recv;
+               crypto_info = &ctx->crypto_recv.info;
 
        /* Currently we don't support set crypto info more than one time */
        if (TLS_CRYPTO_INFO_READY(crypto_info)) {
@@ -499,7 +509,7 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
        goto out;
 
 err_crypto_info:
-       memset(crypto_info, 0, sizeof(*crypto_info));
+       memzero_explicit(crypto_info, sizeof(union tls_crypto_context));
 out:
        return rc;
 }
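
Both tls_ctx_free() and the err_crypto_info path use memzero_explicit() rather than memset() because the memory is freed (or abandoned) immediately afterwards, and a plain memset of soon-dead storage may be elided by the compiler as a dead store, leaving key material behind. A userspace sketch of the same idea, using a volatile function pointer as one common way to defeat dead-store elimination:

    #include <stdlib.h>
    #include <string.h>

    /* Volatile pointer to memset: the compiler cannot prove the call
     * away, so the wipe survives dead-store elimination.
     */
    static void *(*volatile memset_v)(void *, int, size_t) = memset;

    static void secret_free(void *secret, size_t len)
    {
            memset_v(secret, 0, len);   /* guaranteed wipe */
            free(secret);
    }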
index e28a6ff..b9c6ecf 100644 (file)
@@ -931,7 +931,15 @@ int tls_sw_recvmsg(struct sock *sk,
                                if (control != TLS_RECORD_TYPE_DATA)
                                        goto recv_end;
                        }
+               } else {
+                       /* MSG_PEEK right now cannot look beyond current skb
+                        * from strparser, meaning we cannot advance skb here
+                        * and thus unpause strparser, since we'd lose the
+                        * original one.
+                        */
+                       break;
                }
+
                /* If we have a new message from strparser, continue now. */
                if (copied >= target && !ctx->recv_pkt)
                        break;
@@ -1055,8 +1063,8 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
                goto read_failure;
        }
 
-       if (header[1] != TLS_VERSION_MINOR(tls_ctx->crypto_recv.version) ||
-           header[2] != TLS_VERSION_MAJOR(tls_ctx->crypto_recv.version)) {
+       if (header[1] != TLS_VERSION_MINOR(tls_ctx->crypto_recv.info.version) ||
+           header[2] != TLS_VERSION_MAJOR(tls_ctx->crypto_recv.info.version)) {
                ret = -EINVAL;
                goto read_failure;
        }
@@ -1136,7 +1144,6 @@ void tls_sw_free_resources_rx(struct sock *sk)
 
 int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 {
-       char keyval[TLS_CIPHER_AES_GCM_128_KEY_SIZE];
        struct tls_crypto_info *crypto_info;
        struct tls12_crypto_info_aes_gcm_128 *gcm_128_info;
        struct tls_sw_context_tx *sw_ctx_tx = NULL;
@@ -1181,12 +1188,12 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 
        if (tx) {
                crypto_init_wait(&sw_ctx_tx->async_wait);
-               crypto_info = &ctx->crypto_send;
+               crypto_info = &ctx->crypto_send.info;
                cctx = &ctx->tx;
                aead = &sw_ctx_tx->aead_send;
        } else {
                crypto_init_wait(&sw_ctx_rx->async_wait);
-               crypto_info = &ctx->crypto_recv;
+               crypto_info = &ctx->crypto_recv.info;
                cctx = &ctx->rx;
                aead = &sw_ctx_rx->aead_recv;
        }
@@ -1265,9 +1272,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 
        ctx->push_pending_record = tls_sw_push_pending_record;
 
-       memcpy(keyval, gcm_128_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
-
-       rc = crypto_aead_setkey(*aead, keyval,
+       rc = crypto_aead_setkey(*aead, gcm_128_info->key,
                                TLS_CIPHER_AES_GCM_128_KEY_SIZE);
        if (rc)
                goto free_aead;
diff --git a/scripts/subarch.include b/scripts/subarch.include
new file mode 100644 (file)
index 0000000..6506828
--- /dev/null
@@ -0,0 +1,13 @@
+# SUBARCH tells the usermode build what the underlying arch is.  That is set
+# first, and if a usermode build is happening, the "ARCH=um" on the command
+# line overrides the setting of ARCH below.  If a native build is happening,
+# then ARCH is assigned, getting whatever value it gets normally, and
+# SUBARCH is subsequently ignored.
+
+SUBARCH := $(shell uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/ \
+                                 -e s/sun4u/sparc64/ \
+                                 -e s/arm.*/arm/ -e s/sa110/arm/ \
+                                 -e s/s390x/s390/ -e s/parisc64/parisc/ \
+                                 -e s/ppc.*/powerpc/ -e s/mips.*/mips/ \
+                                 -e s/sh[234].*/sh/ -e s/aarch64.*/arm64/ \
+                                 -e s/riscv.*/riscv/)
index 730ea91..9367635 100644 (file)
@@ -263,6 +263,8 @@ do_registration(struct work_struct *work)
 error:
        mutex_unlock(&devices_mutex);
        snd_bebob_stream_destroy_duplex(bebob);
+       kfree(bebob->maudio_special_quirk);
+       bebob->maudio_special_quirk = NULL;
        snd_card_free(bebob->card);
        dev_info(&bebob->unit->device,
                 "Sound card registration failed: %d\n", err);
index bd55620..c266997 100644 (file)
@@ -96,17 +96,13 @@ int snd_bebob_maudio_load_firmware(struct fw_unit *unit)
        struct fw_device *device = fw_parent_device(unit);
        int err, rcode;
        u64 date;
-       __le32 cues[3] = {
-               cpu_to_le32(MAUDIO_BOOTLOADER_CUE1),
-               cpu_to_le32(MAUDIO_BOOTLOADER_CUE2),
-               cpu_to_le32(MAUDIO_BOOTLOADER_CUE3)
-       };
+       __le32 *cues;
 
        /* check date of software used to build */
        err = snd_bebob_read_block(unit, INFO_OFFSET_SW_DATE,
                                   &date, sizeof(u64));
        if (err < 0)
-               goto end;
+               return err;
        /*
         * firmware version 5058 or later has a build date later than
         * "20070401", but 'date' is not null-terminated.
@@ -114,20 +110,28 @@ int snd_bebob_maudio_load_firmware(struct fw_unit *unit)
        if (date < 0x3230303730343031LL) {
                dev_err(&unit->device,
                        "Use firmware version 5058 or later\n");
-               err = -ENOSYS;
-               goto end;
+               return -ENXIO;
        }
 
+       cues = kmalloc_array(3, sizeof(*cues), GFP_KERNEL);
+       if (!cues)
+               return -ENOMEM;
+
+       cues[0] = cpu_to_le32(MAUDIO_BOOTLOADER_CUE1);
+       cues[1] = cpu_to_le32(MAUDIO_BOOTLOADER_CUE2);
+       cues[2] = cpu_to_le32(MAUDIO_BOOTLOADER_CUE3);
+
        rcode = fw_run_transaction(device->card, TCODE_WRITE_BLOCK_REQUEST,
                                   device->node_id, device->generation,
                                   device->max_speed, BEBOB_ADDR_REG_REQ,
-                                  cues, sizeof(cues));
+                                  cues, 3 * sizeof(*cues));
+       kfree(cues);
        if (rcode != RCODE_COMPLETE) {
                dev_err(&unit->device,
                        "Failed to send a cue to load firmware\n");
                err = -EIO;
        }
-end:
+
        return err;
 }
 
@@ -290,10 +294,6 @@ snd_bebob_maudio_special_discover(struct snd_bebob *bebob, bool is1814)
                bebob->midi_output_ports = 2;
        }
 end:
-       if (err < 0) {
-               kfree(params);
-               bebob->maudio_special_quirk = NULL;
-       }
        mutex_unlock(&bebob->mutex);
        return err;
 }
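This and the other firewire error-path fixes in this series (the fireworks and oxfw hunks below in particular) share one idiom: free a resource exactly once, and NULL the pointer wherever a retried registration work item or the eventual .remove callback could see it again. A minimal sketch of the pattern:

/* kfree(NULL) is a no-op, so clearing the pointer after freeing makes
 * any repeated cleanup path safe against double-free.
 */
kfree(obj->buf);
obj->buf = NULL;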
index 1f5e1d2..ef68999 100644 (file)
@@ -49,6 +49,7 @@ static void dg00x_free(struct snd_dg00x *dg00x)
        fw_unit_put(dg00x->unit);
 
        mutex_destroy(&dg00x->mutex);
+       kfree(dg00x);
 }
 
 static void dg00x_card_free(struct snd_card *card)
index ad7a0a3..64c3cb0 100644 (file)
@@ -146,6 +146,7 @@ static int ff400_switch_fetching_mode(struct snd_ff *ff, bool enable)
 {
        __le32 *reg;
        int i;
+       int err;
 
        reg = kcalloc(18, sizeof(__le32), GFP_KERNEL);
        if (reg == NULL)
@@ -163,9 +164,11 @@ static int ff400_switch_fetching_mode(struct snd_ff *ff, bool enable)
                        reg[i] = cpu_to_le32(0x00000001);
        }
 
-       return snd_fw_transaction(ff->unit, TCODE_WRITE_BLOCK_REQUEST,
-                                 FF400_FETCH_PCM_FRAMES, reg,
-                                 sizeof(__le32) * 18, 0);
+       err = snd_fw_transaction(ff->unit, TCODE_WRITE_BLOCK_REQUEST,
+                                FF400_FETCH_PCM_FRAMES, reg,
+                                sizeof(__le32) * 18, 0);
+       kfree(reg);
+       return err;
 }
 
 static void ff400_dump_sync_status(struct snd_ff *ff,
index 71a0613..f2d0733 100644 (file)
@@ -301,6 +301,8 @@ error:
        snd_efw_transaction_remove_instance(efw);
        snd_efw_stream_destroy_duplex(efw);
        snd_card_free(efw->card);
+       kfree(efw->resp_buf);
+       efw->resp_buf = NULL;
        dev_info(&efw->unit->device,
                 "Sound card registration failed: %d\n", err);
 }
index 1e5b2c8..2ea8be6 100644 (file)
@@ -130,6 +130,7 @@ static void oxfw_free(struct snd_oxfw *oxfw)
 
        kfree(oxfw->spec);
        mutex_destroy(&oxfw->mutex);
+       kfree(oxfw);
 }
 
 /*
@@ -207,6 +208,7 @@ static int detect_quirks(struct snd_oxfw *oxfw)
 static void do_registration(struct work_struct *work)
 {
        struct snd_oxfw *oxfw = container_of(work, struct snd_oxfw, dwork.work);
+       int i;
        int err;
 
        if (oxfw->registered)
@@ -269,7 +271,15 @@ error:
        snd_oxfw_stream_destroy_simplex(oxfw, &oxfw->rx_stream);
        if (oxfw->has_output)
                snd_oxfw_stream_destroy_simplex(oxfw, &oxfw->tx_stream);
+       for (i = 0; i < SND_OXFW_STREAM_FORMAT_ENTRIES; ++i) {
+               kfree(oxfw->tx_stream_formats[i]);
+               oxfw->tx_stream_formats[i] = NULL;
+               kfree(oxfw->rx_stream_formats[i]);
+               oxfw->rx_stream_formats[i] = NULL;
+       }
        snd_card_free(oxfw->card);
+       kfree(oxfw->spec);
+       oxfw->spec = NULL;
        dev_info(&oxfw->unit->device,
                 "Sound card registration failed: %d\n", err);
 }
index 44ad41f..d3fdc46 100644 (file)
@@ -93,6 +93,7 @@ static void tscm_free(struct snd_tscm *tscm)
        fw_unit_put(tscm->unit);
 
        mutex_destroy(&tscm->mutex);
+       kfree(tscm);
 }
 
 static void tscm_card_free(struct snd_card *card)
index 560ec09..74244d8 100644 (file)
@@ -40,6 +40,8 @@ static void azx_clear_corbrp(struct hdac_bus *bus)
  */
 void snd_hdac_bus_init_cmd_io(struct hdac_bus *bus)
 {
+       WARN_ON_ONCE(!bus->rb.area);
+
        spin_lock_irq(&bus->reg_lock);
        /* CORB set up */
        bus->corb.addr = bus->rb.addr;
@@ -383,7 +385,7 @@ void snd_hdac_bus_exit_link_reset(struct hdac_bus *bus)
 EXPORT_SYMBOL_GPL(snd_hdac_bus_exit_link_reset);
 
 /* reset codec link */
-static int azx_reset(struct hdac_bus *bus, bool full_reset)
+int snd_hdac_bus_reset_link(struct hdac_bus *bus, bool full_reset)
 {
        if (!full_reset)
                goto skip_reset;
@@ -408,7 +410,7 @@ static int azx_reset(struct hdac_bus *bus, bool full_reset)
  skip_reset:
        /* check to see if controller is ready */
        if (!snd_hdac_chip_readb(bus, GCTL)) {
-               dev_dbg(bus->dev, "azx_reset: controller not ready!\n");
+               dev_dbg(bus->dev, "controller not ready!\n");
                return -EBUSY;
        }
 
@@ -423,6 +425,7 @@ static int azx_reset(struct hdac_bus *bus, bool full_reset)
 
        return 0;
 }
+EXPORT_SYMBOL_GPL(snd_hdac_bus_reset_link);
 
 /* enable interrupts */
 static void azx_int_enable(struct hdac_bus *bus)
@@ -477,15 +480,17 @@ bool snd_hdac_bus_init_chip(struct hdac_bus *bus, bool full_reset)
                return false;
 
        /* reset controller */
-       azx_reset(bus, full_reset);
+       snd_hdac_bus_reset_link(bus, full_reset);
 
-       /* initialize interrupts */
+       /* clear interrupts */
        azx_int_clear(bus);
-       azx_int_enable(bus);
 
        /* initialize the codec command I/O */
        snd_hdac_bus_init_cmd_io(bus);
 
+       /* enable interrupts after CORB/RIRB buffers are initialized above */
+       azx_int_enable(bus);
+
        /* program the position buffer */
        if (bus->use_posbuf && bus->posbuf.addr) {
                snd_hdac_chip_writel(bus, DPLBASE, (u32)bus->posbuf.addr);
index 9071374..6ebe817 100644 (file)
@@ -2540,7 +2540,7 @@ static int snd_emu10k1_fx8010_ioctl(struct snd_hwdep * hw, struct file *file, un
                emu->support_tlv = 1;
                return put_user(SNDRV_EMU10K1_VERSION, (int __user *)argp);
        case SNDRV_EMU10K1_IOCTL_INFO:
-               info = kmalloc(sizeof(*info), GFP_KERNEL);
+               info = kzalloc(sizeof(*info), GFP_KERNEL);
                if (!info)
                        return -ENOMEM;
                snd_emu10k1_fx8010_info(emu, info);
index 1b2ce30..aa4c672 100644 (file)
@@ -365,8 +365,10 @@ enum {
  */
 #ifdef SUPPORT_VGA_SWITCHEROO
 #define use_vga_switcheroo(chip)       ((chip)->use_vga_switcheroo)
+#define needs_eld_notify_link(chip)    ((chip)->need_eld_notify_link)
 #else
 #define use_vga_switcheroo(chip)       0
+#define needs_eld_notify_link(chip)    false
 #endif
 
 #define CONTROLLER_IN_GPU(pci) (((pci)->device == 0x0a0c) || \
@@ -453,6 +455,7 @@ static inline void mark_runtime_wc(struct azx *chip, struct azx_dev *azx_dev,
 #endif
 
 static int azx_acquire_irq(struct azx *chip, int do_disconnect);
+static void set_default_power_save(struct azx *chip);
 
 /*
  * initialize the PCI registers
@@ -1201,6 +1204,10 @@ static int azx_runtime_idle(struct device *dev)
            azx_bus(chip)->codec_powered || !chip->running)
                return -EBUSY;
 
+       /* ELD notification gets broken when HD-audio bus is off */
+       if (needs_eld_notify_link(hda))
+               return -EBUSY;
+
        return 0;
 }
 
@@ -1298,6 +1305,36 @@ static bool azx_vs_can_switch(struct pci_dev *pci)
        return true;
 }
 
+/*
+ * The discrete GPU cannot power down unless the HDA controller runtime
+ * suspends, so activate runtime PM on codecs even if power_save == 0.
+ */
+static void setup_vga_switcheroo_runtime_pm(struct azx *chip)
+{
+       struct hda_intel *hda = container_of(chip, struct hda_intel, chip);
+       struct hda_codec *codec;
+
+       if (hda->use_vga_switcheroo && !hda->need_eld_notify_link) {
+               list_for_each_codec(codec, &chip->bus)
+                       codec->auto_runtime_pm = 1;
+               /* reset the power save setup */
+               if (chip->running)
+                       set_default_power_save(chip);
+       }
+}
+
+static void azx_vs_gpu_bound(struct pci_dev *pci,
+                            enum vga_switcheroo_client_id client_id)
+{
+       struct snd_card *card = pci_get_drvdata(pci);
+       struct azx *chip = card->private_data;
+       struct hda_intel *hda = container_of(chip, struct hda_intel, chip);
+
+       if (client_id == VGA_SWITCHEROO_DIS)
+               hda->need_eld_notify_link = 0;
+       setup_vga_switcheroo_runtime_pm(chip);
+}
+
 static void init_vga_switcheroo(struct azx *chip)
 {
        struct hda_intel *hda = container_of(chip, struct hda_intel, chip);
@@ -1306,6 +1343,7 @@ static void init_vga_switcheroo(struct azx *chip)
                dev_info(chip->card->dev,
                         "Handle vga_switcheroo audio client\n");
                hda->use_vga_switcheroo = 1;
+               hda->need_eld_notify_link = 1; /* cleared in gpu_bound op */
                chip->driver_caps |= AZX_DCAPS_PM_RUNTIME;
                pci_dev_put(p);
        }
@@ -1314,6 +1352,7 @@ static void init_vga_switcheroo(struct azx *chip)
 static const struct vga_switcheroo_client_ops azx_vs_ops = {
        .set_gpu_state = azx_vs_set_state,
        .can_switch = azx_vs_can_switch,
+       .gpu_bound = azx_vs_gpu_bound,
 };
 
 static int register_vga_switcheroo(struct azx *chip)
@@ -1339,6 +1378,7 @@ static int register_vga_switcheroo(struct azx *chip)
 #define init_vga_switcheroo(chip)              /* NOP */
 #define register_vga_switcheroo(chip)          0
 #define check_hdmi_disabled(pci)       false
+#define setup_vga_switcheroo_runtime_pm(chip)  /* NOP */
 #endif /* SUPPORT_VGA_SWITCHEROO */
 
 /*
@@ -1352,6 +1392,7 @@ static int azx_free(struct azx *chip)
 
        if (azx_has_pm_runtime(chip) && chip->running)
                pm_runtime_get_noresume(&pci->dev);
+       chip->running = 0;
 
        azx_del_card_list(chip);
 
@@ -2230,6 +2271,25 @@ static struct snd_pci_quirk power_save_blacklist[] = {
 };
 #endif /* CONFIG_PM */
 
+static void set_default_power_save(struct azx *chip)
+{
+       int val = power_save;
+
+#ifdef CONFIG_PM
+       if (pm_blacklist) {
+               const struct snd_pci_quirk *q;
+
+               q = snd_pci_quirk_lookup(chip->pci, power_save_blacklist);
+               if (q && val) {
+                       dev_info(chip->card->dev, "device %04x:%04x is on the power_save blacklist, forcing power_save to 0\n",
+                                q->subvendor, q->subdevice);
+                       val = 0;
+               }
+       }
+#endif /* CONFIG_PM */
+       snd_hda_set_power_save(&chip->bus, val * 1000);
+}
+
 /* number of codec slots for each chipset: 0 = default slots (i.e. 4) */
 static unsigned int azx_max_codecs[AZX_NUM_DRIVERS] = {
        [AZX_DRIVER_NVIDIA] = 8,
@@ -2241,9 +2301,7 @@ static int azx_probe_continue(struct azx *chip)
        struct hda_intel *hda = container_of(chip, struct hda_intel, chip);
        struct hdac_bus *bus = azx_bus(chip);
        struct pci_dev *pci = chip->pci;
-       struct hda_codec *codec;
        int dev = chip->dev_index;
-       int val;
        int err;
 
        hda->probe_continued = 1;
@@ -2322,31 +2380,13 @@ static int azx_probe_continue(struct azx *chip)
        if (err < 0)
                goto out_free;
 
+       setup_vga_switcheroo_runtime_pm(chip);
+
        chip->running = 1;
        azx_add_card_list(chip);
 
-       val = power_save;
-#ifdef CONFIG_PM
-       if (pm_blacklist) {
-               const struct snd_pci_quirk *q;
-
-               q = snd_pci_quirk_lookup(chip->pci, power_save_blacklist);
-               if (q && val) {
-                       dev_info(chip->card->dev, "device %04x:%04x is on the power_save blacklist, forcing power_save to 0\n",
-                                q->subvendor, q->subdevice);
-                       val = 0;
-               }
-       }
-#endif /* CONFIG_PM */
-       /*
-        * The discrete GPU cannot power down unless the HDA controller runtime
-        * suspends, so activate runtime PM on codecs even if power_save == 0.
-        */
-       if (use_vga_switcheroo(hda))
-               list_for_each_codec(codec, &chip->bus)
-                       codec->auto_runtime_pm = 1;
+       set_default_power_save(chip);
 
-       snd_hda_set_power_save(&chip->bus, val * 1000);
        if (azx_has_pm_runtime(chip))
                pm_runtime_put_autosuspend(&pci->dev);
 
index e3a3d31..f59719e 100644 (file)
@@ -37,6 +37,7 @@ struct hda_intel {
 
        /* vga_switcheroo setup */
        unsigned int use_vga_switcheroo:1;
+       unsigned int need_eld_notify_link:1;
        unsigned int vga_switcheroo_registered:1;
        unsigned int init_failed:1; /* delayed init failed */
 
index e359938..77b265b 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/delay.h>
 #include <linux/io.h>
+#include <linux/iopoll.h>
 #include <linux/sizes.h>
 #include <linux/pm_runtime.h>
 
@@ -184,6 +185,24 @@ static void config_dma_descriptor_in_sram(void __iomem *acp_mmio,
        acp_reg_write(descr_info->xfer_val, acp_mmio, mmACP_SRBM_Targ_Idx_Data);
 }
 
+static void pre_config_reset(void __iomem *acp_mmio, u16 ch_num)
+{
+       u32 dma_ctrl;
+       int ret;
+
+       /* clear the reset bit */
+       dma_ctrl = acp_reg_read(acp_mmio, mmACP_DMA_CNTL_0 + ch_num);
+       dma_ctrl &= ~ACP_DMA_CNTL_0__DMAChRst_MASK;
+       acp_reg_write(dma_ctrl, acp_mmio, mmACP_DMA_CNTL_0 + ch_num);
+       /* check the reset bit before programming configuration registers */
+       ret = readl_poll_timeout(acp_mmio + ((mmACP_DMA_CNTL_0 + ch_num) * 4),
+                                dma_ctrl,
+                                !(dma_ctrl & ACP_DMA_CNTL_0__DMAChRst_MASK),
+                                100, ACP_DMA_RESET_TIME);
+       if (ret < 0)
+               pr_err("Failed to clear reset of channel: %d\n", ch_num);
+}
+
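readl_poll_timeout() comes from the newly included <linux/iopoll.h>. Roughly speaking (the real macro tracks elapsed time with ktime and sleeps with usleep_range()), the call above behaves like this sketch, evaluating to 0 once the condition holds and to -ETIMEDOUT after the timeout argument in microseconds:

/* Sketch only, not the literal macro expansion. */
u64 waited_us = 0;
for (;;) {
        dma_ctrl = readl(acp_mmio + ((mmACP_DMA_CNTL_0 + ch_num) * 4));
        if (!(dma_ctrl & ACP_DMA_CNTL_0__DMAChRst_MASK))
                break;                          /* reset bit cleared */
        if (waited_us >= ACP_DMA_RESET_TIME)
                return -ETIMEDOUT;              /* give up */
        usleep_range(100, 200);                 /* the sleep_us argument */
        waited_us += 100;
}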
 /*
  * Initialize the DMA descriptor information for transfer between
  * system memory <-> ACP SRAM
@@ -236,6 +255,7 @@ static void set_acp_sysmem_dma_descriptors(void __iomem *acp_mmio,
                config_dma_descriptor_in_sram(acp_mmio, dma_dscr_idx,
                                              &dmadscr[i]);
        }
+       pre_config_reset(acp_mmio, ch);
        config_acp_dma_channel(acp_mmio, ch,
                               dma_dscr_idx - 1,
                               NUM_DSCRS_PER_CHANNEL,
@@ -275,6 +295,7 @@ static void set_acp_to_i2s_dma_descriptors(void __iomem *acp_mmio, u32 size,
                config_dma_descriptor_in_sram(acp_mmio, dma_dscr_idx,
                                              &dmadscr[i]);
        }
+       pre_config_reset(acp_mmio, ch);
        /* Configure the DMA channel with the above descriptore */
        config_acp_dma_channel(acp_mmio, ch, dma_dscr_idx - 1,
                               NUM_DSCRS_PER_CHANNEL,
index 275677d..4075541 100644 (file)
@@ -157,8 +157,8 @@ static const struct snd_kcontrol_new cs4265_snd_controls[] = {
        SOC_SINGLE("Validity Bit Control Switch", CS4265_SPDIF_CTL2,
                                3, 1, 0),
        SOC_ENUM("SPDIF Mono/Stereo", spdif_mono_stereo_enum),
-       SOC_SINGLE("MMTLR Data Switch", 0,
-                               1, 1, 0),
+       SOC_SINGLE("MMTLR Data Switch", CS4265_SPDIF_CTL2,
+                               0, 1, 0),
        SOC_ENUM("Mono Channel Select", spdif_mono_select_enum),
        SND_SOC_BYTES("C Data Buffer", CS4265_C_DATA_BUFF, 24),
 };
index 92b7125..1093f76 100644 (file)
@@ -520,6 +520,7 @@ static bool max98373_volatile_reg(struct device *dev, unsigned int reg)
 {
        switch (reg) {
        case MAX98373_R2000_SW_RESET ... MAX98373_R2009_INT_FLAG3:
+       case MAX98373_R203E_AMP_PATH_GAIN:
        case MAX98373_R2054_MEAS_ADC_PVDD_CH_READBACK:
        case MAX98373_R2055_MEAS_ADC_THERM_CH_READBACK:
        case MAX98373_R20B6_BDE_CUR_STATE_READBACK:
@@ -729,6 +730,7 @@ static int max98373_probe(struct snd_soc_component *component)
        /* Software Reset */
        regmap_write(max98373->regmap,
                MAX98373_R2000_SW_RESET, MAX98373_SOFT_RESET);
+       usleep_range(10000, 11000);
 
        /* IV default slot configuration */
        regmap_write(max98373->regmap,
@@ -817,6 +819,7 @@ static int max98373_resume(struct device *dev)
 
        regmap_write(max98373->regmap,
                MAX98373_R2000_SW_RESET, MAX98373_SOFT_RESET);
+       usleep_range(10000, 11000);
        regcache_cache_only(max98373->regmap, false);
        regcache_sync(max98373->regmap);
        return 0;
index dca82dd..32fe76c 100644 (file)
@@ -64,8 +64,8 @@ static const struct reg_sequence rt5514_patch[] = {
        {RT5514_ANA_CTRL_LDO10,         0x00028604},
        {RT5514_ANA_CTRL_ADCFED,        0x00000800},
        {RT5514_ASRC_IN_CTRL1,          0x00000003},
-       {RT5514_DOWNFILTER0_CTRL3,      0x10000352},
-       {RT5514_DOWNFILTER1_CTRL3,      0x10000352},
+       {RT5514_DOWNFILTER0_CTRL3,      0x10000342},
+       {RT5514_DOWNFILTER1_CTRL3,      0x10000342},
 };
 
 static const struct reg_default rt5514_reg[] = {
@@ -92,10 +92,10 @@ static const struct reg_default rt5514_reg[] = {
        {RT5514_ASRC_IN_CTRL1,          0x00000003},
        {RT5514_DOWNFILTER0_CTRL1,      0x00020c2f},
        {RT5514_DOWNFILTER0_CTRL2,      0x00020c2f},
-       {RT5514_DOWNFILTER0_CTRL3,      0x10000352},
+       {RT5514_DOWNFILTER0_CTRL3,      0x10000342},
        {RT5514_DOWNFILTER1_CTRL1,      0x00020c2f},
        {RT5514_DOWNFILTER1_CTRL2,      0x00020c2f},
-       {RT5514_DOWNFILTER1_CTRL3,      0x10000352},
+       {RT5514_DOWNFILTER1_CTRL3,      0x10000342},
        {RT5514_ANA_CTRL_LDO10,         0x00028604},
        {RT5514_ANA_CTRL_LDO18_16,      0x02000345},
        {RT5514_ANA_CTRL_ADC12,         0x0000a2a8},
index 640d400..afe7d5b 100644 (file)
@@ -750,8 +750,8 @@ static bool rt5682_readable_register(struct device *dev, unsigned int reg)
 }
 
 static const DECLARE_TLV_DB_SCALE(hp_vol_tlv, -2250, 150, 0);
-static const DECLARE_TLV_DB_SCALE(dac_vol_tlv, -65625, 375, 0);
-static const DECLARE_TLV_DB_SCALE(adc_vol_tlv, -17625, 375, 0);
+static const DECLARE_TLV_DB_SCALE(dac_vol_tlv, -6525, 75, 0);
+static const DECLARE_TLV_DB_SCALE(adc_vol_tlv, -1725, 75, 0);
 static const DECLARE_TLV_DB_SCALE(adc_bst_tlv, 0, 1200, 0);
 
 /* {0, +20, +24, +30, +35, +40, +44, +50, +52} dB */
@@ -1114,7 +1114,7 @@ static const struct snd_kcontrol_new rt5682_snd_controls[] = {
 
        /* DAC Digital Volume */
        SOC_DOUBLE_TLV("DAC1 Playback Volume", RT5682_DAC1_DIG_VOL,
-               RT5682_L_VOL_SFT, RT5682_R_VOL_SFT, 175, 0, dac_vol_tlv),
+               RT5682_L_VOL_SFT + 1, RT5682_R_VOL_SFT + 1, 86, 0, dac_vol_tlv),
 
        /* IN Boost Volume */
        SOC_SINGLE_TLV("CBJ Boost Volume", RT5682_CBJ_BST_CTRL,
@@ -1124,7 +1124,7 @@ static const struct snd_kcontrol_new rt5682_snd_controls[] = {
        SOC_DOUBLE("STO1 ADC Capture Switch", RT5682_STO1_ADC_DIG_VOL,
                RT5682_L_MUTE_SFT, RT5682_R_MUTE_SFT, 1, 1),
        SOC_DOUBLE_TLV("STO1 ADC Capture Volume", RT5682_STO1_ADC_DIG_VOL,
-               RT5682_L_VOL_SFT, RT5682_R_VOL_SFT, 127, 0, adc_vol_tlv),
+               RT5682_L_VOL_SFT + 1, RT5682_R_VOL_SFT + 1, 63, 0, adc_vol_tlv),
 
        /* ADC Boost Volume Control */
        SOC_DOUBLE_TLV("STO1 ADC Boost Gain Volume", RT5682_STO1_ADC_BOOST,
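Assuming the standard TLV convention of 0.01 dB units, the corrected declarations above work out as follows; the +1 register shifts drop the volume field's lowest bit, which is why the step size doubles and the control maxima roughly halve:

/* dac_vol_tlv: min -6525 -> -65.25 dB, step 75 -> 0.75 dB
 *   top of range: -65.25 dB + 86 * 0.75 dB = -0.75 dB
 * adc_vol_tlv: min -1725 -> -17.25 dB, step 75 -> 0.75 dB
 *   top of range: -17.25 dB + 63 * 0.75 dB = +30.00 dB
 */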
index d53680a..6df1586 100644 (file)
@@ -117,8 +117,7 @@ static int sigmadsp_ctrl_write(struct sigmadsp *sigmadsp,
        struct sigmadsp_control *ctrl, void *data)
 {
        /* safeload loads up to 20 bytes in an atomic operation */
-       if (ctrl->num_bytes > 4 && ctrl->num_bytes <= 20 && sigmadsp->ops &&
-           sigmadsp->ops->safeload)
+       if (ctrl->num_bytes <= 20 && sigmadsp->ops && sigmadsp->ops->safeload)
                return sigmadsp->ops->safeload(sigmadsp, ctrl->addr, data,
                        ctrl->num_bytes);
        else
index 14999b9..0d61455 100644 (file)
@@ -424,8 +424,10 @@ static void tas6424_fault_check_work(struct work_struct *work)
               TAS6424_FAULT_PVDD_UV |
               TAS6424_FAULT_VBAT_UV;
 
-       if (reg)
+       if (!reg) {
+               tas6424->last_fault1 = reg;
                goto check_global_fault2_reg;
+       }
 
        /*
         * Only flag errors once for a given occurrence. This is needed as
@@ -461,8 +463,10 @@ check_global_fault2_reg:
               TAS6424_FAULT_OTSD_CH3 |
               TAS6424_FAULT_OTSD_CH4;
 
-       if (!reg)
+       if (!reg) {
+               tas6424->last_fault2 = reg;
                goto check_warn_reg;
+       }
 
        if ((reg & TAS6424_FAULT_OTSD) && !(tas6424->last_fault2 & TAS6424_FAULT_OTSD))
                dev_crit(dev, "experienced a global overtemp shutdown\n");
@@ -497,8 +501,10 @@ check_warn_reg:
               TAS6424_WARN_VDD_OTW_CH3 |
               TAS6424_WARN_VDD_OTW_CH4;
 
-       if (!reg)
+       if (!reg) {
+               tas6424->last_warn = reg;
                goto out;
+       }
 
        if ((reg & TAS6424_WARN_VDD_UV) && !(tas6424->last_warn & TAS6424_WARN_VDD_UV))
                dev_warn(dev, "experienced a VDD under voltage condition\n");
index f27464c..7954196 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/i2c.h>
+#include <linux/acpi.h>
 
 #include "wm8804.h"
 
@@ -40,17 +41,29 @@ static const struct i2c_device_id wm8804_i2c_id[] = {
 };
 MODULE_DEVICE_TABLE(i2c, wm8804_i2c_id);
 
+#if defined(CONFIG_OF)
 static const struct of_device_id wm8804_of_match[] = {
        { .compatible = "wlf,wm8804", },
        { }
 };
 MODULE_DEVICE_TABLE(of, wm8804_of_match);
+#endif
+
+#ifdef CONFIG_ACPI
+static const struct acpi_device_id wm8804_acpi_match[] = {
+       { "1AEC8804", 0 }, /* Wolfson PCI ID + part ID */
+       { "10138804", 0 }, /* Cirrus Logic PCI ID + part ID */
+       { },
+};
+MODULE_DEVICE_TABLE(acpi, wm8804_acpi_match);
+#endif
 
 static struct i2c_driver wm8804_i2c_driver = {
        .driver = {
                .name = "wm8804",
                .pm = &wm8804_pm,
-               .of_match_table = wm8804_of_match,
+               .of_match_table = of_match_ptr(wm8804_of_match),
+               .acpi_match_table = ACPI_PTR(wm8804_acpi_match),
        },
        .probe = wm8804_i2c_probe,
        .remove = wm8804_i2c_remove,
index 953d94d..ade34c2 100644 (file)
@@ -719,7 +719,7 @@ static int wm9712_probe(struct platform_device *pdev)
 
 static struct platform_driver wm9712_component_driver = {
        .driver = {
-               .name = "wm9712-component",
+               .name = "wm9712-codec",
        },
 
        .probe = wm9712_probe,
index d32844f..b6dc524 100644 (file)
@@ -575,6 +575,17 @@ static const struct dmi_system_id byt_rt5640_quirk_table[] = {
                                        BYT_RT5640_MONO_SPEAKER |
                                        BYT_RT5640_MCLK_EN),
        },
+       {       /* Linx Linx7 tablet */
+               .matches = {
+                       DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LINX"),
+                       DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "LINX7"),
+               },
+               .driver_data = (void *)(BYTCR_INPUT_DEFAULTS |
+                                       BYT_RT5640_MONO_SPEAKER |
+                                       BYT_RT5640_JD_NOT_INV |
+                                       BYT_RT5640_SSP0_AIF1 |
+                                       BYT_RT5640_MCLK_EN),
+       },
        {       /* MSI S100 tablet */
                .matches = {
                        DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Micro-Star International Co., Ltd."),
@@ -602,6 +613,21 @@ static const struct dmi_system_id byt_rt5640_quirk_table[] = {
                                        BYT_RT5640_SSP0_AIF1 |
                                        BYT_RT5640_MCLK_EN),
        },
+       {       /* Onda v975w */
+               .matches = {
+                       DMI_EXACT_MATCH(DMI_BOARD_VENDOR, "AMI Corporation"),
+                       DMI_EXACT_MATCH(DMI_BOARD_NAME, "Aptio CRB"),
+                       /* The above are too generic, also match BIOS info */
+                       DMI_EXACT_MATCH(DMI_BIOS_VERSION, "5.6.5"),
+                       DMI_EXACT_MATCH(DMI_BIOS_DATE, "07/25/2014"),
+               },
+               .driver_data = (void *)(BYT_RT5640_IN1_MAP |
+                                       BYT_RT5640_JD_SRC_JD2_IN4N |
+                                       BYT_RT5640_OVCD_TH_2000UA |
+                                       BYT_RT5640_OVCD_SF_0P75 |
+                                       BYT_RT5640_DIFF_MIC |
+                                       BYT_RT5640_MCLK_EN),
+       },
        {       /* Pipo W4 */
                .matches = {
                        DMI_EXACT_MATCH(DMI_BOARD_VENDOR, "AMI Corporation"),
index dce6494..1d17be0 100644 (file)
@@ -834,7 +834,7 @@ static int skl_first_init(struct hdac_bus *bus)
                return -ENXIO;
        }
 
-       skl_init_chip(bus, true);
+       snd_hdac_bus_reset_link(bus, true);
 
        snd_hdac_bus_parse_capabilities(bus);
 
index dc94c5c..c6b5157 100644 (file)
@@ -960,8 +960,10 @@ static int msm_routing_probe(struct snd_soc_component *c)
 {
        int i;
 
-       for (i = 0; i < MAX_SESSIONS; i++)
+       for (i = 0; i < MAX_SESSIONS; i++) {
                routing_data->sessions[i].port_id = -1;
+               routing_data->sessions[i].fedai_id = -1;
+       }
 
        return 0;
 }
index 3a3064d..051f964 100644 (file)
@@ -462,6 +462,11 @@ static void rsnd_adg_get_clkout(struct rsnd_priv *priv,
                goto rsnd_adg_get_clkout_end;
 
        req_size = prop->length / sizeof(u32);
+       if (req_size > REQ_SIZE) {
+               dev_err(dev,
+                       "too many clock-frequency entries, using first %d\n", REQ_SIZE);
+               req_size = REQ_SIZE;
+       }
 
        of_property_read_u32_array(np, "clock-frequency", req_rate, req_size);
        req_48kHz_rate = 0;
index f8425d8..d23c2bb 100644 (file)
@@ -478,7 +478,7 @@ static int rsnd_status_update(u32 *status,
                        (func_call && (mod)->ops->fn) ? #fn : "");      \
                if (func_call && (mod)->ops->fn)                        \
                        tmp = (mod)->ops->fn(mod, io, param);           \
-               if (tmp)                                                \
+               if (tmp && (tmp != -EPROBE_DEFER))                      \
                        dev_err(dev, "%s[%d] : %s error %d\n",          \
                                rsnd_mod_name(mod), rsnd_mod_id(mod),   \
                                                     #fn, tmp);         \
@@ -958,12 +958,23 @@ static void rsnd_soc_dai_shutdown(struct snd_pcm_substream *substream,
        rsnd_dai_stream_quit(io);
 }
 
+static int rsnd_soc_dai_prepare(struct snd_pcm_substream *substream,
+                               struct snd_soc_dai *dai)
+{
+       struct rsnd_priv *priv = rsnd_dai_to_priv(dai);
+       struct rsnd_dai *rdai = rsnd_dai_to_rdai(dai);
+       struct rsnd_dai_stream *io = rsnd_rdai_to_io(rdai, substream);
+
+       return rsnd_dai_call(prepare, io, priv);
+}
+
 static const struct snd_soc_dai_ops rsnd_soc_dai_ops = {
        .startup        = rsnd_soc_dai_startup,
        .shutdown       = rsnd_soc_dai_shutdown,
        .trigger        = rsnd_soc_dai_trigger,
        .set_fmt        = rsnd_soc_dai_set_fmt,
        .set_tdm_slot   = rsnd_soc_set_dai_tdm_slot,
+       .prepare        = rsnd_soc_dai_prepare,
 };
 
 void rsnd_parse_connect_common(struct rsnd_dai *rdai,
@@ -1550,6 +1561,14 @@ exit_snd_probe:
                rsnd_dai_call(remove, &rdai->capture, priv);
        }
 
+       /*
+        * adg is a very special mod which can't use rsnd_dai_call(remove),
+        * and it registers the ADG clock on probe.
+        * That clock should be unregistered if probe failed.
+        * Mainly this covers the -EPROBE_DEFER case.
+        */
+       rsnd_adg_remove(priv);
+
        return ret;
 }
 
index fe63ef8..d65ea7b 100644 (file)
@@ -241,6 +241,10 @@ static int rsnd_dmaen_attach(struct rsnd_dai_stream *io,
        /* try to get DMAEngine channel */
        chan = rsnd_dmaen_request_channel(io, mod_from, mod_to);
        if (IS_ERR_OR_NULL(chan)) {
+               /* Propagate -EPROBE_DEFER so that probing can be retried */
+               if (PTR_ERR(chan) == -EPROBE_DEFER)
+                       return PTR_ERR(chan);
+
                /*
                 * DMA failed, try PIO mode;
                 * see
index 96d9333..8f7a0ab 100644 (file)
@@ -280,6 +280,9 @@ struct rsnd_mod_ops {
        int (*nolock_stop)(struct rsnd_mod *mod,
                    struct rsnd_dai_stream *io,
                    struct rsnd_priv *priv);
+       int (*prepare)(struct rsnd_mod *mod,
+                      struct rsnd_dai_stream *io,
+                      struct rsnd_priv *priv);
 };
 
 struct rsnd_dai_stream;
@@ -309,6 +312,7 @@ struct rsnd_mod {
  * H   0: fallback
  * H   0: hw_params
  * H   0: pointer
+ * H   0: prepare
  */
 #define __rsnd_mod_shift_nolock_start  0
 #define __rsnd_mod_shift_nolock_stop   0
@@ -323,6 +327,7 @@ struct rsnd_mod {
 #define __rsnd_mod_shift_fallback      28 /* always called */
 #define __rsnd_mod_shift_hw_params     28 /* always called */
 #define __rsnd_mod_shift_pointer       28 /* always called */
+#define __rsnd_mod_shift_prepare       28 /* always called */
 
 #define __rsnd_mod_add_probe           0
 #define __rsnd_mod_add_remove          0
@@ -337,6 +342,7 @@ struct rsnd_mod {
 #define __rsnd_mod_add_fallback                0
 #define __rsnd_mod_add_hw_params       0
 #define __rsnd_mod_add_pointer         0
+#define __rsnd_mod_add_prepare         0
 
 #define __rsnd_mod_call_probe          0
 #define __rsnd_mod_call_remove         0
@@ -351,6 +357,7 @@ struct rsnd_mod {
 #define __rsnd_mod_call_pointer                0
 #define __rsnd_mod_call_nolock_start   0
 #define __rsnd_mod_call_nolock_stop    1
+#define __rsnd_mod_call_prepare                0
 
 #define rsnd_mod_to_priv(mod)  ((mod)->priv)
 #define rsnd_mod_name(mod)     ((mod)->ops->name)
index 8304e4e..3f880ec 100644 (file)
@@ -283,7 +283,7 @@ static int rsnd_ssi_master_clk_start(struct rsnd_mod *mod,
        if (rsnd_ssi_is_multi_slave(mod, io))
                return 0;
 
-       if (ssi->usrcnt > 1) {
+       if (ssi->rate) {
                if (ssi->rate != rate) {
                        dev_err(dev, "SSI parent/child should use same rate\n");
                        return -EINVAL;
@@ -434,7 +434,6 @@ static int rsnd_ssi_init(struct rsnd_mod *mod,
                         struct rsnd_priv *priv)
 {
        struct rsnd_ssi *ssi = rsnd_mod_to_ssi(mod);
-       int ret;
 
        if (!rsnd_ssi_is_run_mods(mod, io))
                return 0;
@@ -443,10 +442,6 @@ static int rsnd_ssi_init(struct rsnd_mod *mod,
 
        rsnd_mod_power_on(mod);
 
-       ret = rsnd_ssi_master_clk_start(mod, io);
-       if (ret < 0)
-               return ret;
-
        rsnd_ssi_config_init(mod, io);
 
        rsnd_ssi_register_setup(mod);
@@ -852,6 +847,13 @@ static int rsnd_ssi_pio_pointer(struct rsnd_mod *mod,
        return 0;
 }
 
+static int rsnd_ssi_prepare(struct rsnd_mod *mod,
+                           struct rsnd_dai_stream *io,
+                           struct rsnd_priv *priv)
+{
+       return rsnd_ssi_master_clk_start(mod, io);
+}
+
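The rationale for the new hook (assumed from the matching upstream changelog): .trigger runs in atomic context while the clock API may sleep, so the master clock start moves out of rsnd_ssi_init() and into .prepare, which is non-atomic and runs after hw_params but before trigger:

/* Assumed flow for one stream after this change:
 *   hw_params -> prepare (non-atomic: rsnd_ssi_master_clk_start may sleep)
 *             -> trigger START (atomic: rsnd_ssi_init, no clk calls left)
 * prepare may run more than once; rsnd_ssi_master_clk_start() returns
 * early when ssi->rate already matches the requested rate.
 */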
 static struct rsnd_mod_ops rsnd_ssi_pio_ops = {
        .name   = SSI_NAME,
        .probe  = rsnd_ssi_common_probe,
@@ -864,6 +866,7 @@ static struct rsnd_mod_ops rsnd_ssi_pio_ops = {
        .pointer = rsnd_ssi_pio_pointer,
        .pcm_new = rsnd_ssi_pcm_new,
        .hw_params = rsnd_ssi_hw_params,
+       .prepare = rsnd_ssi_prepare,
 };
 
 static int rsnd_ssi_dma_probe(struct rsnd_mod *mod,
@@ -940,6 +943,7 @@ static struct rsnd_mod_ops rsnd_ssi_dma_ops = {
        .pcm_new = rsnd_ssi_pcm_new,
        .fallback = rsnd_ssi_fallback,
        .hw_params = rsnd_ssi_hw_params,
+       .prepare = rsnd_ssi_prepare,
 };
 
 int rsnd_ssi_is_dma_mode(struct rsnd_mod *mod)
index 9cfe10d..473eefe 100644 (file)
@@ -1447,7 +1447,7 @@ static int soc_link_dai_widgets(struct snd_soc_card *card,
        sink = codec_dai->playback_widget;
        source = cpu_dai->capture_widget;
        if (sink && source) {
-               ret = snd_soc_dapm_new_pcm(card, dai_link->params,
+               ret = snd_soc_dapm_new_pcm(card, rtd, dai_link->params,
                                           dai_link->num_params,
                                           source, sink);
                if (ret != 0) {
@@ -1460,7 +1460,7 @@ static int soc_link_dai_widgets(struct snd_soc_card *card,
        sink = cpu_dai->playback_widget;
        source = codec_dai->capture_widget;
        if (sink && source) {
-               ret = snd_soc_dapm_new_pcm(card, dai_link->params,
+               ret = snd_soc_dapm_new_pcm(card, rtd, dai_link->params,
                                           dai_link->num_params,
                                           source, sink);
                if (ret != 0) {
index 7e96793..461d951 100644 (file)
@@ -3652,6 +3652,7 @@ static int snd_soc_dai_link_event(struct snd_soc_dapm_widget *w,
 {
        struct snd_soc_dapm_path *source_p, *sink_p;
        struct snd_soc_dai *source, *sink;
+       struct snd_soc_pcm_runtime *rtd = w->priv;
        const struct snd_soc_pcm_stream *config = w->params + w->params_select;
        struct snd_pcm_substream substream;
        struct snd_pcm_hw_params *params = NULL;
@@ -3711,6 +3712,7 @@ static int snd_soc_dai_link_event(struct snd_soc_dapm_widget *w,
                goto out;
        }
        substream.runtime = runtime;
+       substream.private_data = rtd;
 
        switch (event) {
        case SND_SOC_DAPM_PRE_PMU:
@@ -3895,6 +3897,7 @@ outfree_w_param:
 }
 
 int snd_soc_dapm_new_pcm(struct snd_soc_card *card,
+                        struct snd_soc_pcm_runtime *rtd,
                         const struct snd_soc_pcm_stream *params,
                         unsigned int num_params,
                         struct snd_soc_dapm_widget *source,
@@ -3963,6 +3966,7 @@ int snd_soc_dapm_new_pcm(struct snd_soc_card *card,
 
        w->params = params;
        w->num_params = num_params;
+       w->priv = rtd;
 
        ret = snd_soc_dapm_add_path(&card->dapm, source, w, NULL, NULL);
        if (ret)
index 439b8a2..195ba48 100755 (executable)
@@ -1325,7 +1325,7 @@ class Tui(object):
         msg = ''
         while True:
             self.screen.erase()
-            self.screen.addstr(0, 0, 'Set update interval (defaults to %fs).' %
+            self.screen.addstr(0, 0, 'Set update interval (defaults to %.1fs).' %
                                DELAY_DEFAULT, curses.A_BOLD)
             self.screen.addstr(4, 0, msg)
             self.screen.addstr(2, 0, 'Change delay from %.1fs to ' %
index 853b95d..2011376 100644 (file)
@@ -15,7 +15,6 @@
        {0x400, "INST_STORAGE"}, \
        {0x480, "INST_SEGMENT"}, \
        {0x500, "EXTERNAL"}, \
-       {0x501, "EXTERNAL_LEVEL"}, \
        {0x502, "EXTERNAL_HV"}, \
        {0x600, "ALIGNMENT"}, \
        {0x700, "PROGRAM"}, \
index 72c25a3..d9a7254 100644 (file)
@@ -6,7 +6,7 @@ TEST_PROGS := run.sh
 
 include ../lib.mk
 
-all:
+all: khdr
        @for DIR in $(SUBDIRS); do              \
                BUILD_TARGET=$(OUTPUT)/$$DIR;   \
                mkdir $$BUILD_TARGET  -p;       \
index e036952..88cfe88 100644 (file)
@@ -10,6 +10,8 @@ $(TEST_GEN_FILES): ipcsocket.c ionutils.c
 
 TEST_PROGS := ion_test.sh
 
+KSFT_KHDR_INSTALL := 1
+top_srcdir = ../../../../..
 include ../../lib.mk
 
 $(OUTPUT)/ionapp_export: ionapp_export.c ipcsocket.c ionutils.c
index 1c5d2b2..14c9fe2 100644 (file)
@@ -89,17 +89,28 @@ int cg_read(const char *cgroup, const char *control, char *buf, size_t len)
 int cg_read_strcmp(const char *cgroup, const char *control,
                   const char *expected)
 {
-       size_t size = strlen(expected) + 1;
+       size_t size;
        char *buf;
+       int ret;
+
+       /* Handle the case of comparing against an empty string */
+       if (!expected)
+               size = 32;
+       else
+               size = strlen(expected) + 1;
 
        buf = malloc(size);
        if (!buf)
                return -1;
 
-       if (cg_read(cgroup, control, buf, size))
+       if (cg_read(cgroup, control, buf, size)) {
+               free(buf);
                return -1;
+       }
 
-       return strcmp(expected, buf);
+       ret = strcmp(expected, buf);
+       free(buf);
+       return ret;
 }
 
 int cg_read_strstr(const char *cgroup, const char *control, const char *needle)
@@ -337,3 +348,24 @@ int is_swap_enabled(void)
 
        return cnt > 1;
 }
+
+int set_oom_adj_score(int pid, int score)
+{
+       char path[PATH_MAX];
+       int fd, len;
+
+       sprintf(path, "/proc/%d/oom_score_adj", pid);
+
+       fd = open(path, O_WRONLY | O_APPEND);
+       if (fd < 0)
+               return fd;
+
+       len = dprintf(fd, "%d", score);
+       if (len < 0) {
+               close(fd);
+               return len;
+       }
+
+       close(fd);
+       return 0;
+}
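One detail the group-OOM tests below lean on: OOM_SCORE_ADJ_MIN from <linux/oom.h> is -1000, and a task at that score is exempt from the OOM killer even when memory.oom.group would otherwise take down the whole cgroup:

/* From include/uapi/linux/oom.h:
 *   #define OOM_SCORE_ADJ_MIN (-1000)
 * set_oom_adj_score(pid, OOM_SCORE_ADJ_MIN) therefore shields one child,
 * which test_memcg_oom_group_score_events() later kills explicitly with
 * SIGKILL to clean up.
 */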
index 1ff6f9f..9ac8b79 100644 (file)
@@ -40,3 +40,4 @@ extern int get_temp_fd(void);
 extern int alloc_pagecache(int fd, size_t size);
 extern int alloc_anon(const char *cgroup, void *arg);
 extern int is_swap_enabled(void);
+extern int set_oom_adj_score(int pid, int score);
index cf0bddc..28d321b 100644 (file)
@@ -2,6 +2,7 @@
 #define _GNU_SOURCE
 
 #include <linux/limits.h>
+#include <linux/oom.h>
 #include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -202,6 +203,36 @@ static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
        return 0;
 }
 
+static int alloc_anon_noexit(const char *cgroup, void *arg)
+{
+       int ppid = getppid();
+
+       if (alloc_anon(cgroup, arg))
+               return -1;
+
+       while (getppid() == ppid)
+               sleep(1);
+
+       return 0;
+}
+
+/*
+ * Wait until processes are killed asynchronously by the OOM killer.
+ * If the 1s timeout (10 polls at 100ms) is exceeded, fail.
+ */
+static int cg_test_proc_killed(const char *cgroup)
+{
+       int limit;
+
+       for (limit = 10; limit > 0; limit--) {
+               if (cg_read_strcmp(cgroup, "cgroup.procs", "") == 0)
+                       return 0;
+
+               usleep(100000);
+       }
+       return -1;
+}
+
 /*
  * First, this test creates the following hierarchy:
  * A       memory.min = 50M,  memory.max = 200M
@@ -964,6 +995,177 @@ cleanup:
        return ret;
 }
 
+/*
+ * This test disables swapping and tries to allocate anonymous memory
+ * up to OOM with memory.oom.group set. Then it checks that all
+ * processes in the leaf (but not the parent) were killed.
+ */
+static int test_memcg_oom_group_leaf_events(const char *root)
+{
+       int ret = KSFT_FAIL;
+       char *parent, *child;
+
+       parent = cg_name(root, "memcg_test_0");
+       child = cg_name(root, "memcg_test_0/memcg_test_1");
+
+       if (!parent || !child)
+               goto cleanup;
+
+       if (cg_create(parent))
+               goto cleanup;
+
+       if (cg_create(child))
+               goto cleanup;
+
+       if (cg_write(parent, "cgroup.subtree_control", "+memory"))
+               goto cleanup;
+
+       if (cg_write(child, "memory.max", "50M"))
+               goto cleanup;
+
+       if (cg_write(child, "memory.swap.max", "0"))
+               goto cleanup;
+
+       if (cg_write(child, "memory.oom.group", "1"))
+               goto cleanup;
+
+       cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
+       cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
+       cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
+       if (!cg_run(child, alloc_anon, (void *)MB(100)))
+               goto cleanup;
+
+       if (cg_test_proc_killed(child))
+               goto cleanup;
+
+       if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
+               goto cleanup;
+
+       if (cg_read_key_long(parent, "memory.events", "oom_kill ") != 0)
+               goto cleanup;
+
+       ret = KSFT_PASS;
+
+cleanup:
+       if (child)
+               cg_destroy(child);
+       if (parent)
+               cg_destroy(parent);
+       free(child);
+       free(parent);
+
+       return ret;
+}
+
+/*
+ * This test disables swapping and tries to allocate anonymous memory
+ * up to OOM with memory.oom.group set. Then it checks that all
+ * processes in the parent and leaf were killed.
+ */
+static int test_memcg_oom_group_parent_events(const char *root)
+{
+       int ret = KSFT_FAIL;
+       char *parent, *child;
+
+       parent = cg_name(root, "memcg_test_0");
+       child = cg_name(root, "memcg_test_0/memcg_test_1");
+
+       if (!parent || !child)
+               goto cleanup;
+
+       if (cg_create(parent))
+               goto cleanup;
+
+       if (cg_create(child))
+               goto cleanup;
+
+       if (cg_write(parent, "memory.max", "80M"))
+               goto cleanup;
+
+       if (cg_write(parent, "memory.swap.max", "0"))
+               goto cleanup;
+
+       if (cg_write(parent, "memory.oom.group", "1"))
+               goto cleanup;
+
+       cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
+       cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
+       cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
+
+       if (!cg_run(child, alloc_anon, (void *)MB(100)))
+               goto cleanup;
+
+       if (cg_test_proc_killed(child))
+               goto cleanup;
+       if (cg_test_proc_killed(parent))
+               goto cleanup;
+
+       ret = KSFT_PASS;
+
+cleanup:
+       if (child)
+               cg_destroy(child);
+       if (parent)
+               cg_destroy(parent);
+       free(child);
+       free(parent);
+
+       return ret;
+}
+
+/*
+ * This test disables swapping and tries to allocate anonymous memory
+ * up to OOM with memory.oom.group set. Then it checks that all
+ * processes were killed except those set to OOM_SCORE_ADJ_MIN.
+ */
+static int test_memcg_oom_group_score_events(const char *root)
+{
+       int ret = KSFT_FAIL;
+       char *memcg;
+       int safe_pid;
+
+       memcg = cg_name(root, "memcg_test_0");
+
+       if (!memcg)
+               goto cleanup;
+
+       if (cg_create(memcg))
+               goto cleanup;
+
+       if (cg_write(memcg, "memory.max", "50M"))
+               goto cleanup;
+
+       if (cg_write(memcg, "memory.swap.max", "0"))
+               goto cleanup;
+
+       if (cg_write(memcg, "memory.oom.group", "1"))
+               goto cleanup;
+
+       safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
+       if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
+               goto cleanup;
+
+       cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
+       if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
+               goto cleanup;
+
+       if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
+               goto cleanup;
+
+       if (kill(safe_pid, SIGKILL))
+               goto cleanup;
+
+       ret = KSFT_PASS;
+
+cleanup:
+       if (memcg)
+               cg_destroy(memcg);
+       free(memcg);
+
+       return ret;
+}
+
 #define T(x) { x, #x }
 struct memcg_test {
        int (*fn)(const char *root);
@@ -978,6 +1180,9 @@ struct memcg_test {
        T(test_memcg_oom_events),
        T(test_memcg_swap_max),
        T(test_memcg_sock),
+       T(test_memcg_oom_group_leaf_events),
+       T(test_memcg_oom_group_parent_events),
+       T(test_memcg_oom_group_score_events),
 };
 #undef T
 
diff --git a/tools/testing/selftests/efivarfs/config b/tools/testing/selftests/efivarfs/config
new file mode 100644 (file)
index 0000000..4e151f1
--- /dev/null
@@ -0,0 +1 @@
+CONFIG_EFIVAR_FS=y
index ff8feca..ad1eeb1 100644 (file)
@@ -18,6 +18,7 @@ TEST_GEN_FILES := \
 
 TEST_PROGS := run.sh
 
+top_srcdir = ../../../../..
 include ../../lib.mk
 
 $(TEST_GEN_FILES): $(HEADERS)
index 1bbb475..4665cdb 100644 (file)
@@ -21,11 +21,8 @@ endef
 CFLAGS += -O2 -g -std=gnu99 -Wall -I../../../../usr/include/
 LDLIBS += -lmount -I/usr/include/libmount
 
-$(BINARIES): ../../../gpio/gpio-utils.o ../../../../usr/include/linux/gpio.h
+$(BINARIES):| khdr
+$(BINARIES): ../../../gpio/gpio-utils.o
 
 ../../../gpio/gpio-utils.o:
        make ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE) -C ../../../gpio
-
-../../../../usr/include/linux/gpio.h:
-       make -C ../../../.. headers_install INSTALL_HDR_PATH=$(shell pwd)/../../../../usr/
-
index 15e6b75..a3edb2c 100644 (file)
@@ -19,7 +19,6 @@
 #define KSFT_FAIL  1
 #define KSFT_XFAIL 2
 #define KSFT_XPASS 3
-/* Treat skip as pass */
 #define KSFT_SKIP  4
 
 /* counters */
index 4202139..5c34752 100644 (file)
@@ -1,4 +1,5 @@
 cr4_cpuid_sync_test
+platform_info_test
 set_sregs_test
 sync_regs_test
 vmx_tsc_adjust_test
index 03b0f55..ec32dad 100644 (file)
@@ -6,7 +6,8 @@ UNAME_M := $(shell uname -m)
 LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c
 LIBKVM_x86_64 = lib/x86.c lib/vmx.c
 
-TEST_GEN_PROGS_x86_64 = set_sregs_test
+TEST_GEN_PROGS_x86_64 = platform_info_test
+TEST_GEN_PROGS_x86_64 += set_sregs_test
 TEST_GEN_PROGS_x86_64 += sync_regs_test
 TEST_GEN_PROGS_x86_64 += vmx_tsc_adjust_test
 TEST_GEN_PROGS_x86_64 += cr4_cpuid_sync_test
@@ -20,7 +21,7 @@ INSTALL_HDR_PATH = $(top_srcdir)/usr
 LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/
 LINUX_TOOL_INCLUDE = $(top_srcdir)/tools/include
 CFLAGS += -O2 -g -std=gnu99 -I$(LINUX_TOOL_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude -I$(<D) -I..
-LDFLAGS += -lpthread
+LDFLAGS += -pthread
 
 # After inclusion, $(OUTPUT) is defined and
 # $(TEST_GEN_PROGS) starts with $(OUTPUT)/
@@ -37,9 +38,6 @@ $(LIBKVM_OBJ): $(OUTPUT)/%.o: %.c
 $(OUTPUT)/libkvm.a: $(LIBKVM_OBJ)
        $(AR) crs $@ $^
 
-$(LINUX_HDR_PATH):
-       make -C $(top_srcdir) headers_install
-
-all: $(STATIC_LIBS) $(LINUX_HDR_PATH)
+all: $(STATIC_LIBS)
 $(TEST_GEN_PROGS): $(STATIC_LIBS)
-$(TEST_GEN_PROGS) $(LIBKVM_OBJ): | $(LINUX_HDR_PATH)
+$(STATIC_LIBS):| khdr
index bb5a25f..3acf9a9 100644 (file)
@@ -50,6 +50,7 @@ enum vm_mem_backing_src_type {
 };
 
 int kvm_check_cap(long cap);
+int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap);
 
 struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
 void kvm_vm_free(struct kvm_vm *vmp);
@@ -108,6 +109,9 @@ void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid,
                          struct kvm_vcpu_events *events);
 void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid,
                          struct kvm_vcpu_events *events);
+uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index);
+void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
+       uint64_t msr_value);
 
 const char *exit_reason_str(unsigned int exit_reason);
 
index e9ba389..6fd8c08 100644 (file)
@@ -63,6 +63,29 @@ int kvm_check_cap(long cap)
        return ret;
 }
 
+/* VM Enable Capability
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   cap - Capability
+ *
+ * Output Args: None
+ *
+ * Return: On success, 0. On failure a TEST_ASSERT failure is produced.
+ *
+ * Enables a capability (KVM_CAP_*) on the VM.
+ */
+int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap)
+{
+       int ret;
+
+       ret = ioctl(vm->fd, KVM_ENABLE_CAP, cap);
+       TEST_ASSERT(ret == 0, "KVM_ENABLE_CAP IOCTL failed,\n"
+               "  rc: %i errno: %i", ret, errno);
+
+       return ret;
+}
+
 static void vm_open(struct kvm_vm *vm, int perm)
 {
        vm->kvm_fd = open(KVM_DEV_PATH, perm);
@@ -1220,6 +1243,72 @@ void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid,
                ret, errno);
 }
 
+/* VCPU Get MSR
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *   msr_index - Index of MSR
+ *
+ * Output Args: None
+ *
+ * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced.
+ *
+ * Get value of MSR for VCPU.
+ */
+uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index)
+{
+       struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+       struct {
+               struct kvm_msrs header;
+               struct kvm_msr_entry entry;
+       } buffer = {};
+       int r;
+
+       TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+       buffer.header.nmsrs = 1;
+       buffer.entry.index = msr_index;
+       r = ioctl(vcpu->fd, KVM_GET_MSRS, &buffer.header);
+       TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n"
+               "  rc: %i errno: %i", r, errno);
+
+       return buffer.entry.data;
+}
+
+/* VCPU Set MSR
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *   msr_index - Index of MSR
+ *   msr_value - New value of MSR
+ *
+ * Output Args: None
+ *
+ * Return: On success, nothing. On failure a TEST_ASSERT is produced.
+ *
+ * Set value of MSR for VCPU.
+ */
+void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
+       uint64_t msr_value)
+{
+       struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+       struct {
+               struct kvm_msrs header;
+               struct kvm_msr_entry entry;
+       } buffer = {};
+       int r;
+
+       TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+       buffer.header.nmsrs = 1;
+       buffer.entry.index = msr_index;
+       buffer.entry.data = msr_value;
+       r = ioctl(vcpu->fd, KVM_SET_MSRS, &buffer.header);
+       TEST_ASSERT(r == 1, "KVM_SET_MSRS IOCTL failed,\n"
+               "  rc: %i errno: %i", r, errno);
+}
+
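The anonymous { header; entry; } wrapper used by both MSR helpers works because struct kvm_msrs ends in a flexible array member, so the compound local supplies contiguous storage for exactly one entry right after the header:

/* Reproduced from include/uapi/linux/kvm.h:
 * struct kvm_msrs {
 *         __u32 nmsrs;                    // number of msrs in entries
 *         __u32 pad;
 *         struct kvm_msr_entry entries[0];
 * };
 * KVM_GET_MSRS/KVM_SET_MSRS return the number of entries processed,
 * hence the TEST_ASSERT(r == 1, ...) checks above.
 */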
 /* VM VCPU Args Set
  *
  * Input Args:
diff --git a/tools/testing/selftests/kvm/platform_info_test.c b/tools/testing/selftests/kvm/platform_info_test.c
new file mode 100644 (file)
index 0000000..3764e71
--- /dev/null
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test for x86 KVM_CAP_MSR_PLATFORM_INFO
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Verifies expected behavior of controlling guest access to
+ * MSR_PLATFORM_INFO.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "x86.h"
+
+#define VCPU_ID 0
+#define MSR_PLATFORM_INFO_MAX_TURBO_RATIO 0xff00
+
+static void guest_code(void)
+{
+       uint64_t msr_platform_info;
+
+       for (;;) {
+               msr_platform_info = rdmsr(MSR_PLATFORM_INFO);
+               GUEST_SYNC(msr_platform_info);
+               asm volatile ("inc %r11");
+       }
+}
+
+static void set_msr_platform_info_enabled(struct kvm_vm *vm, bool enable)
+{
+       struct kvm_enable_cap cap = {};
+
+       cap.cap = KVM_CAP_MSR_PLATFORM_INFO;
+       cap.flags = 0;
+       cap.args[0] = (int)enable;
+       vm_enable_cap(vm, &cap);
+}
+
+static void test_msr_platform_info_enabled(struct kvm_vm *vm)
+{
+       struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+       struct guest_args args;
+
+       set_msr_platform_info_enabled(vm, true);
+       vcpu_run(vm, VCPU_ID);
+       TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+                       "Exit_reason other than KVM_EXIT_IO: %u (%s),\n",
+                       run->exit_reason,
+                       exit_reason_str(run->exit_reason));
+       guest_args_read(vm, VCPU_ID, &args);
+       TEST_ASSERT(args.port == GUEST_PORT_SYNC,
+                       "Received IO from port other than PORT_HOST_SYNC: %u\n",
+                       run->io.port);
+       TEST_ASSERT((args.arg1 & MSR_PLATFORM_INFO_MAX_TURBO_RATIO) ==
+               MSR_PLATFORM_INFO_MAX_TURBO_RATIO,
+               "Expected MSR_PLATFORM_INFO to have max turbo ratio mask: %i.",
+               MSR_PLATFORM_INFO_MAX_TURBO_RATIO);
+}
+
+static void test_msr_platform_info_disabled(struct kvm_vm *vm)
+{
+       struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+
+       set_msr_platform_info_enabled(vm, false);
+       vcpu_run(vm, VCPU_ID);
+       TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN,
+                       "Exit_reason other than KVM_EXIT_SHUTDOWN: %u (%s)\n",
+                       run->exit_reason,
+                       exit_reason_str(run->exit_reason));
+}
+
+int main(int argc, char *argv[])
+{
+       struct kvm_vm *vm;
+       int rv;
+       uint64_t msr_platform_info;
+
+       /* Tell stdout not to buffer its content */
+       setbuf(stdout, NULL);
+
+       rv = kvm_check_cap(KVM_CAP_MSR_PLATFORM_INFO);
+       if (!rv) {
+               fprintf(stderr,
+                       "KVM_CAP_MSR_PLATFORM_INFO not supported, skip test\n");
+               exit(KSFT_SKIP);
+       }
+
+       vm = vm_create_default(VCPU_ID, 0, guest_code);
+
+       msr_platform_info = vcpu_get_msr(vm, VCPU_ID, MSR_PLATFORM_INFO);
+       vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO,
+               msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO);
+       test_msr_platform_info_disabled(vm);
+       test_msr_platform_info_enabled(vm);
+       vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO, msr_platform_info);
+
+       kvm_vm_free(vm);
+
+       return 0;
+}
index 17ab366..0a8e758 100644 (file)
@@ -16,8 +16,23 @@ TEST_GEN_PROGS := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS))
 TEST_GEN_PROGS_EXTENDED := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS_EXTENDED))
 TEST_GEN_FILES := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_FILES))
 
+top_srcdir ?= ../../../..
+include $(top_srcdir)/scripts/subarch.include
+ARCH           ?= $(SUBARCH)
+
 all: $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES)
 
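+# khdr runs headers_install in the top-level kernel tree to export uapi headers.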
+.PHONY: khdr
+khdr:
+       $(MAKE) ARCH=$(ARCH) -C $(top_srcdir) headers_install
+
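+# Tests that set KSFT_KHDR_INSTALL get an order-only dependency on khdr,
+# so the headers are installed before any test binary is built.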
+ifdef KSFT_KHDR_INSTALL
+$(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES):| khdr
+endif
+
 .ONESHELL:
 define RUN_TEST_PRINT_RESULT
        TEST_HDR_MSG="selftests: "`basename $$PWD`:" $$BASENAME_TEST";  \
index 2fde301..a7e8cd5 100644 (file)
@@ -2,3 +2,4 @@ CONFIG_MEMORY_HOTPLUG=y
 CONFIG_MEMORY_HOTPLUG_SPARSE=y
 CONFIG_NOTIFIER_ERROR_INJECTION=y
 CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m
+CONFIG_MEMORY_HOTREMOVE=y
index 9cca68e..919aa2a 100644 (file)
@@ -15,6 +15,8 @@ TEST_GEN_FILES += udpgso udpgso_bench_tx udpgso_bench_rx
 TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
 TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls
 
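+# Ask lib.mk to install kernel uapi headers before building these tests.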
+KSFT_KHDR_INSTALL := 1
 include ../lib.mk
 
 $(OUTPUT)/reuseport_bpf_numa: LDFLAGS += -lnuma
index b3ebf26..8fdfeaf 100644 (file)
@@ -502,6 +502,56 @@ TEST_F(tls, recv_peek_multiple)
        EXPECT_EQ(memcmp(test_str, buf, send_len), 0);
 }
 
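+/* Verify MSG_PEEK behavior across TLS record boundaries. */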
+TEST_F(tls, recv_peek_multiple_records)
+{
+       char const *test_str = "test_read_peek_mult_recs";
+       char const *test_str_first = "test_read_peek";
+       char const *test_str_second = "_mult_recs";
+       int len;
+       char buf[64];
+
+       len = strlen(test_str_first);
+       EXPECT_EQ(send(self->fd, test_str_first, len, 0), len);
+
+       len = strlen(test_str_second) + 1;
+       EXPECT_EQ(send(self->fd, test_str_second, len, 0), len);
+
+       len = sizeof(buf);
+       memset(buf, 0, len);
+       EXPECT_NE(recv(self->cfd, buf, len, MSG_PEEK), -1);
+
+       /* MSG_PEEK can only peek into the current record. */
+       len = strlen(test_str_first) + 1;
+       EXPECT_EQ(memcmp(test_str_first, buf, len), 0);
+
+       len = sizeof(buf);
+       memset(buf, 0, len);
+       EXPECT_NE(recv(self->cfd, buf, len, 0), -1);
+
+       /* A recv() without MSG_PEEK, however, advances the strparser
+        * (and therefore the record).
+        */
+       len = strlen(test_str) + 1;
+       EXPECT_EQ(memcmp(test_str, buf, len), 0);
+
+       /* MSG_MORE will hold current record open, so later MSG_PEEK
+        * will see everything.
+        */
+       len = strlen(test_str_first);
+       EXPECT_EQ(send(self->fd, test_str_first, len, MSG_MORE), len);
+
+       len = strlen(test_str_second) + 1;
+       EXPECT_EQ(send(self->fd, test_str_second, len, 0), len);
+
+       len = sizeof(buf);
+       memset(buf, 0, len);
+       EXPECT_NE(recv(self->cfd, buf, len, MSG_PEEK), -1);
+
+       len = strlen(test_str) + 1;
+       EXPECT_EQ(memcmp(test_str, buf, len), 0);
+}
+
 TEST_F(tls, pollin)
 {
        char const *test_str = "test_poll";
index a728040..14cfcf0 100644 (file)
@@ -5,6 +5,7 @@ TEST_PROGS := hwtstamp_config rxtimestamp timestamping txtimestamp
 
 all: $(TEST_PROGS)
 
+top_srcdir = ../../../../..
 include ../../lib.mk
 
 clean:
index 9881876..e94b7b1 100644 (file)
@@ -26,10 +26,6 @@ TEST_PROGS := run_vmtests
 
 include ../lib.mk
 
-$(OUTPUT)/userfaultfd: ../../../../usr/include/linux/kernel.h
 $(OUTPUT)/userfaultfd: LDLIBS += -lpthread
 
 $(OUTPUT)/mlock-random-test: LDLIBS += -lcap
-
-../../../../usr/include/linux/kernel.h:
-       make -C ../../../.. headers_install